From 90ec5e89e393c76e19afc845d8f88a5dc8315919 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 22 Feb 2017 19:23:37 +0530
Subject: kretprobes: Ensure probe location is at function entry

kretprobes can be registered by specifying an absolute address or by
specifying offset to a symbol. However, we need to ensure this falls at
function entry so as to be able to determine the return address.

Validate the same during kretprobe registration. By default, there
should not be any offset from a function entry, as determined through a
kallsyms_lookup(). Introduce arch_function_offset_within_entry() as a
way for architectures to override this.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/f1583bc4839a3862cfc2acefcc56f9c8837fa2ba.1487770934.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/kprobes.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index c328e4f7dcad..177bdf6c6aeb 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -267,6 +267,7 @@ extern int arch_init_kprobes(void);
 extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
+extern bool arch_function_offset_within_entry(unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
 
-- 
cgit v1.3-6-gb490


From 5e6039d8a307d8411422c154f3d446b44fa32b6d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 27 Dec 2016 18:00:15 -0500
Subject: uaccess: move VERIFY_{READ,WRITE} definitions to linux/uaccess.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/include/asm/uaccess.h      | 3 ---
 arch/arm/include/asm/uaccess.h        | 3 ---
 arch/arm64/include/asm/uaccess.h      | 3 ---
 arch/avr32/include/asm/uaccess.h      | 3 ---
 arch/blackfin/include/asm/uaccess.h   | 3 ---
 arch/cris/include/asm/uaccess.h       | 3 ---
 arch/frv/include/asm/uaccess.h        | 3 ---
 arch/hexagon/include/asm/uaccess.h    | 2 --
 arch/ia64/include/asm/uaccess.h       | 3 ---
 arch/m32r/include/asm/uaccess.h       | 3 ---
 arch/m68k/include/asm/uaccess_mm.h    | 3 ---
 arch/m68k/include/asm/uaccess_no.h    | 3 ---
 arch/metag/include/asm/uaccess.h      | 3 ---
 arch/microblaze/include/asm/uaccess.h | 3 ---
 arch/mips/include/asm/uaccess.h       | 3 ---
 arch/mn10300/include/asm/uaccess.h    | 3 ---
 arch/nios2/include/asm/uaccess.h      | 3 ---
 arch/openrisc/include/asm/uaccess.h   | 3 ---
 arch/parisc/include/asm/uaccess.h     | 3 ---
 arch/powerpc/include/asm/uaccess.h    | 3 ---
 arch/s390/include/asm/uaccess.h       | 3 ---
 arch/score/include/asm/uaccess.h      | 3 ---
 arch/sh/include/asm/uaccess.h         | 3 ---
 arch/sparc/include/asm/uaccess_32.h   | 3 ---
 arch/sparc/include/asm/uaccess_64.h   | 3 ---
 arch/tile/include/asm/uaccess.h       | 3 ---
 arch/x86/include/asm/uaccess.h        | 3 ---
 arch/xtensa/include/asm/asm-uaccess.h | 3 ---
 arch/xtensa/include/asm/uaccess.h     | 3 ---
 include/asm-generic/uaccess.h         | 3 ---
 include/linux/uaccess.h               | 4 ++++
 31 files changed, 4 insertions(+), 89 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h
index 94f587535dee..a37267a5d399 100644
--- a/arch/alpha/include/asm/uaccess.h
+++ b/arch/alpha/include/asm/uaccess.h
@@ -20,9 +20,6 @@
 #define KERNEL_DS	((mm_segment_t) { 0UL })
 #define USER_DS		((mm_segment_t) { -0x40000000000UL })
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 #define get_fs()  (current_thread_info()->addr_limit)
 #define get_ds()  (KERNEL_DS)
 #define set_fs(x) (current_thread_info()->addr_limit = (x))
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index b7e0125c0bbf..a13f39b3e9f8 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -26,9 +26,6 @@
 #define __put_user_unaligned __put_user
 #endif
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
 /*
  * The exception table consists of pairs of addresses: the first is the
  * address of an instruction that is allowed to fault, and the second is
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 5308d696311b..f5e1e090b4d2 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -36,9 +36,6 @@
 #include <asm/memory.h>
 #include <asm/compiler.h>
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
 /*
  * The exception table consists of pairs of relative offsets: the first
  * is the relative offset to an instruction that is allowed to fault,
diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h
index b1ec1fa06463..1c7f234385fc 100644
--- a/arch/avr32/include/asm/uaccess.h
+++ b/arch/avr32/include/asm/uaccess.h
@@ -11,9 +11,6 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 typedef struct {
 	unsigned int is_user_space;
 } mm_segment_t;
diff --git a/arch/blackfin/include/asm/uaccess.h b/arch/blackfin/include/asm/uaccess.h
index 0eff88aa6d6a..d9a91108964f 100644
--- a/arch/blackfin/include/asm/uaccess.h
+++ b/arch/blackfin/include/asm/uaccess.h
@@ -29,9 +29,6 @@ static inline void set_fs(mm_segment_t fs)
 
 #define segment_eq(a, b) ((a) == (b))
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 #define access_ok(type, addr, size) _access_ok((unsigned long)(addr), (size))
 
 /*
diff --git a/arch/cris/include/asm/uaccess.h b/arch/cris/include/asm/uaccess.h
index 56c7d5750abd..82bfcb5e2c9b 100644
--- a/arch/cris/include/asm/uaccess.h
+++ b/arch/cris/include/asm/uaccess.h
@@ -21,9 +21,6 @@
 #include <asm/processor.h>
 #include <asm/page.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/frv/include/asm/uaccess.h b/arch/frv/include/asm/uaccess.h
index c0f4057eab60..9e01bd798a03 100644
--- a/arch/frv/include/asm/uaccess.h
+++ b/arch/frv/include/asm/uaccess.h
@@ -22,9 +22,6 @@
 
 #define __ptr(x) ((unsigned long __force *)(x))
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /*
  * check that a range of addresses falls within the current address limit
  */
diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h
index f61cfb28e9f2..21f63593e2b6 100644
--- a/arch/hexagon/include/asm/uaccess.h
+++ b/arch/hexagon/include/asm/uaccess.h
@@ -50,8 +50,6 @@
  * reasonably simple and not *too* slow.  After all, we've got the
  * MMU for backup.
  */
-#define VERIFY_READ     0
-#define VERIFY_WRITE    1
 
 #define __access_ok(addr, size) \
 	((get_fs().seg == KERNEL_DS.seg) || \
diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h
index 471044be2a3b..c60ff6cc8dbd 100644
--- a/arch/ia64/include/asm/uaccess.h
+++ b/arch/ia64/include/asm/uaccess.h
@@ -48,9 +48,6 @@
 #define KERNEL_DS	((mm_segment_t) { ~0UL })		/* cf. access_ok() */
 #define USER_DS		((mm_segment_t) { TASK_SIZE-1 })	/* cf. access_ok() */
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 #define get_ds()  (KERNEL_DS)
 #define get_fs()  (current_thread_info()->addr_limit)
 #define set_fs(x) (current_thread_info()->addr_limit = (x))
diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h
index 6f8982157a75..7d993a837e39 100644
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -16,9 +16,6 @@
 #include <asm/page.h>
 #include <asm/setup.h>
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h
index d228601b3afc..fa84e9c6e8f4 100644
--- a/arch/m68k/include/asm/uaccess_mm.h
+++ b/arch/m68k/include/asm/uaccess_mm.h
@@ -10,9 +10,6 @@
 #include <linux/sched.h>
 #include <asm/segment.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /* We let the MMU do all checking */
 static inline int access_ok(int type, const void __user *addr,
 			    unsigned long size)
diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h
index 36deeb36503b..fab489a25b95 100644
--- a/arch/m68k/include/asm/uaccess_no.h
+++ b/arch/m68k/include/asm/uaccess_no.h
@@ -10,9 +10,6 @@
 
 #include <asm/segment.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 #define access_ok(type,addr,size)	_access_ok((unsigned long)(addr),(size))
 
 /*
diff --git a/arch/metag/include/asm/uaccess.h b/arch/metag/include/asm/uaccess.h
index 273e61225c27..46c1f6c54103 100644
--- a/arch/metag/include/asm/uaccess.h
+++ b/arch/metag/include/asm/uaccess.h
@@ -6,9 +6,6 @@
  */
 #include <linux/sched.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 253a67e275ad..70cf5f3dfae3 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -24,9 +24,6 @@
 #include <asm/pgtable.h>
 #include <linux/string.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /*
  * On Microblaze the fs value is actually the top of the corresponding
  * address space.
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index 5347cfe15af2..a058c04b8dd4 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -71,9 +71,6 @@ extern u64 __ua_limit;
 #define USER_DS		((mm_segment_t) { __UA_LIMIT })
 #endif
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define get_ds()	(KERNEL_DS)
 #define get_fs()	(current_thread_info()->addr_limit)
 #define set_fs(x)	(current_thread_info()->addr_limit = (x))
diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h
index 2eedf6f46a57..3e16850c4ccd 100644
--- a/arch/mn10300/include/asm/uaccess.h
+++ b/arch/mn10300/include/asm/uaccess.h
@@ -19,9 +19,6 @@
 #include <asm/page.h>
 #include <asm/errno.h>
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h
index 0ab82324c817..07fc68c3e23c 100644
--- a/arch/nios2/include/asm/uaccess.h
+++ b/arch/nios2/include/asm/uaccess.h
@@ -19,9 +19,6 @@
 
 #include <asm/page.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /*
  * The exception table consists of pairs of addresses: the first is the
  * address of an instruction that is allowed to fault, and the second is
diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h
index 140faa16685a..6f88cf8bd112 100644
--- a/arch/openrisc/include/asm/uaccess.h
+++ b/arch/openrisc/include/asm/uaccess.h
@@ -28,9 +28,6 @@
 #include <linux/string.h>
 #include <asm/page.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h
index fb4382c28259..598b52e5aa03 100644
--- a/arch/parisc/include/asm/uaccess.h
+++ b/arch/parisc/include/asm/uaccess.h
@@ -13,9 +13,6 @@
 #include <linux/string.h>
 #include <linux/thread_info.h>
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
 #define KERNEL_DS	((mm_segment_t){0})
 #define USER_DS 	((mm_segment_t){1})
 
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 0e6add3187bc..81307633c33f 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -11,9 +11,6 @@
 #include <asm/processor.h>
 #include <asm/page.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 136932ff4250..0e1f515d239b 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -17,9 +17,6 @@
 #include <asm/processor.h>
 #include <asm/ctl_reg.h>
 
-#define VERIFY_READ     0
-#define VERIFY_WRITE    1
-
 
 /*
  * The fs value determines whether argument validity checking should be
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h
index db58ab98ec4b..51914244e867 100644
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -6,9 +6,6 @@
 #include <linux/thread_info.h>
 #include <asm/extable.h>
 
-#define VERIFY_READ		0
-#define VERIFY_WRITE		1
-
 #define get_ds()		(KERNEL_DS)
 #define get_fs()		(current_thread_info()->addr_limit)
 #define segment_eq(a, b)	((a).seg == (b).seg)
diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h
index c4f0fee812c3..6b66d67c21d2 100644
--- a/arch/sh/include/asm/uaccess.h
+++ b/arch/sh/include/asm/uaccess.h
@@ -5,9 +5,6 @@
 #include <linux/sched.h>
 #include <asm/segment.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #define __addr_ok(addr) \
 	((unsigned long __force)(addr) < current_thread_info()->addr_limit.seg)
 
diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h
index ea55f86d7ccd..d8857f5fafad 100644
--- a/arch/sparc/include/asm/uaccess_32.h
+++ b/arch/sparc/include/asm/uaccess_32.h
@@ -30,9 +30,6 @@
 #define KERNEL_DS   ((mm_segment_t) { 0 })
 #define USER_DS     ((mm_segment_t) { -1 })
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 #define get_ds()	(KERNEL_DS)
 #define get_fs()	(current->thread.current_ds)
 #define set_fs(val)	((current->thread.current_ds) = (val))
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index 5373136c412b..619223dc9022 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -36,9 +36,6 @@
 #define KERNEL_DS   ((mm_segment_t) { ASI_P })
 #define USER_DS     ((mm_segment_t) { ASI_AIUS })	/* har har har */
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 #define get_fs() ((mm_segment_t){(current_thread_info()->current_ds)})
 #define get_ds() (KERNEL_DS)
 
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index a77369e91e54..730073326b46 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -24,9 +24,6 @@
 #include <asm/processor.h>
 #include <asm/page.h>
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index ea148313570f..8dffb8b1d328 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -13,9 +13,6 @@
 #include <asm/smap.h>
 #include <asm/extable.h>
 
-#define VERIFY_READ 0
-#define VERIFY_WRITE 1
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/xtensa/include/asm/asm-uaccess.h b/arch/xtensa/include/asm/asm-uaccess.h
index a7a110039786..dfdf9fae1f84 100644
--- a/arch/xtensa/include/asm/asm-uaccess.h
+++ b/arch/xtensa/include/asm/asm-uaccess.h
@@ -19,9 +19,6 @@
 #include <linux/errno.h>
 #include <asm/types.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/processor.h>
diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h
index 848a3d736bcb..dd6b13649aad 100644
--- a/arch/xtensa/include/asm/uaccess.h
+++ b/arch/xtensa/include/asm/uaccess.h
@@ -20,9 +20,6 @@
 #include <linux/prefetch.h>
 #include <asm/types.h>
 
-#define VERIFY_READ    0
-#define VERIFY_WRITE   1
-
 #include <linux/sched.h>
 
 /*
diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index cc6bb319e464..d7c17bfd4601 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -35,9 +35,6 @@ static inline void set_fs(mm_segment_t fs)
 #define segment_eq(a, b) ((a).seg == (b).seg)
 #endif
 
-#define VERIFY_READ	0
-#define VERIFY_WRITE	1
-
 #define access_ok(type, addr, size) __access_ok((unsigned long)(addr),(size))
 
 /*
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index f30c187ed785..b660f37beaf5 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -2,6 +2,10 @@
 #define __LINUX_UACCESS_H__
 
 #include <linux/sched.h>
+
+#define VERIFY_READ 0
+#define VERIFY_WRITE 1
+
 #include <asm/uaccess.h>
 
 static __always_inline void pagefault_disabled_inc(void)
-- 
cgit v1.3-6-gb490


From af1d5b37d6211c814fac0d5d0b71ec695618054a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 27 Dec 2016 18:14:09 -0500
Subject: uaccess: drop duplicate includes from asm/uaccess.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/include/asm/uaccess.h      | 4 ----
 arch/arc/include/asm/uaccess.h        | 2 --
 arch/arm/include/asm/uaccess.h        | 2 --
 arch/arm64/include/asm/uaccess.h      | 2 --
 arch/avr32/include/asm/uaccess.h      | 3 ---
 arch/blackfin/include/asm/uaccess.h   | 1 -
 arch/cris/include/asm/uaccess.h       | 2 --
 arch/frv/include/asm/uaccess.h        | 1 -
 arch/hexagon/include/asm/uaccess.h    | 1 -
 arch/ia64/include/asm/uaccess.h       | 2 --
 arch/m32r/include/asm/uaccess.h       | 2 --
 arch/m68k/include/asm/uaccess_mm.h    | 2 --
 arch/m68k/include/asm/uaccess_no.h    | 1 -
 arch/metag/include/asm/uaccess.h      | 1 -
 arch/microblaze/include/asm/uaccess.h | 2 --
 arch/mips/include/asm/uaccess.h       | 2 --
 arch/mn10300/include/asm/uaccess.h    | 2 --
 arch/nios2/include/asm/uaccess.h      | 2 --
 arch/openrisc/include/asm/uaccess.h   | 2 --
 arch/parisc/include/asm/uaccess.h     | 2 --
 arch/powerpc/include/asm/uaccess.h    | 2 --
 arch/s390/include/asm/uaccess.h       | 2 --
 arch/score/include/asm/uaccess.h      | 2 --
 arch/sh/include/asm/uaccess.h         | 2 --
 arch/sparc/include/asm/uaccess_32.h   | 2 --
 arch/sparc/include/asm/uaccess_64.h   | 2 --
 arch/tile/include/asm/uaccess.h       | 1 -
 arch/um/include/asm/uaccess.h         | 1 -
 arch/unicore32/include/asm/uaccess.h  | 3 ---
 arch/x86/include/asm/uaccess.h        | 2 --
 arch/x86/include/asm/uaccess_32.h     | 2 --
 arch/x86/include/asm/uaccess_64.h     | 1 -
 arch/xtensa/include/asm/uaccess.h     | 3 ---
 include/asm-generic/uaccess.h         | 1 -
 include/linux/uaccess.h               | 1 +
 35 files changed, 1 insertion(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h
index a37267a5d399..77c55ce89936 100644
--- a/arch/alpha/include/asm/uaccess.h
+++ b/arch/alpha/include/asm/uaccess.h
@@ -1,10 +1,6 @@
 #ifndef __ALPHA_UACCESS_H
 #define __ALPHA_UACCESS_H
 
-#include <linux/errno.h>
-#include <linux/sched.h>
-
-
 /*
  * The fs value determines whether argument validity checking should be
  * performed or not.  If get_fs() == USER_DS, checking is performed, with
diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h
index 41faf17cd28d..0431f5668354 100644
--- a/arch/arc/include/asm/uaccess.h
+++ b/arch/arc/include/asm/uaccess.h
@@ -24,8 +24,6 @@
 #ifndef _ASM_ARC_UACCESS_H
 #define _ASM_ARC_UACCESS_H
 
-#include <linux/sched.h>
-#include <asm/errno.h>
 #include <linux/string.h>	/* for generic string functions */
 
 
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index a13f39b3e9f8..9677a7cf7987 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -12,8 +12,6 @@
  * User space memory access functions
  */
 #include <linux/string.h>
-#include <linux/thread_info.h>
-#include <asm/errno.h>
 #include <asm/memory.h>
 #include <asm/domain.h>
 #include <asm/unified.h>
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index f5e1e090b4d2..7c514e10a08e 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -28,11 +28,9 @@
 #include <linux/bitops.h>
 #include <linux/kasan-checks.h>
 #include <linux/string.h>
-#include <linux/thread_info.h>
 
 #include <asm/cpufeature.h>
 #include <asm/ptrace.h>
-#include <asm/errno.h>
 #include <asm/memory.h>
 #include <asm/compiler.h>
 
diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h
index 1c7f234385fc..7ca5cb33369b 100644
--- a/arch/avr32/include/asm/uaccess.h
+++ b/arch/avr32/include/asm/uaccess.h
@@ -8,9 +8,6 @@
 #ifndef __ASM_AVR32_UACCESS_H
 #define __ASM_AVR32_UACCESS_H
 
-#include <linux/errno.h>
-#include <linux/sched.h>
-
 typedef struct {
 	unsigned int is_user_space;
 } mm_segment_t;
diff --git a/arch/blackfin/include/asm/uaccess.h b/arch/blackfin/include/asm/uaccess.h
index d9a91108964f..c9fedc3be30c 100644
--- a/arch/blackfin/include/asm/uaccess.h
+++ b/arch/blackfin/include/asm/uaccess.h
@@ -12,7 +12,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 
diff --git a/arch/cris/include/asm/uaccess.h b/arch/cris/include/asm/uaccess.h
index 82bfcb5e2c9b..bb3004a2b2f7 100644
--- a/arch/cris/include/asm/uaccess.h
+++ b/arch/cris/include/asm/uaccess.h
@@ -16,8 +16,6 @@
 #define _CRIS_UACCESS_H
 
 #ifndef __ASSEMBLY__
-#include <linux/sched.h>
-#include <linux/errno.h>
 #include <asm/processor.h>
 #include <asm/page.h>
 
diff --git a/arch/frv/include/asm/uaccess.h b/arch/frv/include/asm/uaccess.h
index 9e01bd798a03..55b3a69c6c53 100644
--- a/arch/frv/include/asm/uaccess.h
+++ b/arch/frv/include/asm/uaccess.h
@@ -15,7 +15,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/segment.h>
 #include <asm/sections.h>
diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h
index 21f63593e2b6..3a7f818e5ef7 100644
--- a/arch/hexagon/include/asm/uaccess.h
+++ b/arch/hexagon/include/asm/uaccess.h
@@ -23,7 +23,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm/segment.h>
 #include <asm/sections.h>
diff --git a/arch/ia64/include/asm/uaccess.h b/arch/ia64/include/asm/uaccess.h
index c60ff6cc8dbd..d471d1a1afd0 100644
--- a/arch/ia64/include/asm/uaccess.h
+++ b/arch/ia64/include/asm/uaccess.h
@@ -33,8 +33,6 @@
  */
 
 #include <linux/compiler.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
 #include <linux/page-flags.h>
 #include <linux/mm.h>
 
diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h
index 7d993a837e39..96b0efdb5f22 100644
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -11,8 +11,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <asm/page.h>
 #include <asm/setup.h>
 
diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h
index fa84e9c6e8f4..fb72b710759e 100644
--- a/arch/m68k/include/asm/uaccess_mm.h
+++ b/arch/m68k/include/asm/uaccess_mm.h
@@ -5,9 +5,7 @@
  * User space memory access functions
  */
 #include <linux/compiler.h>
-#include <linux/errno.h>
 #include <linux/types.h>
-#include <linux/sched.h>
 #include <asm/segment.h>
 
 /* We let the MMU do all checking */
diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h
index fab489a25b95..e77ce66c14d5 100644
--- a/arch/m68k/include/asm/uaccess_no.h
+++ b/arch/m68k/include/asm/uaccess_no.h
@@ -4,7 +4,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 
diff --git a/arch/metag/include/asm/uaccess.h b/arch/metag/include/asm/uaccess.h
index 46c1f6c54103..7fc5277ae71f 100644
--- a/arch/metag/include/asm/uaccess.h
+++ b/arch/metag/include/asm/uaccess.h
@@ -4,7 +4,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 
 /*
  * The fs value determines whether argument validity checking should be
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 70cf5f3dfae3..a3c0a06d7848 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -15,8 +15,6 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/sched.h> /* RLIMIT_FSIZE */
 #include <linux/mm.h>
 
 #include <asm/mmu.h>
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index a058c04b8dd4..dd25b312c973 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -12,8 +12,6 @@
 #define _ASM_UACCESS_H
 
 #include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm-eva.h>
 #include <asm/extable.h>
diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h
index 3e16850c4ccd..2da7b0fed4aa 100644
--- a/arch/mn10300/include/asm/uaccess.h
+++ b/arch/mn10300/include/asm/uaccess.h
@@ -14,10 +14,8 @@
 /*
  * User space memory access functions
  */
-#include <linux/thread_info.h>
 #include <linux/kernel.h>
 #include <asm/page.h>
-#include <asm/errno.h>
 
 /*
  * The fs value determines whether argument validity checking should be
diff --git a/arch/nios2/include/asm/uaccess.h b/arch/nios2/include/asm/uaccess.h
index 07fc68c3e23c..198bbf15f644 100644
--- a/arch/nios2/include/asm/uaccess.h
+++ b/arch/nios2/include/asm/uaccess.h
@@ -13,8 +13,6 @@
 #ifndef _ASM_NIOS2_UACCESS_H
 #define _ASM_NIOS2_UACCESS_H
 
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <linux/string.h>
 
 #include <asm/page.h>
diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h
index 6f88cf8bd112..0b0f60444b76 100644
--- a/arch/openrisc/include/asm/uaccess.h
+++ b/arch/openrisc/include/asm/uaccess.h
@@ -22,8 +22,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <linux/prefetch.h>
 #include <linux/string.h>
 #include <asm/page.h>
diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h
index 598b52e5aa03..a0b461336b6a 100644
--- a/arch/parisc/include/asm/uaccess.h
+++ b/arch/parisc/include/asm/uaccess.h
@@ -6,12 +6,10 @@
  */
 #include <asm/page.h>
 #include <asm/cache.h>
-#include <asm/errno.h>
 #include <asm-generic/uaccess-unaligned.h>
 
 #include <linux/bug.h>
 #include <linux/string.h>
-#include <linux/thread_info.h>
 
 #define KERNEL_DS	((mm_segment_t){0})
 #define USER_DS 	((mm_segment_t){1})
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 81307633c33f..2ec70aa1cc5d 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -4,8 +4,6 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
-#include <linux/sched.h>
-#include <linux/errno.h>
 #include <asm/asm-compat.h>
 #include <asm/ppc_asm.h>
 #include <asm/processor.h>
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 0e1f515d239b..9e9a5e8d6cf6 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -12,8 +12,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
-#include <linux/errno.h>
 #include <asm/processor.h>
 #include <asm/ctl_reg.h>
 
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h
index 51914244e867..7a6c6982420a 100644
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -2,8 +2,6 @@
 #define __SCORE_UACCESS_H
 
 #include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <asm/extable.h>
 
 #define get_ds()		(KERNEL_DS)
diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h
index 6b66d67c21d2..89a28dfbabfa 100644
--- a/arch/sh/include/asm/uaccess.h
+++ b/arch/sh/include/asm/uaccess.h
@@ -1,8 +1,6 @@
 #ifndef __ASM_SH_UACCESS_H
 #define __ASM_SH_UACCESS_H
 
-#include <linux/errno.h>
-#include <linux/sched.h>
 #include <asm/segment.h>
 
 #define __addr_ok(addr) \
diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h
index d8857f5fafad..b10f7d626f0e 100644
--- a/arch/sparc/include/asm/uaccess_32.h
+++ b/arch/sparc/include/asm/uaccess_32.h
@@ -9,9 +9,7 @@
 
 #ifdef __KERNEL__
 #include <linux/compiler.h>
-#include <linux/sched.h>
 #include <linux/string.h>
-#include <linux/errno.h>
 #endif
 
 #ifndef __ASSEMBLY__
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index 619223dc9022..d76362cad80f 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -6,10 +6,8 @@
  */
 
 #ifdef __KERNEL__
-#include <linux/errno.h>
 #include <linux/compiler.h>
 #include <linux/string.h>
-#include <linux/thread_info.h>
 #include <asm/asi.h>
 #include <asm/spitfire.h>
 #include <asm-generic/uaccess-unaligned.h>
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index 730073326b46..14ea3d1ca2c7 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -18,7 +18,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <asm-generic/uaccess-unaligned.h>
 #include <asm/processor.h>
diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h
index 3705620ca298..076bdcb0c2ad 100644
--- a/arch/um/include/asm/uaccess.h
+++ b/arch/um/include/asm/uaccess.h
@@ -7,7 +7,6 @@
 #ifndef __UM_UACCESS_H
 #define __UM_UACCESS_H
 
-#include <asm/thread_info.h>
 #include <asm/elf.h>
 
 #define __under_task_size(addr, size) \
diff --git a/arch/unicore32/include/asm/uaccess.h b/arch/unicore32/include/asm/uaccess.h
index 897e11ad8124..1622f37a0514 100644
--- a/arch/unicore32/include/asm/uaccess.h
+++ b/arch/unicore32/include/asm/uaccess.h
@@ -12,9 +12,6 @@
 #ifndef __UNICORE_UACCESS_H__
 #define __UNICORE_UACCESS_H__
 
-#include <linux/thread_info.h>
-#include <linux/errno.h>
-
 #include <asm/memory.h>
 
 #define __copy_from_user	__copy_from_user
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 8dffb8b1d328..0522d88a7f90 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -3,10 +3,8 @@
 /*
  * User space memory access functions
  */
-#include <linux/errno.h>
 #include <linux/compiler.h>
 #include <linux/kasan-checks.h>
-#include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm.h>
 #include <asm/page.h>
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 7d3bdd1ed697..5268ecceea96 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -4,8 +4,6 @@
 /*
  * User space memory access functions
  */
-#include <linux/errno.h>
-#include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm.h>
 #include <asm/page.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 673059a109fe..8ddadd93639e 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -5,7 +5,6 @@
  * User space memory access functions
  */
 #include <linux/compiler.h>
-#include <linux/errno.h>
 #include <linux/lockdep.h>
 #include <linux/kasan-checks.h>
 #include <asm/alternative.h>
diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h
index dd6b13649aad..bd8861c811ef 100644
--- a/arch/xtensa/include/asm/uaccess.h
+++ b/arch/xtensa/include/asm/uaccess.h
@@ -16,12 +16,9 @@
 #ifndef _XTENSA_UACCESS_H
 #define _XTENSA_UACCESS_H
 
-#include <linux/errno.h>
 #include <linux/prefetch.h>
 #include <asm/types.h>
 
-#include <linux/sched.h>
-
 /*
  * The fs value determines whether argument validity checking should
  * be performed or not.  If get_fs() == USER_DS, checking is
diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index d7c17bfd4601..d20955e495b3 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -6,7 +6,6 @@
  * on any machine that has kernel and user data in the same
  * address space, e.g. all NOMMU machines.
  */
-#include <linux/sched.h>
 #include <linux/string.h>
 
 #include <asm/segment.h>
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index b660f37beaf5..b786ca2419b4 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -2,6 +2,7 @@
 #define __LINUX_UACCESS_H__
 
 #include <linux/sched.h>
+#include <linux/thread_info.h>
 
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
-- 
cgit v1.3-6-gb490


From 387ad9674b0013c8756ad20d854ff005b0c313ad Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Mon, 20 Feb 2017 12:19:00 +0200
Subject: kernel: convert cgroup_namespace.count from atomic_t to refcount_t

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h    | 7 ++++---
 kernel/cgroup/cgroup.c    | 2 +-
 kernel/cgroup/namespace.c | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f6b43fbb141c..44129793c7b8 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,7 @@
 #include <linux/ns_common.h>
 #include <linux/nsproxy.h>
 #include <linux/user_namespace.h>
+#include <linux/refcount.h>
 
 #include <linux/cgroup-defs.h>
 
@@ -640,7 +641,7 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
 #endif	/* CONFIG_CGROUP_DATA */
 
 struct cgroup_namespace {
-	atomic_t		count;
+	refcount_t		count;
 	struct ns_common	ns;
 	struct user_namespace	*user_ns;
 	struct ucounts		*ucounts;
@@ -675,12 +676,12 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
 static inline void get_cgroup_ns(struct cgroup_namespace *ns)
 {
 	if (ns)
-		atomic_inc(&ns->count);
+		refcount_inc(&ns->count);
 }
 
 static inline void put_cgroup_ns(struct cgroup_namespace *ns)
 {
-	if (ns && atomic_dec_and_test(&ns->count))
+	if (ns && refcount_dec_and_test(&ns->count))
 		free_cgroup_ns(ns);
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0125589c7428..8ee78688e36d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly;
 
 /* cgroup namespace for init task */
 struct cgroup_namespace init_cgroup_ns = {
-	.count		= { .counter = 2, },
+	.count		= REFCOUNT_INIT(2),
 	.user_ns	= &init_user_ns,
 	.ns.ops		= &cgroupns_operations,
 	.ns.inum	= PROC_CGROUP_INIT_INO,
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 96d38dab6fb2..66129eb4371d 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
 		kfree(new_ns);
 		return ERR_PTR(ret);
 	}
-	atomic_set(&new_ns->count, 1);
+	refcount_set(&new_ns->count, 1);
 	new_ns->ns.ops = &cgroupns_operations;
 	return new_ns;
 }
-- 
cgit v1.3-6-gb490


From fa5923cea8da3b5d4eb943651922b327b1df673c Mon Sep 17 00:00:00 2001
From: Michał Kępień <kernel@kempniu.pl>
Date: Fri, 17 Feb 2017 08:57:48 +0100
Subject: ALSA: hda - use dell_micmute_led_set() instead of
 dell_app_wmi_led_set()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dell_app_wmi_led_set() method introduced in commit db6d8cc00773
("dell-led: add mic mute led interface") was implemented as an easily
extensible entry point for other modules to set the state of various
LEDs.  However, almost three years later it is still only used to
control the mic mute LED, so it will be replaced with direct calls to
dell_micmute_led_set().

Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Tested-by: Alex Hung <alex.hung@canonical.com>
Reviewed-by: Pali Rohár <pali.rohar@gmail.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/leds/dell-led.c         | 20 ++------------------
 include/linux/dell-led.h        |  6 +-----
 sound/pci/hda/dell_wmi_helper.c | 12 ++++++------
 3 files changed, 9 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/dell-led.c b/drivers/leds/dell-led.c
index e8e8f67224c1..f9002d9bb757 100644
--- a/drivers/leds/dell-led.c
+++ b/drivers/leds/dell-led.c
@@ -46,7 +46,7 @@ MODULE_ALIAS("wmi:" DELL_LED_BIOS_GUID);
 #define GLOBAL_MIC_MUTE_ENABLE	0x364
 #define GLOBAL_MIC_MUTE_DISABLE	0x365
 
-static int dell_micmute_led_set(int state)
+int dell_micmute_led_set(int state)
 {
 	struct calling_interface_buffer *buffer;
 	struct calling_interface_token *token;
@@ -69,23 +69,7 @@ static int dell_micmute_led_set(int state)
 
 	return state;
 }
-
-int dell_app_wmi_led_set(int whichled, int on)
-{
-	int state = 0;
-
-	switch (whichled) {
-	case DELL_LED_MICMUTE:
-		state = dell_micmute_led_set(on);
-		break;
-	default:
-		pr_warn("led type %x is not supported\n", whichled);
-		break;
-	}
-
-	return state;
-}
-EXPORT_SYMBOL_GPL(dell_app_wmi_led_set);
+EXPORT_SYMBOL_GPL(dell_micmute_led_set);
 
 struct bios_args {
 	unsigned char length;
diff --git a/include/linux/dell-led.h b/include/linux/dell-led.h
index 7009b8bec77b..3f033c48071e 100644
--- a/include/linux/dell-led.h
+++ b/include/linux/dell-led.h
@@ -1,10 +1,6 @@
 #ifndef __DELL_LED_H__
 #define __DELL_LED_H__
 
-enum {
-	DELL_LED_MICMUTE,
-};
-
-int dell_app_wmi_led_set(int whichled, int on);
+int dell_micmute_led_set(int on);
 
 #endif
diff --git a/sound/pci/hda/dell_wmi_helper.c b/sound/pci/hda/dell_wmi_helper.c
index 19d41da79f93..e128c8096772 100644
--- a/sound/pci/hda/dell_wmi_helper.c
+++ b/sound/pci/hda/dell_wmi_helper.c
@@ -6,7 +6,7 @@
 #include <linux/dell-led.h>
 
 static int dell_led_value;
-static int (*dell_led_set_func)(int, int);
+static int (*dell_led_set_func)(int);
 static void (*dell_old_cap_hook)(struct hda_codec *,
 			         struct snd_kcontrol *,
 				 struct snd_ctl_elem_value *);
@@ -27,7 +27,7 @@ static void update_dell_wmi_micmute_led(struct hda_codec *codec,
 			return;
 		dell_led_value = val;
 		if (dell_led_set_func)
-			dell_led_set_func(DELL_LED_MICMUTE, dell_led_value);
+			dell_led_set_func(dell_led_value);
 	}
 }
 
@@ -40,14 +40,14 @@ static void alc_fixup_dell_wmi(struct hda_codec *codec,
 
 	if (action == HDA_FIXUP_ACT_PROBE) {
 		if (!dell_led_set_func)
-			dell_led_set_func = symbol_request(dell_app_wmi_led_set);
+			dell_led_set_func = symbol_request(dell_micmute_led_set);
 		if (!dell_led_set_func) {
-			codec_warn(codec, "Failed to find dell wmi symbol dell_app_wmi_led_set\n");
+			codec_warn(codec, "Failed to find dell wmi symbol dell_micmute_led_set\n");
 			return;
 		}
 
 		removefunc = true;
-		if (dell_led_set_func(DELL_LED_MICMUTE, false) >= 0) {
+		if (dell_led_set_func(false) >= 0) {
 			dell_led_value = 0;
 			if (spec->gen.num_adc_nids > 1 && !spec->gen.dyn_adc_switch)
 				codec_dbg(codec, "Skipping micmute LED control due to several ADCs");
@@ -61,7 +61,7 @@ static void alc_fixup_dell_wmi(struct hda_codec *codec,
 	}
 
 	if (dell_led_set_func && (action == HDA_FIXUP_ACT_FREE || removefunc)) {
-		symbol_put(dell_app_wmi_led_set);
+		symbol_put(dell_micmute_led_set);
 		dell_led_set_func = NULL;
 		dell_old_cap_hook = NULL;
 	}
-- 
cgit v1.3-6-gb490


From 442c609830e98919faa78b797e9b89c53bab9cbf Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Mon, 6 Mar 2017 06:19:44 +0100
Subject: leds: core: add OF variants of LED registering functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These new functions allow passing an additional device_node argument
that will be internally set for created LED device. Thanks to this LED
core code and triggers will be able to access DT node for reading extra
info.

The easiest solution for achieving this was reworking old functions to
more generic ones & adding simple defines for API compatibility.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 drivers/leds/led-class.c | 26 ++++++++++++++++----------
 include/linux/leds.h     | 14 ++++++++++----
 2 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c
index f2b0a80a62b4..b0e2d55acbd6 100644
--- a/drivers/leds/led-class.c
+++ b/drivers/leds/led-class.c
@@ -244,11 +244,14 @@ static int led_classdev_next_name(const char *init_name, char *name,
 }
 
 /**
- * led_classdev_register - register a new object of led_classdev class.
- * @parent: The device to register.
+ * of_led_classdev_register - register a new object of led_classdev class.
+ *
+ * @parent: parent of LED device
  * @led_cdev: the led_classdev structure for this device.
+ * @np: DT node describing this LED
  */
-int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
+int of_led_classdev_register(struct device *parent, struct device_node *np,
+			    struct led_classdev *led_cdev)
 {
 	char name[LED_MAX_NAME_SIZE];
 	int ret;
@@ -261,6 +264,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
 				led_cdev, led_cdev->groups, "%s", name);
 	if (IS_ERR(led_cdev->dev))
 		return PTR_ERR(led_cdev->dev);
+	led_cdev->dev->of_node = np;
 
 	if (ret)
 		dev_warn(parent, "Led %s renamed to %s due to name collision",
@@ -303,7 +307,7 @@ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(led_classdev_register);
+EXPORT_SYMBOL_GPL(of_led_classdev_register);
 
 /**
  * led_classdev_unregister - unregisters a object of led_properties class.
@@ -348,12 +352,14 @@ static void devm_led_classdev_release(struct device *dev, void *res)
 }
 
 /**
- * devm_led_classdev_register - resource managed led_classdev_register()
- * @parent: The device to register.
+ * devm_of_led_classdev_register - resource managed led_classdev_register()
+ *
+ * @parent: parent of LED device
  * @led_cdev: the led_classdev structure for this device.
  */
-int devm_led_classdev_register(struct device *parent,
-			       struct led_classdev *led_cdev)
+int devm_of_led_classdev_register(struct device *parent,
+				  struct device_node *np,
+				  struct led_classdev *led_cdev)
 {
 	struct led_classdev **dr;
 	int rc;
@@ -362,7 +368,7 @@ int devm_led_classdev_register(struct device *parent,
 	if (!dr)
 		return -ENOMEM;
 
-	rc = led_classdev_register(parent, led_cdev);
+	rc = of_led_classdev_register(parent, np, led_cdev);
 	if (rc) {
 		devres_free(dr);
 		return rc;
@@ -373,7 +379,7 @@ int devm_led_classdev_register(struct device *parent,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(devm_led_classdev_register);
+EXPORT_SYMBOL_GPL(devm_of_led_classdev_register);
 
 static int devm_led_classdev_match(struct device *dev, void *res, void *data)
 {
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 38c0bd7ca107..64c56d454f7d 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -122,10 +122,16 @@ struct led_classdev {
 	struct mutex		led_access;
 };
 
-extern int led_classdev_register(struct device *parent,
-				 struct led_classdev *led_cdev);
-extern int devm_led_classdev_register(struct device *parent,
-				      struct led_classdev *led_cdev);
+extern int of_led_classdev_register(struct device *parent,
+				    struct device_node *np,
+				    struct led_classdev *led_cdev);
+#define led_classdev_register(parent, led_cdev)				\
+	of_led_classdev_register(parent, NULL, led_cdev)
+extern int devm_of_led_classdev_register(struct device *parent,
+					 struct device_node *np,
+					 struct led_classdev *led_cdev);
+#define devm_led_classdev_register(parent, led_cdev)			\
+	devm_of_led_classdev_register(parent, NULL, led_cdev)
 extern void led_classdev_unregister(struct led_classdev *led_cdev);
 extern void devm_led_classdev_unregister(struct device *parent,
 					 struct led_classdev *led_cdev);
-- 
cgit v1.3-6-gb490


From 4b9502e63b5e2b1b5ef491919d3219b9440fe0b3 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Wed, 8 Mar 2017 10:00:40 +0200
Subject: kernel: convert css_set.refcount from atomic_t to refcount_t

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h     | 3 ++-
 kernel/cgroup/cgroup-internal.h | 5 +++--
 kernel/cgroup/cgroup-v1.c       | 4 ++--
 kernel/cgroup/cgroup.c          | 6 +++---
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 6a3f850cabab..c74b78ecd583 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -13,6 +13,7 @@
 #include <linux/wait.h>
 #include <linux/mutex.h>
 #include <linux/rcupdate.h>
+#include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
@@ -156,7 +157,7 @@ struct css_set {
 	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
 
 	/* reference count */
-	atomic_t refcount;
+	refcount_t refcount;
 
 	/* the default cgroup associated with this css_set */
 	struct cgroup *dfl_cgrp;
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 9203bfb05603..4567f12b02e9 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -5,6 +5,7 @@
 #include <linux/kernfs.h>
 #include <linux/workqueue.h>
 #include <linux/list.h>
+#include <linux/refcount.h>
 
 /*
  * A cgroup can be associated with multiple css_sets as different tasks may
@@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset)
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
 	 * rwlock
 	 */
-	if (atomic_add_unless(&cset->refcount, -1, 1))
+	if (refcount_dec_not_one(&cset->refcount))
 		return;
 
 	spin_lock_irqsave(&css_set_lock, flags);
@@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset)
  */
 static inline void get_css_set(struct css_set *cset)
 {
-	atomic_inc(&cset->refcount);
+	refcount_inc(&cset->refcount);
 }
 
 bool cgroup_ssid_enabled(int ssid);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 56eba9caa632..c4a68c438fde 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 
 	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
-		count += atomic_read(&link->cset->refcount);
+		count += refcount_read(&link->cset->refcount);
 	spin_unlock_irq(&css_set_lock);
 	return count;
 }
@@ -1286,7 +1286,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
 	u64 count;
 
 	rcu_read_lock();
-	count = atomic_read(&task_css_set(current)->refcount);
+	count = refcount_read(&task_css_set(current)->refcount);
 	rcu_read_unlock();
 	return count;
 }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8ee78688e36d..b1cc1c306668 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -554,7 +554,7 @@ EXPORT_SYMBOL_GPL(of_css);
  * haven't been created.
  */
 struct css_set init_css_set = {
-	.refcount		= ATOMIC_INIT(1),
+	.refcount		= REFCOUNT_INIT(1),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
 	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
@@ -724,7 +724,7 @@ void put_css_set_locked(struct css_set *cset)
 
 	lockdep_assert_held(&css_set_lock);
 
-	if (!atomic_dec_and_test(&cset->refcount))
+	if (!refcount_dec_and_test(&cset->refcount))
 		return;
 
 	/* This css_set is dead. unlink it and release cgroup and css refs */
@@ -977,7 +977,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 		return NULL;
 	}
 
-	atomic_set(&cset->refcount, 1);
+	refcount_set(&cset->refcount, 1);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
 	INIT_LIST_HEAD(&cset->task_iters);
-- 
cgit v1.3-6-gb490


From bd174169c7a12a37b3b4aa2221f084ade010b182 Mon Sep 17 00:00:00 2001
From: David Windsor <dwindsor@gmail.com>
Date: Fri, 10 Mar 2017 10:34:12 -0500
Subject: locking/refcount: Add refcount_t API kernel-doc comments

Signed-off-by: David Windsor <dwindsor@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: elena.reshetova@intel.com
Cc: kernel-hardening@lists.openwall.com
Link: http://lkml.kernel.org/r/1489160052-20293-1-git-send-email-dwindsor@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/refcount.h |  19 ++++++++
 lib/refcount.c           | 122 ++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 129 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0023fee4bbbc..b34aa649d204 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -6,17 +6,36 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 
+/**
+ * refcount_t - variant of atomic_t specialized for reference counts
+ * @refs: atomic_t counter field
+ *
+ * The counter saturates at UINT_MAX and will not move once
+ * there. This avoids wrapping the counter and causing 'spurious'
+ * use-after-free bugs.
+ */
 typedef struct refcount_struct {
 	atomic_t refs;
 } refcount_t;
 
 #define REFCOUNT_INIT(n)	{ .refs = ATOMIC_INIT(n), }
 
+/**
+ * refcount_set - set a refcount's value
+ * @r: the refcount
+ * @n: value to which the refcount will be set
+ */
 static inline void refcount_set(refcount_t *r, unsigned int n)
 {
 	atomic_set(&r->refs, n);
 }
 
+/**
+ * refcount_read - get a refcount's value
+ * @r: the refcount
+ *
+ * Return: the refcount's value
+ */
 static inline unsigned int refcount_read(const refcount_t *r)
 {
 	return atomic_read(&r->refs);
diff --git a/lib/refcount.c b/lib/refcount.c
index aa09ad3c30b0..8e206ce5609b 100644
--- a/lib/refcount.c
+++ b/lib/refcount.c
@@ -37,6 +37,24 @@
 #include <linux/refcount.h>
 #include <linux/bug.h>
 
+/**
+ * refcount_add_not_zero - add a value to a refcount unless it is 0
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time.  In these
+ * cases, refcount_inc(), or one of its variants, should instead be used to
+ * increment a reference count.
+ *
+ * Return: false if the passed refcount is 0, true otherwise
+ */
 bool refcount_add_not_zero(unsigned int i, refcount_t *r)
 {
 	unsigned int old, new, val = atomic_read(&r->refs);
@@ -64,18 +82,39 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_add_not_zero);
 
+/**
+ * refcount_add - add a value to a refcount
+ * @i: the value to add to the refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_add(), but will saturate at UINT_MAX and WARN.
+ *
+ * Provides no memory ordering, it is assumed the caller has guaranteed the
+ * object memory to be stable (RCU, etc.). It does provide a control dependency
+ * and thereby orders future stores. See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time.  In these
+ * cases, refcount_inc(), or one of its variants, should instead be used to
+ * increment a reference count.
+ */
 void refcount_add(unsigned int i, refcount_t *r)
 {
 	WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n");
 }
 EXPORT_SYMBOL_GPL(refcount_add);
 
-/*
- * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN.
+/**
+ * refcount_inc_not_zero - increment a refcount unless it is 0
+ * @r: the refcount to increment
+ *
+ * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN.
  *
  * Provides no memory ordering, it is assumed the caller has guaranteed the
  * object memory to be stable (RCU, etc.). It does provide a control dependency
  * and thereby orders future stores. See the comment on top.
+ *
+ * Return: true if the increment was successful, false otherwise
  */
 bool refcount_inc_not_zero(refcount_t *r)
 {
@@ -103,11 +142,17 @@ bool refcount_inc_not_zero(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_inc_not_zero);
 
-/*
- * Similar to atomic_inc(), will saturate at UINT_MAX and WARN.
+/**
+ * refcount_inc - increment a refcount
+ * @r: the refcount to increment
+ *
+ * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN.
  *
  * Provides no memory ordering, it is assumed the caller already has a
- * reference on the object, will WARN when this is not so.
+ * reference on the object.
+ *
+ * Will WARN if the refcount is 0, as this represents a possible use-after-free
+ * condition.
  */
 void refcount_inc(refcount_t *r)
 {
@@ -115,6 +160,26 @@ void refcount_inc(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_inc);
 
+/**
+ * refcount_sub_and_test - subtract from a refcount and test if it is 0
+ * @i: amount to subtract from the refcount
+ * @r: the refcount
+ *
+ * Similar to atomic_dec_and_test(), but it will WARN, return false and
+ * ultimately leak on underflow and will fail to decrement when saturated
+ * at UINT_MAX.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ *
+ * Use of this function is not recommended for the normal reference counting
+ * use case in which references are taken and released one at a time.  In these
+ * cases, refcount_dec(), or one of its variants, should instead be used to
+ * decrement a reference count.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
+ */
 bool refcount_sub_and_test(unsigned int i, refcount_t *r)
 {
 	unsigned int old, new, val = atomic_read(&r->refs);
@@ -140,13 +205,18 @@ bool refcount_sub_and_test(unsigned int i, refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_sub_and_test);
 
-/*
+/**
+ * refcount_dec_and_test - decrement a refcount and test if it is 0
+ * @r: the refcount
+ *
  * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to
  * decrement when saturated at UINT_MAX.
  *
  * Provides release memory ordering, such that prior loads and stores are done
  * before, and provides a control dependency such that free() must come after.
  * See the comment on top.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
  */
 bool refcount_dec_and_test(refcount_t *r)
 {
@@ -154,21 +224,26 @@ bool refcount_dec_and_test(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_dec_and_test);
 
-/*
+/**
+ * refcount_dec - decrement a refcount
+ * @r: the refcount
+ *
  * Similar to atomic_dec(), it will WARN on underflow and fail to decrement
  * when saturated at UINT_MAX.
  *
  * Provides release memory ordering, such that prior loads and stores are done
  * before.
  */
-
 void refcount_dec(refcount_t *r)
 {
 	WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n");
 }
 EXPORT_SYMBOL_GPL(refcount_dec);
 
-/*
+/**
+ * refcount_dec_if_one - decrement a refcount if it is 1
+ * @r: the refcount
+ *
  * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the
  * success thereof.
  *
@@ -178,6 +253,8 @@ EXPORT_SYMBOL_GPL(refcount_dec);
  * It can be used like a try-delete operator; this explicit case is provided
  * and not cmpxchg in generic, because that would allow implementing unsafe
  * operations.
+ *
+ * Return: true if the resulting refcount is 0, false otherwise
  */
 bool refcount_dec_if_one(refcount_t *r)
 {
@@ -185,11 +262,16 @@ bool refcount_dec_if_one(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_dec_if_one);
 
-/*
+/**
+ * refcount_dec_not_one - decrement a refcount if it is not 1
+ * @r: the refcount
+ *
  * No atomic_t counterpart, it decrements unless the value is 1, in which case
  * it will return false.
  *
  * Was often done like: atomic_add_unless(&var, -1, 1)
+ *
+ * Return: true if the decrement operation was successful, false otherwise
  */
 bool refcount_dec_not_one(refcount_t *r)
 {
@@ -219,13 +301,21 @@ bool refcount_dec_not_one(refcount_t *r)
 }
 EXPORT_SYMBOL_GPL(refcount_dec_not_one);
 
-/*
+/**
+ * refcount_dec_and_mutex_lock - return holding mutex if able to decrement
+ *                               refcount to 0
+ * @r: the refcount
+ * @lock: the mutex to be locked
+ *
  * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
  * to decrement when saturated at UINT_MAX.
  *
  * Provides release memory ordering, such that prior loads and stores are done
  * before, and provides a control dependency such that free() must come after.
  * See the comment on top.
+ *
+ * Return: true and hold mutex if able to decrement refcount to 0, false
+ *         otherwise
  */
 bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
 {
@@ -242,13 +332,21 @@ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock)
 }
 EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock);
 
-/*
+/**
+ * refcount_dec_and_lock - return holding spinlock if able to decrement
+ *                         refcount to 0
+ * @r: the refcount
+ * @lock: the spinlock to be locked
+ *
  * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to
  * decrement when saturated at UINT_MAX.
  *
  * Provides release memory ordering, such that prior loads and stores are done
  * before, and provides a control dependency such that free() must come after.
  * See the comment on top.
+ *
+ * Return: true and hold spinlock if able to decrement refcount to 0, false
+ *         otherwise
  */
 bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock)
 {
-- 
cgit v1.3-6-gb490


From 1b028f784e8c341e762c264f70dc0ca1418c8b7a Mon Sep 17 00:00:00 2001
From: Dmitry Safonov <dsafonov@virtuozzo.com>
Date: Mon, 6 Mar 2017 17:17:19 +0300
Subject: x86/mm: Introduce mmap_compat_base() for 32-bit mmap()

mmap() uses a base address, from which it starts to look for a free space
for allocation.

The base address is stored in mm->mmap_base, which is calculated during
exec(). The address depends on task's size, set rlimit for stack, ASLR
randomization. The base depends on the task size and the number of random
bits which are different for 64-bit and 32bit applications.

Due to the fact, that the base address is fixed, its mmap() from a compat
(32bit) syscall issued by a 64bit task will return a address which is based
on the 64bit base address and does not fit into the 32bit address space
(4GB). The returned pointer is truncated to 32bit, which results in an
invalid address.

To solve store a seperate compat address base plus a compat legacy address
base in mm_struct. These bases are calculated at exec() time and can be
used later to address the 32bit compat mmap() issued by 64 bit
applications.

As a consequence of this change 32-bit applications issuing a 64-bit
syscall (after doing a long jump) will get a 64-bit mapping now. Before
this change 32-bit applications always got a 32bit mapping.

[ tglx: Massaged changelog and added a comment ]

Signed-off-by: Dmitry Safonov <dsafonov@virtuozzo.com>
Cc: 0x7f454c46@gmail.com
Cc: linux-mm@kvack.org
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Link: http://lkml.kernel.org/r/20170306141721.9188-4-dsafonov@virtuozzo.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/Kconfig                 |  7 +++++++
 arch/x86/Kconfig             |  1 +
 arch/x86/include/asm/elf.h   |  3 +++
 arch/x86/kernel/sys_x86_64.c | 23 ++++++++++++++++++----
 arch/x86/mm/mmap.c           | 47 ++++++++++++++++++++++++++++++++------------
 include/linux/mm_types.h     |  5 +++++
 6 files changed, 69 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index cd211a14a88f..c4d6833aacd9 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -700,6 +700,13 @@ config ARCH_MMAP_RND_COMPAT_BITS
 	  This value can be changed after boot using the
 	  /proc/sys/vm/mmap_rnd_compat_bits tunable
 
+config HAVE_ARCH_COMPAT_MMAP_BASES
+	bool
+	help
+	  This allows 64bit applications to invoke 32-bit mmap() syscall
+	  and vice-versa 32-bit applications to call 64-bit mmap().
+	  Required for applications doing different bitness syscalls.
+
 config HAVE_COPY_THREAD_TLS
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc98d5a294ee..2bab9d093b51 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -106,6 +106,7 @@ config X86
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
+	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index b908141cf0c4..ac5be5ba8527 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -303,6 +303,9 @@ static inline int mmap_is_ia32(void)
 		test_thread_flag(TIF_ADDR32));
 }
 
+extern unsigned long tasksize_32bit(void);
+extern unsigned long tasksize_64bit(void);
+
 #ifdef CONFIG_X86_32
 
 #define __STACK_RND_MASK(is32bit) (0x7ff)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 50215a4b9347..c54817baabc7 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -17,6 +17,8 @@
 #include <linux/uaccess.h>
 #include <linux/elf.h>
 
+#include <asm/elf.h>
+#include <asm/compat.h>
 #include <asm/ia32.h>
 #include <asm/syscalls.h>
 
@@ -98,6 +100,18 @@ out:
 	return error;
 }
 
+static unsigned long get_mmap_base(int is_legacy)
+{
+	struct mm_struct *mm = current->mm;
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+	if (in_compat_syscall())
+		return is_legacy ? mm->mmap_compat_legacy_base
+				 : mm->mmap_compat_base;
+#endif
+	return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
+}
+
 static void find_start_end(unsigned long flags, unsigned long *begin,
 			   unsigned long *end)
 {
@@ -114,10 +128,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
 		if (current->flags & PF_RANDOMIZE) {
 			*begin = randomize_page(*begin, 0x02000000);
 		}
-	} else {
-		*begin = current->mm->mmap_legacy_base;
-		*end = TASK_SIZE;
+		return;
 	}
+
+	*begin	= get_mmap_base(1);
+	*end	= in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
 }
 
 unsigned long
@@ -191,7 +206,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
-	info.high_limit = mm->mmap_base;
+	info.high_limit = get_mmap_base(0);
 	info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
 	if (filp) {
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 1e9cb945dca1..529ab79800af 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -36,11 +36,16 @@ struct va_alignment __read_mostly va_align = {
 	.flags = -1,
 };
 
-static inline unsigned long tasksize_32bit(void)
+unsigned long tasksize_32bit(void)
 {
 	return IA32_PAGE_OFFSET;
 }
 
+unsigned long tasksize_64bit(void)
+{
+	return TASK_SIZE_MAX;
+}
+
 static unsigned long stack_maxrandom_size(unsigned long task_size)
 {
 	unsigned long max = 0;
@@ -81,6 +86,8 @@ static unsigned long arch_rnd(unsigned int rndbits)
 
 unsigned long arch_mmap_rnd(void)
 {
+	if (!(current->flags & PF_RANDOMIZE))
+		return 0;
 	return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
 }
 
@@ -114,22 +121,36 @@ static unsigned long mmap_legacy_base(unsigned long rnd,
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
  */
-void arch_pick_mmap_layout(struct mm_struct *mm)
+static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
+		unsigned long random_factor, unsigned long task_size)
 {
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	mm->mmap_legacy_base = mmap_legacy_base(random_factor, TASK_SIZE);
+	*legacy_base = mmap_legacy_base(random_factor, task_size);
+	if (mmap_is_legacy())
+		*base = *legacy_base;
+	else
+		*base = mmap_base(random_factor, task_size);
+}
 
-	if (mmap_is_legacy()) {
-		mm->mmap_base = mm->mmap_legacy_base;
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	if (mmap_is_legacy())
 		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, TASK_SIZE);
+	else
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
+
+	arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
+			arch_rnd(mmap64_rnd_bits), tasksize_64bit());
+
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+	/*
+	 * The mmap syscall mapping base decision depends solely on the
+	 * syscall type (64-bit or compat). This applies for 64bit
+	 * applications and 32bit applications. The 64bit syscall uses
+	 * mmap_base, the compat syscall uses mmap_compat_base.
+	 */
+	arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
+			arch_rnd(mmap32_rnd_bits), tasksize_32bit());
+#endif
 }
 
 const char *arch_vma_name(struct vm_area_struct *vma)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index f60f45fe226f..45cdb27791a3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -367,6 +367,11 @@ struct mm_struct {
 #endif
 	unsigned long mmap_base;		/* base of mmap area */
 	unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */
+#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
+	/* Base adresses for compatible mmap() */
+	unsigned long mmap_compat_base;
+	unsigned long mmap_compat_legacy_base;
+#endif
 	unsigned long task_size;		/* size of task vm space */
 	unsigned long highest_vm_end;		/* highest vma end address */
 	pgd_t * pgd;
-- 
cgit v1.3-6-gb490


From e422267322cd319e2695a535e47c5b1feeac45eb Mon Sep 17 00:00:00 2001
From: Hari Bathini <hbathini@linux.vnet.ibm.com>
Date: Wed, 8 Mar 2017 02:11:36 +0530
Subject: perf: Add PERF_RECORD_NAMESPACES to include namespaces related info

With the advert of container technologies like docker, that depend on
namespaces for isolation, there is a need for tracing support for
namespaces. This patch introduces new PERF_RECORD_NAMESPACES event for
recording namespaces related info. By recording info for every
namespace, it is left to userspace to take a call on the definition of a
container and trace containers by updating perf tool accordingly.

Each namespace has a combination of device and inode numbers. Though
every namespace has the same device number currently, that may change in
future to avoid the need for a namespace of namespaces. Considering such
possibility, record both device and inode numbers separately for each
namespace.

Signed-off-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Aravinda Prasad <aravinda@linux.vnet.ibm.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Sargun Dhillon <sargun@sargun.me>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/148891929686.25309.2827618988917007768.stgit@hbathini.in.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/perf_event.h      |   2 +
 include/uapi/linux/perf_event.h |  32 ++++++++-
 kernel/events/core.c            | 139 ++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                   |   2 +
 kernel/nsproxy.c                |   3 +
 5 files changed, 177 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 000fdb211c7d..f19a82362851 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1112,6 +1112,7 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks
 
 extern void perf_event_exec(void);
 extern void perf_event_comm(struct task_struct *tsk, bool exec);
+extern void perf_event_namespaces(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
 
 /* Callchains */
@@ -1315,6 +1316,7 @@ static inline int perf_unregister_guest_info_callbacks
 static inline void perf_event_mmap(struct vm_area_struct *vma)		{ }
 static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
+static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
 static inline void perf_event_init(void)				{ }
 static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index c66a485a24ac..bec0aad0e15c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -344,7 +344,8 @@ struct perf_event_attr {
 				use_clockid    :  1, /* use @clockid for time fields */
 				context_switch :  1, /* context switch data */
 				write_backward :  1, /* Write ring buffer from end to beginning */
-				__reserved_1   : 36;
+				namespaces     :  1, /* include namespaces data */
+				__reserved_1   : 35;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -610,6 +611,23 @@ struct perf_event_header {
 	__u16	size;
 };
 
+struct perf_ns_link_info {
+	__u64	dev;
+	__u64	ino;
+};
+
+enum {
+	NET_NS_INDEX		= 0,
+	UTS_NS_INDEX		= 1,
+	IPC_NS_INDEX		= 2,
+	PID_NS_INDEX		= 3,
+	USER_NS_INDEX		= 4,
+	MNT_NS_INDEX		= 5,
+	CGROUP_NS_INDEX		= 6,
+
+	NR_NAMESPACES,		/* number of available namespaces */
+};
+
 enum perf_event_type {
 
 	/*
@@ -862,6 +880,18 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_SWITCH_CPU_WIDE		= 15,
 
+	/*
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u32				pid;
+	 *	u32				tid;
+	 *	u64				nr_namespaces;
+	 *	{ u64				dev, inode; } [nr_namespaces];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_NAMESPACES			= 16,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6f41548f2e32..16c877a121c8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -48,6 +48,8 @@
 #include <linux/parser.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/mm.h>
+#include <linux/proc_ns.h>
+#include <linux/mount.h>
 
 #include "internal.h"
 
@@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
+static atomic_t nr_namespaces_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
 static atomic_t nr_switch_events __read_mostly;
@@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event)
 		atomic_dec(&nr_mmap_events);
 	if (event->attr.comm)
 		atomic_dec(&nr_comm_events);
+	if (event->attr.namespaces)
+		atomic_dec(&nr_namespaces_events);
 	if (event->attr.task)
 		atomic_dec(&nr_task_events);
 	if (event->attr.freq)
@@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task,
 void perf_event_fork(struct task_struct *task)
 {
 	perf_event_task(task, NULL, 1);
+	perf_event_namespaces(task);
 }
 
 /*
@@ -6592,6 +6598,132 @@ void perf_event_comm(struct task_struct *task, bool exec)
 	perf_event_comm_event(&comm_event);
 }
 
+/*
+ * namespaces tracking
+ */
+
+struct perf_namespaces_event {
+	struct task_struct		*task;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+		u64				nr_namespaces;
+		struct perf_ns_link_info	link_info[NR_NAMESPACES];
+	} event_id;
+};
+
+static int perf_event_namespaces_match(struct perf_event *event)
+{
+	return event->attr.namespaces;
+}
+
+static void perf_event_namespaces_output(struct perf_event *event,
+					 void *data)
+{
+	struct perf_namespaces_event *namespaces_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	int ret;
+
+	if (!perf_event_namespaces_match(event))
+		return;
+
+	perf_event_header__init_id(&namespaces_event->event_id.header,
+				   &sample, event);
+	ret = perf_output_begin(&handle, event,
+				namespaces_event->event_id.header.size);
+	if (ret)
+		return;
+
+	namespaces_event->event_id.pid = perf_event_pid(event,
+							namespaces_event->task);
+	namespaces_event->event_id.tid = perf_event_tid(event,
+							namespaces_event->task);
+
+	perf_output_put(&handle, namespaces_event->event_id);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
+				   struct task_struct *task,
+				   const struct proc_ns_operations *ns_ops)
+{
+	struct path ns_path;
+	struct inode *ns_inode;
+	void *error;
+
+	error = ns_get_path(&ns_path, task, ns_ops);
+	if (!error) {
+		ns_inode = ns_path.dentry->d_inode;
+		ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
+		ns_link_info->ino = ns_inode->i_ino;
+	}
+}
+
+void perf_event_namespaces(struct task_struct *task)
+{
+	struct perf_namespaces_event namespaces_event;
+	struct perf_ns_link_info *ns_link_info;
+
+	if (!atomic_read(&nr_namespaces_events))
+		return;
+
+	namespaces_event = (struct perf_namespaces_event){
+		.task	= task,
+		.event_id  = {
+			.header = {
+				.type = PERF_RECORD_NAMESPACES,
+				.misc = 0,
+				.size = sizeof(namespaces_event.event_id),
+			},
+			/* .pid */
+			/* .tid */
+			.nr_namespaces = NR_NAMESPACES,
+			/* .link_info[NR_NAMESPACES] */
+		},
+	};
+
+	ns_link_info = namespaces_event.event_id.link_info;
+
+	perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
+			       task, &mntns_operations);
+
+#ifdef CONFIG_USER_NS
+	perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
+			       task, &userns_operations);
+#endif
+#ifdef CONFIG_NET_NS
+	perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
+			       task, &netns_operations);
+#endif
+#ifdef CONFIG_UTS_NS
+	perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
+			       task, &utsns_operations);
+#endif
+#ifdef CONFIG_IPC_NS
+	perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
+			       task, &ipcns_operations);
+#endif
+#ifdef CONFIG_PID_NS
+	perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
+			       task, &pidns_operations);
+#endif
+#ifdef CONFIG_CGROUPS
+	perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
+			       task, &cgroupns_operations);
+#endif
+
+	perf_iterate_sb(perf_event_namespaces_output,
+			&namespaces_event,
+			NULL);
+}
+
 /*
  * mmap tracking
  */
@@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event)
 		atomic_inc(&nr_mmap_events);
 	if (event->attr.comm)
 		atomic_inc(&nr_comm_events);
+	if (event->attr.namespaces)
+		atomic_inc(&nr_namespaces_events);
 	if (event->attr.task)
 		atomic_inc(&nr_task_events);
 	if (event->attr.freq)
@@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EACCES;
 	}
 
+	if (attr.namespaces) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
+
 	if (attr.freq) {
 		if (attr.sample_freq > sysctl_perf_event_sample_rate)
 			return -EINVAL;
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..afa2947286cd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2352,6 +2352,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 		}
 	}
 
+	perf_event_namespaces(current);
+
 bad_unshare_cleanup_cred:
 	if (new_cred)
 		put_cred(new_cred);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e59eed..f6c5d330059a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/syscalls.h>
 #include <linux/cgroup.h>
+#include <linux/perf_event.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 		goto out;
 	}
 	switch_task_namespaces(tsk, new_nsproxy);
+
+	perf_event_namespaces(tsk);
 out:
 	fput(file);
 	return err;
-- 
cgit v1.3-6-gb490


From 1d585e70905e03e8c19c9aaf523ec246ae6b18a1 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Wed, 8 Mar 2017 13:56:06 +0530
Subject: trace/kprobes: Fix check for kretprobe offset within function entry

perf specifies an offset from _text and since this offset is fed
directly into the arch-specific helper, kprobes tracer rejects
installation of kretprobes through perf. Fix this by looking up the
actual offset from a function for the specified sym+offset.

Refactor and reuse existing routines to limit code duplication -- we
repurpose kprobe_addr() for determining final kprobe address and we
split out the function entry offset determination into a separate
generic helper.

Before patch:

  naveen@ubuntu:~/linux/tools/perf$ sudo ./perf probe -v do_open%return
  probe-definition(0): do_open%return
  symbol:do_open file:(null) line:0 offset:0 return:1 lazy:(null)
  0 arguments
  Looking at the vmlinux_path (8 entries long)
  Using /boot/vmlinux for symbols
  Open Debuginfo file: /boot/vmlinux
  Try to find probe point from debuginfo.
  Matched function: do_open [2d0c7ff]
  Probe point found: do_open+0
  Matched function: do_open [35d76dc]
  found inline addr: 0xc0000000004ba9c4
  Failed to find "do_open%return",
   because do_open is an inlined function and has no return point.
  An error occurred in debuginfo analysis (-22).
  Trying to use symbols.
  Opening /sys/kernel/debug/tracing//README write=0
  Opening /sys/kernel/debug/tracing//kprobe_events write=1
  Writing event: r:probe/do_open _text+4469776
  Failed to write event: Invalid argument
    Error: Failed to add events. Reason: Invalid argument (Code: -22)
  naveen@ubuntu:~/linux/tools/perf$ dmesg | tail
  <snip>
  [   33.568656] Given offset is not valid for return probe.

After patch:

  naveen@ubuntu:~/linux/tools/perf$ sudo ./perf probe -v do_open%return
  probe-definition(0): do_open%return
  symbol:do_open file:(null) line:0 offset:0 return:1 lazy:(null)
  0 arguments
  Looking at the vmlinux_path (8 entries long)
  Using /boot/vmlinux for symbols
  Open Debuginfo file: /boot/vmlinux
  Try to find probe point from debuginfo.
  Matched function: do_open [2d0c7d6]
  Probe point found: do_open+0
  Matched function: do_open [35d76b3]
  found inline addr: 0xc0000000004ba9e4
  Failed to find "do_open%return",
   because do_open is an inlined function and has no return point.
  An error occurred in debuginfo analysis (-22).
  Trying to use symbols.
  Opening /sys/kernel/debug/tracing//README write=0
  Opening /sys/kernel/debug/tracing//kprobe_events write=1
  Writing event: r:probe/do_open _text+4469808
  Writing event: r:probe/do_open_1 _text+4956344
  Added new events:
    probe:do_open        (on do_open%return)
    probe:do_open_1      (on do_open%return)

  You can now use it in all perf tools, such as:

	  perf record -e probe:do_open_1 -aR sleep 1

  naveen@ubuntu:~/linux/tools/perf$ sudo cat /sys/kernel/debug/kprobes/list
  c000000000041370  k  kretprobe_trampoline+0x0    [OPTIMIZED]
  c0000000004ba0b8  r  do_open+0x8    [DISABLED]
  c000000000443430  r  do_open+0x0    [DISABLED]

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/d8cd1ef420ec22e3643ac332fdabcffc77319a42.1488961018.git.naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 include/linux/kprobes.h     |  1 +
 kernel/kprobes.c            | 40 ++++++++++++++++++++++++++--------------
 kernel/trace/trace_kprobe.c |  2 +-
 3 files changed, 28 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 177bdf6c6aeb..47e4da5b4fa2 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -268,6 +268,7 @@ extern void show_registers(struct pt_regs *regs);
 extern void kprobes_inc_nmissed_count(struct kprobe *p);
 extern bool arch_within_kprobe_blacklist(unsigned long addr);
 extern bool arch_function_offset_within_entry(unsigned long offset);
+extern bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4780ec236035..d733479a10ee 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1391,21 +1391,19 @@ bool within_kprobe_blacklist(unsigned long addr)
  * This returns encoded errors if it fails to look up symbol or invalid
  * combination of parameters.
  */
-static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
+			const char *symbol_name, unsigned int offset)
 {
-	kprobe_opcode_t *addr = p->addr;
-
-	if ((p->symbol_name && p->addr) ||
-	    (!p->symbol_name && !p->addr))
+	if ((symbol_name && addr) || (!symbol_name && !addr))
 		goto invalid;
 
-	if (p->symbol_name) {
-		kprobe_lookup_name(p->symbol_name, addr);
+	if (symbol_name) {
+		kprobe_lookup_name(symbol_name, addr);
 		if (!addr)
 			return ERR_PTR(-ENOENT);
 	}
 
-	addr = (kprobe_opcode_t *)(((char *)addr) + p->offset);
+	addr = (kprobe_opcode_t *)(((char *)addr) + offset);
 	if (addr)
 		return addr;
 
@@ -1413,6 +1411,11 @@ invalid:
 	return ERR_PTR(-EINVAL);
 }
 
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
+{
+	return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+}
+
 /* Check passed kprobe is valid and return kprobe in kprobe_table. */
 static struct kprobe *__get_valid_kprobe(struct kprobe *p)
 {
@@ -1881,19 +1884,28 @@ bool __weak arch_function_offset_within_entry(unsigned long offset)
 	return !offset;
 }
 
+bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+{
+	kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+
+	if (IS_ERR(kp_addr))
+		return false;
+
+	if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
+						!arch_function_offset_within_entry(offset))
+		return false;
+
+	return true;
+}
+
 int register_kretprobe(struct kretprobe *rp)
 {
 	int ret = 0;
 	struct kretprobe_instance *inst;
 	int i;
 	void *addr;
-	unsigned long offset;
-
-	addr = kprobe_addr(&rp->kp);
-	if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
-		return -EINVAL;
 
-	if (!arch_function_offset_within_entry(offset))
+	if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
 		return -EINVAL;
 
 	if (kretprobe_blacklist_size) {
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 12fb540da0e5..013f4e7146d4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -697,7 +697,7 @@ static int create_trace_kprobe(int argc, char **argv)
 			return ret;
 		}
 		if (offset && is_return &&
-		    !arch_function_offset_within_entry(offset)) {
+		    !function_offset_within_entry(NULL, symbol, offset)) {
 			pr_info("Given offset is not valid for return probe.\n");
 			return -EINVAL;
 		}
-- 
cgit v1.3-6-gb490


From f4c0b0aa58d9b7e30ab0a95e33da84d53b3d764a Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 20 Feb 2017 15:33:50 +0200
Subject: perf/core: Keep AUX flags in the output handle

In preparation for adding more flags to perf AUX records, introduce a
separate API for setting the flags for a session, rather than appending
more bool arguments to perf_aux_output_end. This allows to set each
flag at the time a corresponding condition is detected, instead of
tracking it in each driver's private state.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/20170220133352.17995-3-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/intel/bts.c                      | 16 +++++------
 arch/x86/events/intel/pt.c                       | 17 ++++++------
 arch/x86/events/intel/pt.h                       |  1 -
 drivers/hwtracing/coresight/coresight-etb10.c    |  9 +++----
 drivers/hwtracing/coresight/coresight-etm-perf.c |  9 +++----
 drivers/hwtracing/coresight/coresight-priv.h     |  2 --
 drivers/hwtracing/coresight/coresight-tmc-etf.c  |  7 +++--
 include/linux/coresight.h                        |  2 +-
 include/linux/perf_event.h                       |  8 +++---
 kernel/events/ring_buffer.c                      | 34 ++++++++++++++++--------
 10 files changed, 56 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 982c9e31daca..8ae8c5ce3a1f 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -63,7 +63,6 @@ struct bts_buffer {
 	unsigned int	cur_buf;
 	bool		snapshot;
 	local_t		data_size;
-	local_t		lost;
 	local_t		head;
 	unsigned long	end;
 	void		**data_pages;
@@ -199,7 +198,8 @@ static void bts_update(struct bts_ctx *bts)
 			return;
 
 		if (ds->bts_index >= ds->bts_absolute_maximum)
-			local_inc(&buf->lost);
+			perf_aux_output_flag(&bts->handle,
+			                     PERF_AUX_FLAG_TRUNCATED);
 
 		/*
 		 * old and head are always in the same physical buffer, so we
@@ -276,7 +276,7 @@ static void bts_event_start(struct perf_event *event, int flags)
 	return;
 
 fail_end_stop:
-	perf_aux_output_end(&bts->handle, 0, false);
+	perf_aux_output_end(&bts->handle, 0);
 
 fail_stop:
 	event->hw.state = PERF_HES_STOPPED;
@@ -319,9 +319,8 @@ static void bts_event_stop(struct perf_event *event, int flags)
 				bts->handle.head =
 					local_xchg(&buf->data_size,
 						   buf->nr_pages << PAGE_SHIFT);
-
-			perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-					    !!local_xchg(&buf->lost, 0));
+			perf_aux_output_end(&bts->handle,
+			                    local_xchg(&buf->data_size, 0));
 		}
 
 		cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
@@ -484,8 +483,7 @@ int intel_bts_interrupt(void)
 	if (old_head == local_read(&buf->head))
 		return handled;
 
-	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
-			    !!local_xchg(&buf->lost, 0));
+	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0));
 
 	buf = perf_aux_output_begin(&bts->handle, event);
 	if (buf)
@@ -500,7 +498,7 @@ int intel_bts_interrupt(void)
 			 * cleared handle::event
 			 */
 			barrier();
-			perf_aux_output_end(&bts->handle, 0, false);
+			perf_aux_output_end(&bts->handle, 0);
 		}
 	}
 
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 5900471ee508..0218728be37a 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -753,7 +753,8 @@ static void pt_handle_status(struct pt *pt)
 		 */
 		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
 		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
-			local_inc(&buf->lost);
+			perf_aux_output_flag(&pt->handle,
+			                     PERF_AUX_FLAG_TRUNCATED);
 			advance++;
 		}
 	}
@@ -846,8 +847,10 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
 
 	/* can't stop in the middle of an output region */
 	if (buf->output_off + handle->size + 1 <
-	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
 		return -EINVAL;
+	}
 
 
 	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
@@ -1192,8 +1195,7 @@ void intel_pt_interrupt(void)
 
 	pt_update_head(pt);
 
-	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-			    local_xchg(&buf->lost, 0));
+	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
 
 	if (!event->hw.state) {
 		int ret;
@@ -1208,7 +1210,7 @@ void intel_pt_interrupt(void)
 		/* snapshot counters don't use PMI, so it's safe */
 		ret = pt_buffer_reset_markers(buf, &pt->handle);
 		if (ret) {
-			perf_aux_output_end(&pt->handle, 0, true);
+			perf_aux_output_end(&pt->handle, 0);
 			return;
 		}
 
@@ -1280,7 +1282,7 @@ static void pt_event_start(struct perf_event *event, int mode)
 	return;
 
 fail_end_stop:
-	perf_aux_output_end(&pt->handle, 0, true);
+	perf_aux_output_end(&pt->handle, 0);
 fail_stop:
 	hwc->state = PERF_HES_STOPPED;
 }
@@ -1321,8 +1323,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
 			pt->handle.head =
 				local_xchg(&buf->data_size,
 					   buf->nr_pages << PAGE_SHIFT);
-		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
-				    local_xchg(&buf->lost, 0));
+		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0));
 	}
 }
 
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index 53473c21b554..b528e8f373e4 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -143,7 +143,6 @@ struct pt_buffer {
 	size_t			output_off;
 	unsigned long		nr_pages;
 	local_t			data_size;
-	local_t			lost;
 	local64_t		head;
 	bool			snapshot;
 	unsigned long		stop_pos, intr_pos;
diff --git a/drivers/hwtracing/coresight/coresight-etb10.c b/drivers/hwtracing/coresight/coresight-etb10.c
index d7325c6534ad..979ea6ec7902 100644
--- a/drivers/hwtracing/coresight/coresight-etb10.c
+++ b/drivers/hwtracing/coresight/coresight-etb10.c
@@ -321,7 +321,7 @@ static int etb_set_buffer(struct coresight_device *csdev,
 
 static unsigned long etb_reset_buffer(struct coresight_device *csdev,
 				      struct perf_output_handle *handle,
-				      void *sink_config, bool *lost)
+				      void *sink_config)
 {
 	unsigned long size = 0;
 	struct cs_buffers *buf = sink_config;
@@ -343,7 +343,6 @@ static unsigned long etb_reset_buffer(struct coresight_device *csdev,
 		 * resetting parameters here and squaring off with the ring
 		 * buffer API in the tracer PMU is fine.
 		 */
-		*lost = !!local_xchg(&buf->lost, 0);
 		size = local_xchg(&buf->data_size, 0);
 	}
 
@@ -385,7 +384,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
 			(unsigned long)write_ptr);
 
 		write_ptr &= ~(ETB_FRAME_SIZE_WORDS - 1);
-		local_inc(&buf->lost);
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
 	}
 
 	/*
@@ -396,7 +395,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
 	 */
 	status = readl_relaxed(drvdata->base + ETB_STATUS_REG);
 	if (status & ETB_STATUS_RAM_FULL) {
-		local_inc(&buf->lost);
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
 		to_read = capacity;
 		read_ptr = write_ptr;
 	} else {
@@ -429,7 +428,7 @@ static void etb_update_buffer(struct coresight_device *csdev,
 		if (read_ptr > (drvdata->buffer_depth - 1))
 			read_ptr -= drvdata->buffer_depth;
 		/* let the decoder know we've skipped ahead */
-		local_inc(&buf->lost);
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
 	}
 
 	/* finally tell HW where we want to start reading from */
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 26cfac3e6de7..288a423c1b27 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -302,7 +302,8 @@ out:
 	return;
 
 fail_end_stop:
-	perf_aux_output_end(handle, 0, true);
+	perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
+	perf_aux_output_end(handle, 0);
 fail:
 	event->hw.state = PERF_HES_STOPPED;
 	goto out;
@@ -310,7 +311,6 @@ fail:
 
 static void etm_event_stop(struct perf_event *event, int mode)
 {
-	bool lost;
 	int cpu = smp_processor_id();
 	unsigned long size;
 	struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu);
@@ -348,10 +348,9 @@ static void etm_event_stop(struct perf_event *event, int mode)
 			return;
 
 		size = sink_ops(sink)->reset_buffer(sink, handle,
-						    event_data->snk_config,
-						    &lost);
+						    event_data->snk_config);
 
-		perf_aux_output_end(handle, size, lost);
+		perf_aux_output_end(handle, size);
 	}
 
 	/* Disabling the path make its elements available to other sessions */
diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h
index ef9d8e93e3b2..5f662d82052c 100644
--- a/drivers/hwtracing/coresight/coresight-priv.h
+++ b/drivers/hwtracing/coresight/coresight-priv.h
@@ -76,7 +76,6 @@ enum cs_mode {
  * @nr_pages:	max number of pages granted to us
  * @offset:	offset within the current buffer
  * @data_size:	how much we collected in this run
- * @lost:	other than zero if we had a HW buffer wrap around
  * @snapshot:	is this run in snapshot mode
  * @data_pages:	a handle the ring buffer
  */
@@ -85,7 +84,6 @@ struct cs_buffers {
 	unsigned int		nr_pages;
 	unsigned long		offset;
 	local_t			data_size;
-	local_t			lost;
 	bool			snapshot;
 	void			**data_pages;
 };
diff --git a/drivers/hwtracing/coresight/coresight-tmc-etf.c b/drivers/hwtracing/coresight/coresight-tmc-etf.c
index 1549436e2492..aec61a6d5c63 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etf.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etf.c
@@ -329,7 +329,7 @@ static int tmc_set_etf_buffer(struct coresight_device *csdev,
 
 static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev,
 					  struct perf_output_handle *handle,
-					  void *sink_config, bool *lost)
+					  void *sink_config)
 {
 	long size = 0;
 	struct cs_buffers *buf = sink_config;
@@ -350,7 +350,6 @@ static unsigned long tmc_reset_etf_buffer(struct coresight_device *csdev,
 		 * resetting parameters here and squaring off with the ring
 		 * buffer API in the tracer PMU is fine.
 		 */
-		*lost = !!local_xchg(&buf->lost, 0);
 		size = local_xchg(&buf->data_size, 0);
 	}
 
@@ -389,7 +388,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev,
 	 */
 	status = readl_relaxed(drvdata->base + TMC_STS);
 	if (status & TMC_STS_FULL) {
-		local_inc(&buf->lost);
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
 		to_read = drvdata->size;
 	} else {
 		to_read = CIRC_CNT(write_ptr, read_ptr, drvdata->size);
@@ -434,7 +433,7 @@ static void tmc_update_etf_buffer(struct coresight_device *csdev,
 			read_ptr -= drvdata->size;
 		/* Tell the HW */
 		writel_relaxed(read_ptr, drvdata->base + TMC_RRP);
-		local_inc(&buf->lost);
+		perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED);
 	}
 
 	cur = buf->cur;
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 2a5982c37dfb..035c16c9a505 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -201,7 +201,7 @@ struct coresight_ops_sink {
 			  void *sink_config);
 	unsigned long (*reset_buffer)(struct coresight_device *csdev,
 				      struct perf_output_handle *handle,
-				      void *sink_config, bool *lost);
+				      void *sink_config);
 	void (*update_buffer)(struct coresight_device *csdev,
 			      struct perf_output_handle *handle,
 			      void *sink_config);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f19a82362851..b6e75c9d4791 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -801,6 +801,7 @@ struct perf_output_handle {
 	struct ring_buffer		*rb;
 	unsigned long			wakeup;
 	unsigned long			size;
+	u64				aux_flags;
 	union {
 		void			*addr;
 		unsigned long		head;
@@ -849,10 +850,11 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
 extern void *perf_aux_output_begin(struct perf_output_handle *handle,
 				   struct perf_event *event);
 extern void perf_aux_output_end(struct perf_output_handle *handle,
-				unsigned long size, bool truncated);
+				unsigned long size);
 extern int perf_aux_output_skip(struct perf_output_handle *handle,
 				unsigned long size);
 extern void *perf_get_aux(struct perf_output_handle *handle);
+extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
 
 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
@@ -1268,8 +1270,8 @@ static inline void *
 perf_aux_output_begin(struct perf_output_handle *handle,
 		      struct perf_event *event)				{ return NULL; }
 static inline void
-perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
-		    bool truncated)					{ }
+perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
+									{ }
 static inline int
 perf_aux_output_skip(struct perf_output_handle *handle,
 		     unsigned long size)				{ return -EINVAL; }
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 257fa460b846..9654e55c38d6 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 		rb->paused = 1;
 }
 
+void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags)
+{
+	/*
+	 * OVERWRITE is determined by perf_aux_output_end() and can't
+	 * be passed in directly.
+	 */
+	if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE))
+		return;
+
+	handle->aux_flags |= flags;
+}
+EXPORT_SYMBOL_GPL(perf_aux_output_flag);
+
 /*
  * This is called before hardware starts writing to the AUX area to
  * obtain an output handle and make sure there's room in the buffer.
@@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	handle->event = event;
 	handle->head = aux_head;
 	handle->size = 0;
+	handle->aux_flags = 0;
 
 	/*
 	 * In overwrite mode, AUX data stores do not depend on aux_tail,
@@ -408,34 +422,32 @@ err:
  * of the AUX buffer management code is that after pmu::stop(), the AUX
  * transaction must be stopped and therefore drop the AUX reference count.
  */
-void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
-			 bool truncated)
+void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 {
 	struct ring_buffer *rb = handle->rb;
-	bool wakeup = truncated;
+	bool wakeup = !!handle->aux_flags;
 	unsigned long aux_head;
-	u64 flags = 0;
-
-	if (truncated)
-		flags |= PERF_AUX_FLAG_TRUNCATED;
 
 	/* in overwrite mode, driver provides aux_head via handle */
 	if (rb->aux_overwrite) {
-		flags |= PERF_AUX_FLAG_OVERWRITE;
+		handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
 
 		aux_head = handle->head;
 		local_set(&rb->aux_head, aux_head);
 	} else {
+		handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
+
 		aux_head = local_read(&rb->aux_head);
 		local_add(size, &rb->aux_head);
 	}
 
-	if (size || flags) {
+	if (size || handle->aux_flags) {
 		/*
 		 * Only send RECORD_AUX if we have something useful to communicate
 		 */
 
-		perf_event_aux_event(handle->event, aux_head, size, flags);
+		perf_event_aux_event(handle->event, aux_head, size,
+		                     handle->aux_flags);
 	}
 
 	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
@@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 	}
 
 	if (wakeup) {
-		if (truncated)
+		if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
 			handle->event->pending_disable = 1;
 		perf_output_wakeup(handle);
 	}
-- 
cgit v1.3-6-gb490


From 6419c4af777a773a45a1b1af735de0fcd9a7dcc7 Mon Sep 17 00:00:00 2001
From: "J. R. Okajima" <hooanon05g@gmail.com>
Date: Fri, 3 Feb 2017 01:38:17 +0900
Subject: locking/lockdep: Add new check to lock_downgrade()

Commit:

  f8319483f57f ("locking/lockdep: Provide a type check for lock_is_held")

didn't fully cover rwsems as downgrade_write() was left out.

Introduce lock_downgrade() and use it to add new checks.

See-also: http://marc.info/?l=linux-kernel&m=148581164003149&w=2
Originally-written-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: J. R. Okajima <hooanon05g@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1486053497-9948-3-git-send-email-hooanon05g@gmail.com
[ Rewrote the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h  |  3 +++
 kernel/locking/lockdep.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/locking/rwsem.c   |  6 ++----
 3 files changed, 60 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 1e327bb80838..fffe49f188e6 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -361,6 +361,8 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
 	lock_set_class(lock, lock->name, lock->key, subclass, ip);
 }
 
+extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);
+
 extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
 extern void lockdep_clear_current_reclaim_state(void);
 extern void lockdep_trace_alloc(gfp_t mask);
@@ -411,6 +413,7 @@ static inline void lockdep_on(void)
 
 # define lock_acquire(l, s, t, r, c, n, i)	do { } while (0)
 # define lock_release(l, n, i)			do { } while (0)
+# define lock_downgrade(l, i)			do { } while (0)
 # define lock_set_class(l, n, k, s, i)		do { } while (0)
 # define lock_set_subclass(l, s, i)		do { } while (0)
 # define lockdep_set_current_reclaim_state(g)	do { } while (0)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index da795480e0aa..b1a1cefb8244 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3533,6 +3533,44 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
 	return 1;
 }
 
+static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock;
+	unsigned int depth;
+	int i;
+
+	depth = curr->lockdep_depth;
+	/*
+	 * This function is about (re)setting the class of a held lock,
+	 * yet we're not actually holding any locks. Naughty user!
+	 */
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	hlock = find_held_lock(curr, lock, depth, &i);
+	if (!hlock)
+		return print_unlock_imbalance_bug(curr, lock, ip);
+
+	curr->lockdep_depth = i;
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	WARN(hlock->read, "downgrading a read lock");
+	hlock->read = 1;
+	hlock->acquire_ip = ip;
+
+	if (reacquire_held_locks(curr, depth, i))
+		return 0;
+
+	/*
+	 * I took it apart and put it back together again, except now I have
+	 * these 'spare' parts.. where shall I put them.
+	 */
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+		return 0;
+	return 1;
+}
+
 /*
  * Remove the lock to the list of currently held locks - this gets
  * called on mutex_unlock()/spin_unlock*() (or on a failed
@@ -3759,6 +3797,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
 }
 EXPORT_SYMBOL_GPL(lock_set_class);
 
+void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	current->lockdep_recursion = 1;
+	check_flags(flags);
+	if (__lock_downgrade(lock, ip))
+		check_chain_key(current);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_downgrade);
+
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 90a74ccd85a4..4d48b1c4870d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write);
  */
 void downgrade_write(struct rw_semaphore *sem)
 {
-	/*
-	 * lockdep: a downgraded write will live on as a write
-	 * dependency.
-	 */
+	lock_downgrade(&sem->dep_map, _RET_IP_);
+
 	rwsem_set_reader_owned(sem);
 	__downgrade_write(sem);
 }
-- 
cgit v1.3-6-gb490


From 383776fa7527745224446337f2dcfb0f0d1b8b56 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 27 Feb 2017 15:37:36 +0100
Subject: locking/lockdep: Handle statically initialized PER_CPU locks properly

If a PER_CPU struct which contains a spin_lock is statically initialized
via:

DEFINE_PER_CPU(struct foo, bla) = {
	.lock = __SPIN_LOCK_UNLOCKED(bla.lock)
};

then lockdep assigns a seperate key to each lock because the logic for
assigning a key to statically initialized locks is to use the address as
the key. With per CPU locks the address is obvioulsy different on each CPU.

That's wrong, because all locks should have the same key.

To solve this the following modifications are required:

 1) Extend the is_kernel/module_percpu_addr() functions to hand back the
    canonical address of the per CPU address, i.e. the per CPU address
    minus the per CPU offset.

 2) Check the lock address with these functions and if the per CPU check
    matches use the returned canonical address as the lock key, so all per
    CPU locks have the same key.

 3) Move the static_obj(key) check into look_up_lock_class() so this check
    can be avoided for statically initialized per CPU locks.  That's
    required because the canonical address fails the static_obj(key) check
    for obvious reasons.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[ Merged Dan's fixups for !MODULES and !SMP into this patch. ]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dan Murphy <dmurphy@ti.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20170227143736.pectaimkjkan5kow@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/module.h   |  6 ++++++
 include/linux/percpu.h   |  1 +
 kernel/locking/lockdep.c | 33 +++++++++++++++++++++++----------
 kernel/module.c          | 36 ++++++++++++++++++++++++------------
 mm/percpu.c              | 37 +++++++++++++++++++++++--------------
 5 files changed, 77 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 0297c5cd7cdf..9ad68561d8c2 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -493,6 +493,7 @@ static inline int module_is_live(struct module *mod)
 struct module *__module_text_address(unsigned long addr);
 struct module *__module_address(unsigned long addr);
 bool is_module_address(unsigned long addr);
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr);
 bool is_module_percpu_address(unsigned long addr);
 bool is_module_text_address(unsigned long addr);
 
@@ -660,6 +661,11 @@ static inline bool is_module_percpu_address(unsigned long addr)
 	return false;
 }
 
+static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+	return false;
+}
+
 static inline bool is_module_text_address(unsigned long addr)
 {
 	return false;
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 56939d3f6e53..491b3f5a5f8a 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -110,6 +110,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
 #endif
 
 extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
+extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
 extern bool is_kernel_percpu_address(unsigned long addr);
 
 #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b1a1cefb8244..98dd6231d43b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -660,6 +660,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	struct lockdep_subclass_key *key;
 	struct hlist_head *hash_head;
 	struct lock_class *class;
+	bool is_static = false;
 
 	if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
 		debug_locks_off();
@@ -673,10 +674,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 
 	/*
 	 * Static locks do not have their class-keys yet - for them the key
-	 * is the lock object itself:
+	 * is the lock object itself. If the lock is in the per cpu area,
+	 * the canonical address of the lock (per cpu offset removed) is
+	 * used.
 	 */
-	if (unlikely(!lock->key))
-		lock->key = (void *)lock;
+	if (unlikely(!lock->key)) {
+		unsigned long can_addr, addr = (unsigned long)lock;
+
+		if (__is_kernel_percpu_address(addr, &can_addr))
+			lock->key = (void *)can_addr;
+		else if (__is_module_percpu_address(addr, &can_addr))
+			lock->key = (void *)can_addr;
+		else if (static_obj(lock))
+			lock->key = (void *)lock;
+		else
+			return ERR_PTR(-EINVAL);
+		is_static = true;
+	}
 
 	/*
 	 * NOTE: the class-key must be unique. For dynamic locks, a static
@@ -708,7 +722,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 		}
 	}
 
-	return NULL;
+	return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
 }
 
 /*
@@ -726,19 +740,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
 
 	class = look_up_lock_class(lock, subclass);
-	if (likely(class))
+	if (likely(!IS_ERR_OR_NULL(class)))
 		goto out_set_class_cache;
 
 	/*
 	 * Debug-check: all keys must be persistent!
- 	 */
-	if (!static_obj(lock->key)) {
+	 */
+	if (IS_ERR(class)) {
 		debug_locks_off();
 		printk("INFO: trying to register non-static key.\n");
 		printk("the code is fine but needs lockdep annotation.\n");
 		printk("turning off the locking correctness validator.\n");
 		dump_stack();
-
 		return NULL;
 	}
 
@@ -3419,7 +3432,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
 		 * Clearly if the lock hasn't been acquired _ever_, we're not
 		 * holding it either, so report failure.
 		 */
-		if (!class)
+		if (IS_ERR_OR_NULL(class))
 			return 0;
 
 		/*
@@ -4225,7 +4238,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
 		 * If the class exists we look it up and zap it:
 		 */
 		class = look_up_lock_class(lock, j);
-		if (class)
+		if (!IS_ERR_OR_NULL(class))
 			zap_class(class);
 	}
 	/*
diff --git a/kernel/module.c b/kernel/module.c
index 7eba6dea4f41..5ef618133849 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -665,16 +665,7 @@ static void percpu_modcopy(struct module *mod,
 		memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
 }
 
-/**
- * is_module_percpu_address - test whether address is from module static percpu
- * @addr: address to test
- *
- * Test whether @addr belongs to module static percpu area.
- *
- * RETURNS:
- * %true if @addr is from module static percpu area
- */
-bool is_module_percpu_address(unsigned long addr)
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
 {
 	struct module *mod;
 	unsigned int cpu;
@@ -688,9 +679,11 @@ bool is_module_percpu_address(unsigned long addr)
 			continue;
 		for_each_possible_cpu(cpu) {
 			void *start = per_cpu_ptr(mod->percpu, cpu);
+			void *va = (void *)addr;
 
-			if ((void *)addr >= start &&
-			    (void *)addr < start + mod->percpu_size) {
+			if (va >= start && va < start + mod->percpu_size) {
+				if (can_addr)
+					*can_addr = (unsigned long) (va - start);
 				preempt_enable();
 				return true;
 			}
@@ -701,6 +694,20 @@ bool is_module_percpu_address(unsigned long addr)
 	return false;
 }
 
+/**
+ * is_module_percpu_address - test whether address is from module static percpu
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to module static percpu area.
+ *
+ * RETURNS:
+ * %true if @addr is from module static percpu area
+ */
+bool is_module_percpu_address(unsigned long addr)
+{
+	return __is_module_percpu_address(addr, NULL);
+}
+
 #else /* ... !CONFIG_SMP */
 
 static inline void __percpu *mod_percpu(struct module *mod)
@@ -732,6 +739,11 @@ bool is_module_percpu_address(unsigned long addr)
 	return false;
 }
 
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+	return false;
+}
+
 #endif /* CONFIG_SMP */
 
 #define MODINFO_ATTR(field)	\
diff --git a/mm/percpu.c b/mm/percpu.c
index 5696039b5c07..7d3b728c0254 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1281,18 +1281,7 @@ void free_percpu(void __percpu *ptr)
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
-/**
- * is_kernel_percpu_address - test whether address is from static percpu area
- * @addr: address to test
- *
- * Test whether @addr belongs to in-kernel static percpu area.  Module
- * static percpu areas are not considered.  For those, use
- * is_module_percpu_address().
- *
- * RETURNS:
- * %true if @addr is from in-kernel static percpu area, %false otherwise.
- */
-bool is_kernel_percpu_address(unsigned long addr)
+bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
 {
 #ifdef CONFIG_SMP
 	const size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -1301,15 +1290,35 @@ bool is_kernel_percpu_address(unsigned long addr)
 
 	for_each_possible_cpu(cpu) {
 		void *start = per_cpu_ptr(base, cpu);
+		void *va = (void *)addr;
 
-		if ((void *)addr >= start && (void *)addr < start + static_size)
+		if (va >= start && va < start + static_size) {
+			if (can_addr)
+				*can_addr = (unsigned long) (va - start);
 			return true;
-        }
+		}
+	}
 #endif
 	/* on UP, can't distinguish from other static vars, always false */
 	return false;
 }
 
+/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area.  Module
+ * static percpu areas are not considered.  For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+	return __is_kernel_percpu_address(addr, NULL);
+}
+
 /**
  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
  * @addr: the address to be converted to physical address
-- 
cgit v1.3-6-gb490


From 016da20148a1692e34d35d1f1787400a2a2d2c58 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Thu, 16 Mar 2017 18:08:13 -0700
Subject: hrtimer: Remove hrtimer_peek_ahead_timers() leftovers

This function was removed in commit c6eb3f70d448 (hrtimer: Get rid of
hrtimer softirq, 2015-04-14) but the prototype wasn't ever deleted.

Delete it now.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Link: http://lkml.kernel.org/r/20170317010814.2591-1-sboyd@codeaurora.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 4 ----
 kernel/time/hrtimer.c   | 5 +----
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 249e579ecd4c..23d58fcd4d9a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -276,8 +276,6 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 	return timer->base->cpu_base->hres_active;
 }
 
-extern void hrtimer_peek_ahead_timers(void);
-
 /*
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
@@ -300,8 +298,6 @@ extern unsigned int hrtimer_resolution;
 
 #define hrtimer_resolution	(unsigned int)LOW_RES_NSEC
 
-static inline void hrtimer_peek_ahead_timers(void) { }
-
 static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
 	return 0;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index ec08f527d7ee..1ef82cdb61ff 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1368,10 +1368,7 @@ retry:
 		    ktime_to_ns(delta));
 }
 
-/*
- * local version of hrtimer_peek_ahead_timers() called with interrupts
- * disabled.
- */
+/* called with interrupts disabled */
 static inline void __hrtimer_peek_ahead_timers(void)
 {
 	struct tick_device *td;
-- 
cgit v1.3-6-gb490


From b59f65fa076a8eac2ff3a8ab7f8e1705b9fa86cb Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Thu, 16 Mar 2017 18:26:53 +0300
Subject: mm/gup: Implement the dev_pagemap() logic in the generic
 get_user_pages_fast() function

This is a preparation patch for the transition of x86 to the generic GUP_fast()
implementation.

Prepare generic GUP_fast() to handle dev_pagemap(). At the moment, it's
only implemented on x86. On non-x86, the new code will be compiled out.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar K . V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dann Frazier <dann.frazier@canonical.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arch@vger.kernel.org
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170316152655.37789-6-kirill.shutemov@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mm.h |  4 +++
 mm/gup.c           | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 89 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5f01c88f0800..e197d3ca3e8a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -430,6 +430,10 @@ static inline int pud_devmap(pud_t pud)
 {
 	return 0;
 }
+static inline int pgd_devmap(pgd_t pgd)
+{
+	return 0;
+}
 #endif
 
 /*
diff --git a/mm/gup.c b/mm/gup.c
index 2b6cd3573457..e3d1e80424f4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1200,12 +1200,23 @@ static inline pte_t gup_get_pte(pte_t *ptep)
 }
 #endif
 
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+	while ((*nr) - nr_start) {
+		struct page *page = pages[--(*nr)];
+
+		ClearPageReferenced(page);
+		put_page(page);
+	}
+}
+
 #ifdef __HAVE_ARCH_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			 int write, struct page **pages, int *nr)
 {
+	struct dev_pagemap *pgmap = NULL;
+	int nr_start = *nr, ret = 0;
 	pte_t *ptep, *ptem;
-	int ret = 0;
 
 	ptem = ptep = pte_offset_map(&pmd, addr);
 	do {
@@ -1222,7 +1233,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 		if (!pte_access_permitted(pte, write))
 			goto pte_unmap;
 
-		if (pte_special(pte))
+		if (pte_devmap(pte)) {
+			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+			if (unlikely(!pgmap)) {
+				undo_dev_pagemap(nr, nr_start, pages);
+				goto pte_unmap;
+			}
+		} else if (pte_special(pte))
 			goto pte_unmap;
 
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1239,6 +1256,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 
 		VM_BUG_ON_PAGE(compound_head(page) != head, page);
 
+		put_dev_pagemap(pgmap);
 		SetPageReferenced(page);
 		pages[*nr] = page;
 		(*nr)++;
@@ -1269,6 +1287,64 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 }
 #endif /* __HAVE_ARCH_PTE_SPECIAL */
 
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static int __gup_device_huge(unsigned long pfn, unsigned long addr,
+		unsigned long end, struct page **pages, int *nr)
+{
+	int nr_start = *nr;
+	struct dev_pagemap *pgmap = NULL;
+
+	do {
+		struct page *page = pfn_to_page(pfn);
+
+		pgmap = get_dev_pagemap(pfn, pgmap);
+		if (unlikely(!pgmap)) {
+			undo_dev_pagemap(nr, nr_start, pages);
+			return 0;
+		}
+		SetPageReferenced(page);
+		pages[*nr] = page;
+		get_page(page);
+		put_dev_pagemap(pgmap);
+		(*nr)++;
+		pfn++;
+	} while (addr += PAGE_SIZE, addr != end);
+	return 1;
+}
+
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+		unsigned long end, struct page **pages, int *nr)
+{
+	unsigned long fault_pfn;
+
+	fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+		unsigned long end, struct page **pages, int *nr)
+{
+	unsigned long fault_pfn;
+
+	fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+#else
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+		unsigned long end, struct page **pages, int *nr)
+{
+	BUILD_BUG();
+	return 0;
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+		unsigned long end, struct page **pages, int *nr)
+{
+	BUILD_BUG();
+	return 0;
+}
+#endif
+
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		unsigned long end, int write, struct page **pages, int *nr)
 {
@@ -1278,6 +1354,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 	if (!pmd_access_permitted(orig, write))
 		return 0;
 
+	if (pmd_devmap(orig))
+		return __gup_device_huge_pmd(orig, addr, end, pages, nr);
+
 	refs = 0;
 	head = pmd_page(orig);
 	page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1314,6 +1393,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 	if (!pud_access_permitted(orig, write))
 		return 0;
 
+	if (pud_devmap(orig))
+		return __gup_device_huge_pud(orig, addr, end, pages, nr);
+
 	refs = 0;
 	head = pud_page(orig);
 	page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1351,6 +1433,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 	if (!pgd_access_permitted(orig, write))
 		return 0;
 
+	BUILD_BUG_ON(pgd_devmap(orig));
 	refs = 0;
 	head = pgd_page(orig);
 	page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
-- 
cgit v1.3-6-gb490


From 79170fda313ed5be2394f87aa2a00d597f8ed4a1 Mon Sep 17 00:00:00 2001
From: Kyle Huey <me@kylehuey.com>
Date: Mon, 20 Mar 2017 01:16:24 -0700
Subject: x86/syscalls/32: Wire up arch_prctl on x86-32
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Hook up arch_prctl to call do_arch_prctl() on x86-32, and in 32 bit compat
mode on x86-64. This allows to have arch_prctls that are not specific to 64
bits.

On UML, simply stub out this syscall.

Signed-off-by: Kyle Huey <khuey@kylehuey.com>
Cc: Grzegorz Andrejczuk <grzegorz.andrejczuk@intel.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Robert O'Callahan <robert@ocallahan.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: user-mode-linux-devel@lists.sourceforge.net
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: user-mode-linux-user@lists.sourceforge.net
Cc: David Matlack <dmatlack@google.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Link: http://lkml.kernel.org/r/20170320081628.18952-7-khuey@kylehuey.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/entry/syscalls/syscall_32.tbl | 1 +
 arch/x86/kernel/process_32.c           | 7 +++++++
 arch/x86/kernel/process_64.c           | 7 +++++++
 arch/x86/um/Makefile                   | 2 +-
 arch/x86/um/syscalls_32.c              | 7 +++++++
 include/linux/compat.h                 | 2 ++
 6 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/um/syscalls_32.c

(limited to 'include/linux')

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 9ba050fe47f3..0af59fa789ea 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -390,3 +390,4 @@
 381	i386	pkey_alloc		sys_pkey_alloc
 382	i386	pkey_free		sys_pkey_free
 383	i386	statx			sys_statx
+384	i386	arch_prctl		sys_arch_prctl			compat_sys_arch_prctl
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4c818f8bc135..ff40e74c9181 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
 #include <asm/ldt.h>
@@ -56,6 +57,7 @@
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
 #include <asm/intel_rdt.h>
+#include <asm/proto.h>
 
 void __show_regs(struct pt_regs *regs, int all)
 {
@@ -304,3 +306,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	return prev_p;
 }
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+	return do_arch_prctl_common(current, option, arg2);
+}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d81b0a60a45c..ea1a6180bf39 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -635,6 +635,13 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 	return ret;
 }
 
+#ifdef CONFIG_IA32_EMULATION
+COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+	return do_arch_prctl_common(current, option, arg2);
+}
+#endif
+
 unsigned long KSTK_ESP(struct task_struct *task)
 {
 	return task_pt_regs(task)->sp;
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index e7e7055a8658..69f0827d5f53 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -16,7 +16,7 @@ obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \
 
 ifeq ($(CONFIG_X86_32),y)
 
-obj-y += checksum_32.o
+obj-y += checksum_32.o syscalls_32.o
 obj-$(CONFIG_ELF_CORE) += elfcore.o
 
 subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c
new file mode 100644
index 000000000000..627d68836b16
--- /dev/null
+++ b/arch/x86/um/syscalls_32.c
@@ -0,0 +1,7 @@
+#include <linux/syscalls.h>
+#include <os.h>
+
+SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
+{
+	return -EINVAL;
+}
diff --git a/include/linux/compat.h b/include/linux/compat.h
index aef47be2a5c1..af9dbc44fd92 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -723,6 +723,8 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid,
 asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
 					    int, const char __user *);
 
+asmlinkage long compat_sys_arch_prctl(int option, unsigned long arg2);
+
 /*
  * For most but not all architectures, "am I in a compat syscall?" and
  * "am I a compat task?" are the same question.  For architectures on which
-- 
cgit v1.3-6-gb490


From e9ea1e7f53b852147cbd568b0568c7ad97ec21a3 Mon Sep 17 00:00:00 2001
From: Kyle Huey <me@kylehuey.com>
Date: Mon, 20 Mar 2017 01:16:26 -0700
Subject: x86/arch_prctl: Add ARCH_[GET|SET]_CPUID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Intel supports faulting on the CPUID instruction beginning with Ivy Bridge.
When enabled, the processor will fault on attempts to execute the CPUID
instruction with CPL>0. Exposing this feature to userspace will allow a
ptracer to trap and emulate the CPUID instruction.

When supported, this feature is controlled by toggling bit 0 of
MSR_MISC_FEATURES_ENABLES. It is documented in detail in Section 2.3.2 of
https://bugzilla.kernel.org/attachment.cgi?id=243991

Implement a new pair of arch_prctls, available on both x86-32 and x86-64.

ARCH_GET_CPUID: Returns the current CPUID state, either 0 if CPUID faulting
    is enabled (and thus the CPUID instruction is not available) or 1 if
    CPUID faulting is not enabled.

ARCH_SET_CPUID: Set the CPUID state to the second argument. If
    cpuid_enabled is 0 CPUID faulting will be activated, otherwise it will
    be deactivated. Returns ENODEV if CPUID faulting is not supported on
    this system.

The state of the CPUID faulting flag is propagated across forks, but reset
upon exec.

Signed-off-by: Kyle Huey <khuey@kylehuey.com>
Cc: Grzegorz Andrejczuk <grzegorz.andrejczuk@intel.com>
Cc: kvm@vger.kernel.org
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: linux-kselftest@vger.kernel.org
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Robert O'Callahan <robert@ocallahan.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Len Brown <len.brown@intel.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: user-mode-linux-devel@lists.sourceforge.net
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: user-mode-linux-user@lists.sourceforge.net
Cc: David Matlack <dmatlack@google.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Dmitry Safonov <dsafonov@virtuozzo.com>
Cc: linux-fsdevel@vger.kernel.org
Cc: Paolo Bonzini <pbonzini@redhat.com>
Link: http://lkml.kernel.org/r/20170320081628.18952-9-khuey@kylehuey.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/include/asm/msr-index.h   |  2 +
 arch/x86/include/asm/processor.h   |  2 +
 arch/x86/include/asm/thread_info.h |  6 ++-
 arch/x86/include/uapi/asm/prctl.h  | 11 ++++--
 arch/x86/kernel/cpu/intel.c        | 18 +++++----
 arch/x86/kernel/process.c          | 78 ++++++++++++++++++++++++++++++++++++++
 fs/exec.c                          |  1 +
 include/linux/thread_info.h        |  4 ++
 8 files changed, 109 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b1f75daca34b..673f9ac50f6d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -558,6 +558,8 @@
 /* MISC_FEATURES_ENABLES non-architectural features */
 #define MSR_MISC_FEATURES_ENABLES	0x00000140
 
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT	0
+#define MSR_MISC_FEATURES_ENABLES_CPUID_FAULT		BIT_ULL(MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT)
 #define MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT	1
 
 #define MSR_IA32_TSC_DEADLINE		0x000006E0
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..a80c1b3997ed 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -884,6 +884,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
 extern int get_tsc_mode(unsigned long adr);
 extern int set_tsc_mode(unsigned int val);
 
+DECLARE_PER_CPU(u64, msr_misc_features_shadow);
+
 /* Register/unregister a process' MPX related resource */
 #define MPX_ENABLE_MANAGEMENT()	mpx_enable_management()
 #define MPX_DISABLE_MANAGEMENT()	mpx_disable_management()
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ad6f5eb07a95..9fc44b95f7cb 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -87,6 +87,7 @@ struct thread_info {
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_UPROBE		12	/* breakpointed or singlestepping */
+#define TIF_NOCPUID		15	/* CPUID is not accessible in userland */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* IA32 compatibility process */
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
@@ -110,6 +111,7 @@ struct thread_info {
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
+#define _TIF_NOCPUID		(1 << TIF_NOCPUID)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
@@ -138,7 +140,7 @@ struct thread_info {
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
-	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
+	(_TIF_IO_BITMAP|_TIF_NOCPUID|_TIF_NOTSC|_TIF_BLOCKSTEP)
 
 #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
@@ -239,6 +241,8 @@ static inline int arch_within_stack_frames(const void * const stack,
 extern void arch_task_cache_init(void);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 extern void arch_release_task_struct(struct task_struct *tsk);
+extern void arch_setup_new_exec(void);
+#define arch_setup_new_exec arch_setup_new_exec
 #endif	/* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 835aa51c7f6e..c45765517092 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -1,10 +1,13 @@
 #ifndef _ASM_X86_PRCTL_H
 #define _ASM_X86_PRCTL_H
 
-#define ARCH_SET_GS 0x1001
-#define ARCH_SET_FS 0x1002
-#define ARCH_GET_FS 0x1003
-#define ARCH_GET_GS 0x1004
+#define ARCH_SET_GS		0x1001
+#define ARCH_SET_FS		0x1002
+#define ARCH_GET_FS		0x1003
+#define ARCH_GET_GS		0x1004
+
+#define ARCH_GET_CPUID		0x1011
+#define ARCH_SET_CPUID		0x1012
 
 #define ARCH_MAP_VDSO_X32	0x2001
 #define ARCH_MAP_VDSO_32	0x2002
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a07f8295c9ed..dfa90a3a5145 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -90,16 +90,12 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c)
 		return;
 	}
 
-	if (ring3mwait_disabled) {
-		msr_clear_bit(MSR_MISC_FEATURES_ENABLES,
-			      MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
+	if (ring3mwait_disabled)
 		return;
-	}
-
-	msr_set_bit(MSR_MISC_FEATURES_ENABLES,
-		    MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
 
 	set_cpu_cap(c, X86_FEATURE_RING3MWAIT);
+	this_cpu_or(msr_misc_features_shadow,
+		    1UL << MSR_MISC_FEATURES_ENABLES_RING3MWAIT_BIT);
 
 	if (c == &boot_cpu_data)
 		ELF_HWCAP2 |= HWCAP2_RING3MWAIT;
@@ -505,9 +501,15 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
 	if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr))
 		return;
 
-	/* Check features and update capabilities */
+	/* Clear all MISC features */
+	this_cpu_write(msr_misc_features_shadow, 0);
+
+	/* Check features and update capabilities and shadow control bits */
 	init_cpuid_fault(c);
 	probe_xeon_phi_r3mwait(c);
+
+	msr = this_cpu_read(msr_misc_features_shadow);
+	wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
 }
 
 static void init_intel(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b12e95eceb83..0bb88428cbf2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -37,6 +37,7 @@
 #include <asm/vm86.h>
 #include <asm/switch_to.h>
 #include <asm/desc.h>
+#include <asm/prctl.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -172,6 +173,73 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 }
 
+DEFINE_PER_CPU(u64, msr_misc_features_shadow);
+
+static void set_cpuid_faulting(bool on)
+{
+	u64 msrval;
+
+	msrval = this_cpu_read(msr_misc_features_shadow);
+	msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
+	msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
+	this_cpu_write(msr_misc_features_shadow, msrval);
+	wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
+}
+
+static void disable_cpuid(void)
+{
+	preempt_disable();
+	if (!test_and_set_thread_flag(TIF_NOCPUID)) {
+		/*
+		 * Must flip the CPU state synchronously with
+		 * TIF_NOCPUID in the current running context.
+		 */
+		set_cpuid_faulting(true);
+	}
+	preempt_enable();
+}
+
+static void enable_cpuid(void)
+{
+	preempt_disable();
+	if (test_and_clear_thread_flag(TIF_NOCPUID)) {
+		/*
+		 * Must flip the CPU state synchronously with
+		 * TIF_NOCPUID in the current running context.
+		 */
+		set_cpuid_faulting(false);
+	}
+	preempt_enable();
+}
+
+static int get_cpuid_mode(void)
+{
+	return !test_thread_flag(TIF_NOCPUID);
+}
+
+static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
+{
+	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+		return -ENODEV;
+
+	if (cpuid_enabled)
+		enable_cpuid();
+	else
+		disable_cpuid();
+
+	return 0;
+}
+
+/*
+ * Called immediately after a successful exec.
+ */
+void arch_setup_new_exec(void)
+{
+	/* If cpuid was previously disabled for this task, re-enable it. */
+	if (test_thread_flag(TIF_NOCPUID))
+		enable_cpuid();
+}
+
 static inline void switch_to_bitmap(struct tss_struct *tss,
 				    struct thread_struct *prev,
 				    struct thread_struct *next,
@@ -225,6 +293,9 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 
 	if ((tifp ^ tifn) & _TIF_NOTSC)
 		cr4_toggle_bits(X86_CR4_TSD);
+
+	if ((tifp ^ tifn) & _TIF_NOCPUID)
+		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));
 }
 
 /*
@@ -549,5 +620,12 @@ out:
 long do_arch_prctl_common(struct task_struct *task, int option,
 			  unsigned long cpuid_enabled)
 {
+	switch (option) {
+	case ARCH_GET_CPUID:
+		return get_cpuid_mode();
+	case ARCH_SET_CPUID:
+		return set_cpuid_mode(task, cpuid_enabled);
+	}
+
 	return -EINVAL;
 }
diff --git a/fs/exec.c b/fs/exec.c
index 65145a3df065..72934df68471 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	else
 		set_dumpable(current->mm, suid_dumpable);
 
+	arch_setup_new_exec();
 	perf_event_exec();
 	__set_task_comm(current, kbasename(bprm->filename), true);
 
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 58373875e8ee..55125d674338 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -101,6 +101,10 @@ static inline void check_object_size(const void *ptr, unsigned long n,
 { }
 #endif /* CONFIG_HARDENED_USERCOPY */
 
+#ifndef arch_setup_new_exec
+static inline void arch_setup_new_exec(void) { }
+#endif
+
 #endif	/* __KERNEL__ */
 
 #endif /* _LINUX_THREAD_INFO_H */
-- 
cgit v1.3-6-gb490


From aeb9dd1de98c1a5f2007ea5d2a154c1244caf8a0 Mon Sep 17 00:00:00 2001
From: Lu Baolu <baolu.lu@linux.intel.com>
Date: Tue, 21 Mar 2017 16:01:30 +0800
Subject: usb/early: Add driver for xhci debug capability

XHCI debug capability (DbC) is an optional but standalone
functionality provided by an xHCI host controller. Software
learns this capability by walking through the extended
capability list of the host. XHCI specification describes
DbC in section 7.6.

This patch introduces the code to probe and initialize the
debug capability hardware during early boot. With hardware
initialized, the debug target (system on which this code is
running) will present a debug device through the debug port
(normally the first USB3 port). The debug device is fully
compliant with the USB framework and provides the equivalent
of a very high performance (USB3) full-duplex serial link
between the debug host and target. The DbC functionality is
independent of the xHCI host. There isn't any precondition
from the xHCI host side for the DbC to work.

One use for this feature is kernel debugging, for example
when your machine crashes very early before the regular
console code is initialized. Other uses include simpler,
lockless logging instead of a full-blown printk console
driver and klogd.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathias Nyman <mathias.nyman@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-usb@vger.kernel.org
Link: http://lkml.kernel.org/r/1490083293-3792-3-git-send-email-baolu.lu@linux.intel.com
[ Small fix to the Kconfig help text. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig.debug        |   27 +-
 drivers/usb/Makefile          |    2 +-
 drivers/usb/early/Makefile    |    1 +
 drivers/usb/early/xhci-dbc.c  | 1014 +++++++++++++++++++++++++++++++++++++++++
 drivers/usb/early/xhci-dbc.h  |  211 +++++++++
 include/linux/usb/xhci-dbgp.h |   29 ++
 6 files changed, 1281 insertions(+), 3 deletions(-)
 create mode 100644 drivers/usb/early/xhci-dbc.c
 create mode 100644 drivers/usb/early/xhci-dbc.h
 create mode 100644 include/linux/usb/xhci-dbgp.h

(limited to 'include/linux')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 63c1d13aaf9f..fcb7604172ce 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,6 +5,9 @@ config TRACE_IRQFLAGS_SUPPORT
 
 source "lib/Kconfig.debug"
 
+config EARLY_PRINTK_USB
+	bool
+
 config X86_VERBOSE_BOOTUP
 	bool "Enable verbose x86 bootup info messages"
 	default y
@@ -23,19 +26,20 @@ config EARLY_PRINTK
 	  This is useful for kernel debugging when your machine crashes very
 	  early before the console code is initialized. For normal operation
 	  it is not recommended because it looks ugly and doesn't cooperate
-	  with klogd/syslogd or the X server. You should normally N here,
+	  with klogd/syslogd or the X server. You should normally say N here,
 	  unless you want to debug such a crash.
 
 config EARLY_PRINTK_DBGP
 	bool "Early printk via EHCI debug port"
 	depends on EARLY_PRINTK && PCI
+	select EARLY_PRINTK_USB
 	---help---
 	  Write kernel log output directly into the EHCI debug port.
 
 	  This is useful for kernel debugging when your machine crashes very
 	  early before the console code is initialized. For normal operation
 	  it is not recommended because it looks ugly and doesn't cooperate
-	  with klogd/syslogd or the X server. You should normally N here,
+	  with klogd/syslogd or the X server. You should normally say N here,
 	  unless you want to debug such a crash. You need usb debug device.
 
 config EARLY_PRINTK_EFI
@@ -48,6 +52,25 @@ config EARLY_PRINTK_EFI
 	  This is useful for kernel debugging when your machine crashes very
 	  early before the console code is initialized.
 
+config EARLY_PRINTK_USB_XDBC
+	bool "Early printk via the xHCI debug port"
+	depends on EARLY_PRINTK && PCI
+	select EARLY_PRINTK_USB
+	---help---
+	  Write kernel log output directly into the xHCI debug port.
+
+	  One use for this feature is kernel debugging, for example when your
+	  machine crashes very early before the regular console code is
+	  initialized. Other uses include simpler, lockless logging instead of
+	  a full-blown printk console driver + klogd.
+
+	  For normal production environments this is normally not recommended,
+	  because it doesn't feed events into klogd/syslogd and doesn't try to
+	  print anything on the screen.
+
+	  You should normally say N here, unless you want to debug early
+	  crashes or need a very simple printk logging facility.
+
 config X86_PTDUMP_CORE
 	def_bool n
 
diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile
index 7791af6c102c..53d1356bbd06 100644
--- a/drivers/usb/Makefile
+++ b/drivers/usb/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_USB_MICROTEK)	+= image/
 obj-$(CONFIG_USB_SERIAL)	+= serial/
 
 obj-$(CONFIG_USB)		+= misc/
-obj-$(CONFIG_EARLY_PRINTK_DBGP)	+= early/
+obj-$(CONFIG_EARLY_PRINTK_USB)	+= early/
 
 obj-$(CONFIG_USB_ATM)		+= atm/
 obj-$(CONFIG_USB_SPEEDTOUCH)	+= atm/
diff --git a/drivers/usb/early/Makefile b/drivers/usb/early/Makefile
index 24bbe519c737..fcde2286da1c 100644
--- a/drivers/usb/early/Makefile
+++ b/drivers/usb/early/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_EARLY_PRINTK_DBGP) += ehci-dbgp.o
+obj-$(CONFIG_EARLY_PRINTK_USB_XDBC) += xhci-dbc.o
diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c
new file mode 100644
index 000000000000..1268818e2263
--- /dev/null
+++ b/drivers/usb/early/xhci-dbc.c
@@ -0,0 +1,1014 @@
+/**
+ * xhci-dbc.c - xHCI debug capability early driver
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ":%s: " fmt, __func__
+
+#include <linux/console.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/bootmem.h>
+#include <linux/io.h>
+#include <asm/pci-direct.h>
+#include <asm/fixmap.h>
+#include <linux/bcd.h>
+#include <linux/export.h>
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+
+#include "../host/xhci.h"
+#include "xhci-dbc.h"
+
+static struct xdbc_state xdbc;
+static bool early_console_keep;
+
+#define XDBC_TRACE
+#ifdef XDBC_TRACE
+#define	xdbc_trace	trace_printk
+#else
+static inline void xdbc_trace(const char *fmt, ...) { }
+#endif /* XDBC_TRACE */
+
+static void __iomem * __init xdbc_map_pci_mmio(u32 bus, u32 dev, u32 func)
+{
+	u64 val64, sz64, mask64;
+	void __iomem *base;
+	u32 val, sz;
+	u8 byte;
+
+	val = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0);
+	write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0, ~0);
+	sz = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0);
+	write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0, val);
+
+	if (val == 0xffffffff || sz == 0xffffffff) {
+		pr_notice("invalid mmio bar\n");
+		return NULL;
+	}
+
+	val64	= val & PCI_BASE_ADDRESS_MEM_MASK;
+	sz64	= sz & PCI_BASE_ADDRESS_MEM_MASK;
+	mask64	= PCI_BASE_ADDRESS_MEM_MASK;
+
+	if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+		val = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4);
+		write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4, ~0);
+		sz = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4);
+		write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4, val);
+
+		val64	|= (u64)val << 32;
+		sz64	|= (u64)sz << 32;
+		mask64	|= ~0ULL << 32;
+	}
+
+	sz64 &= mask64;
+
+	if (!sz64) {
+		pr_notice("invalid mmio address\n");
+		return NULL;
+	}
+
+	sz64 = 1ULL << __ffs64(sz64);
+
+	/* Check if the mem space is enabled: */
+	byte = read_pci_config_byte(bus, dev, func, PCI_COMMAND);
+	if (!(byte & PCI_COMMAND_MEMORY)) {
+		byte |= PCI_COMMAND_MEMORY;
+		write_pci_config_byte(bus, dev, func, PCI_COMMAND, byte);
+	}
+
+	xdbc.xhci_start = val64;
+	xdbc.xhci_length = sz64;
+	base = early_ioremap(val64, sz64);
+
+	return base;
+}
+
+static void * __init xdbc_get_page(dma_addr_t *dma_addr)
+{
+	void *virt;
+
+	virt = alloc_bootmem_pages_nopanic(PAGE_SIZE);
+	if (!virt)
+		return NULL;
+
+	if (dma_addr)
+		*dma_addr = (dma_addr_t)__pa(virt);
+
+	return virt;
+}
+
+static u32 __init xdbc_find_dbgp(int xdbc_num, u32 *b, u32 *d, u32 *f)
+{
+	u32 bus, dev, func, class;
+
+	for (bus = 0; bus < XDBC_PCI_MAX_BUSES; bus++) {
+		for (dev = 0; dev < XDBC_PCI_MAX_DEVICES; dev++) {
+			for (func = 0; func < XDBC_PCI_MAX_FUNCTION; func++) {
+
+				class = read_pci_config(bus, dev, func, PCI_CLASS_REVISION);
+				if ((class >> 8) != PCI_CLASS_SERIAL_USB_XHCI)
+					continue;
+
+				if (xdbc_num-- != 0)
+					continue;
+
+				*b = bus;
+				*d = dev;
+				*f = func;
+
+				return 0;
+			}
+		}
+	}
+
+	return -1;
+}
+
+static int handshake(void __iomem *ptr, u32 mask, u32 done, int wait, int delay)
+{
+	u32 result;
+
+	do {
+		result = readl(ptr);
+		result &= mask;
+		if (result == done)
+			return 0;
+		udelay(delay);
+		wait -= delay;
+	} while (wait > 0);
+
+	return -ETIMEDOUT;
+}
+
+static void __init xdbc_bios_handoff(void)
+{
+	int offset, timeout;
+	u32 val;
+
+	offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_LEGACY);
+	val = readl(xdbc.xhci_base + offset);
+
+	if (val & XHCI_HC_BIOS_OWNED) {
+		writel(val | XHCI_HC_OS_OWNED, xdbc.xhci_base + offset);
+		timeout = handshake(xdbc.xhci_base + offset, XHCI_HC_BIOS_OWNED, 0, 5000, 10);
+
+		if (timeout) {
+			pr_notice("failed to hand over xHCI control from BIOS\n");
+			writel(val & ~XHCI_HC_BIOS_OWNED, xdbc.xhci_base + offset);
+		}
+	}
+
+	/* Disable BIOS SMIs and clear all SMI events: */
+	val = readl(xdbc.xhci_base + offset + XHCI_LEGACY_CONTROL_OFFSET);
+	val &= XHCI_LEGACY_DISABLE_SMI;
+	val |= XHCI_LEGACY_SMI_EVENTS;
+	writel(val, xdbc.xhci_base + offset + XHCI_LEGACY_CONTROL_OFFSET);
+}
+
+static int __init
+xdbc_alloc_ring(struct xdbc_segment *seg, struct xdbc_ring *ring)
+{
+	seg->trbs = xdbc_get_page(&seg->dma);
+	if (!seg->trbs)
+		return -ENOMEM;
+
+	ring->segment = seg;
+
+	return 0;
+}
+
+static void __init xdbc_free_ring(struct xdbc_ring *ring)
+{
+	struct xdbc_segment *seg = ring->segment;
+
+	if (!seg)
+		return;
+
+	free_bootmem(seg->dma, PAGE_SIZE);
+	ring->segment = NULL;
+}
+
+static void xdbc_reset_ring(struct xdbc_ring *ring)
+{
+	struct xdbc_segment *seg = ring->segment;
+	struct xdbc_trb *link_trb;
+
+	memset(seg->trbs, 0, PAGE_SIZE);
+
+	ring->enqueue = seg->trbs;
+	ring->dequeue = seg->trbs;
+	ring->cycle_state = 1;
+
+	if (ring != &xdbc.evt_ring) {
+		link_trb = &seg->trbs[XDBC_TRBS_PER_SEGMENT - 1];
+		link_trb->field[0] = cpu_to_le32(lower_32_bits(seg->dma));
+		link_trb->field[1] = cpu_to_le32(upper_32_bits(seg->dma));
+		link_trb->field[3] = cpu_to_le32(TRB_TYPE(TRB_LINK)) | cpu_to_le32(LINK_TOGGLE);
+	}
+}
+
+static inline void xdbc_put_utf16(u16 *s, const char *c, size_t size)
+{
+	int i;
+
+	for (i = 0; i < size; i++)
+		s[i] = cpu_to_le16(c[i]);
+}
+
+static void xdbc_mem_init(void)
+{
+	struct xdbc_ep_context *ep_in, *ep_out;
+	struct usb_string_descriptor *s_desc;
+	struct xdbc_erst_entry *entry;
+	struct xdbc_strings *strings;
+	struct xdbc_context *ctx;
+	unsigned int max_burst;
+	u32 string_length;
+	int index = 0;
+	u32 dev_info;
+
+	xdbc_reset_ring(&xdbc.evt_ring);
+	xdbc_reset_ring(&xdbc.in_ring);
+	xdbc_reset_ring(&xdbc.out_ring);
+	memset(xdbc.table_base, 0, PAGE_SIZE);
+	memset(xdbc.out_buf, 0, PAGE_SIZE);
+
+	/* Initialize event ring segment table: */
+	xdbc.erst_size	= 16;
+	xdbc.erst_base	= xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE;
+	xdbc.erst_dma	= xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE;
+
+	index += XDBC_ERST_ENTRY_NUM;
+	entry = (struct xdbc_erst_entry *)xdbc.erst_base;
+
+	entry->seg_addr		= cpu_to_le64(xdbc.evt_seg.dma);
+	entry->seg_size		= cpu_to_le32(XDBC_TRBS_PER_SEGMENT);
+	entry->__reserved_0	= 0;
+
+	/* Initialize ERST registers: */
+	writel(1, &xdbc.xdbc_reg->ersts);
+	xdbc_write64(xdbc.erst_dma, &xdbc.xdbc_reg->erstba);
+	xdbc_write64(xdbc.evt_seg.dma, &xdbc.xdbc_reg->erdp);
+
+	/* Debug capability contexts: */
+	xdbc.dbcc_size	= 64 * 3;
+	xdbc.dbcc_base	= xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE;
+	xdbc.dbcc_dma	= xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE;
+
+	index += XDBC_DBCC_ENTRY_NUM;
+
+	/* Popluate the strings: */
+	xdbc.string_size = sizeof(struct xdbc_strings);
+	xdbc.string_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE;
+	xdbc.string_dma	 = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE;
+	strings		 = (struct xdbc_strings *)xdbc.string_base;
+
+	index += XDBC_STRING_ENTRY_NUM;
+
+	/* Serial string: */
+	s_desc			= (struct usb_string_descriptor *)strings->serial;
+	s_desc->bLength		= (strlen(XDBC_STRING_SERIAL) + 1) * 2;
+	s_desc->bDescriptorType	= USB_DT_STRING;
+
+	xdbc_put_utf16(s_desc->wData, XDBC_STRING_SERIAL, strlen(XDBC_STRING_SERIAL));
+	string_length = s_desc->bLength;
+	string_length <<= 8;
+
+	/* Product string: */
+	s_desc			= (struct usb_string_descriptor *)strings->product;
+	s_desc->bLength		= (strlen(XDBC_STRING_PRODUCT) + 1) * 2;
+	s_desc->bDescriptorType	= USB_DT_STRING;
+
+	xdbc_put_utf16(s_desc->wData, XDBC_STRING_PRODUCT, strlen(XDBC_STRING_PRODUCT));
+	string_length += s_desc->bLength;
+	string_length <<= 8;
+
+	/* Manufacture string: */
+	s_desc			= (struct usb_string_descriptor *)strings->manufacturer;
+	s_desc->bLength		= (strlen(XDBC_STRING_MANUFACTURER) + 1) * 2;
+	s_desc->bDescriptorType	= USB_DT_STRING;
+
+	xdbc_put_utf16(s_desc->wData, XDBC_STRING_MANUFACTURER, strlen(XDBC_STRING_MANUFACTURER));
+	string_length += s_desc->bLength;
+	string_length <<= 8;
+
+	/* String0: */
+	strings->string0[0]	= 4;
+	strings->string0[1]	= USB_DT_STRING;
+	strings->string0[2]	= 0x09;
+	strings->string0[3]	= 0x04;
+
+	string_length += 4;
+
+	/* Populate info Context: */
+	ctx = (struct xdbc_context *)xdbc.dbcc_base;
+
+	ctx->info.string0	= cpu_to_le64(xdbc.string_dma);
+	ctx->info.manufacturer	= cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH);
+	ctx->info.product	= cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH * 2);
+	ctx->info.serial	= cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH * 3);
+	ctx->info.length	= cpu_to_le32(string_length);
+
+	/* Populate bulk out endpoint context: */
+	max_burst = DEBUG_MAX_BURST(readl(&xdbc.xdbc_reg->control));
+	ep_out = (struct xdbc_ep_context *)&ctx->out;
+
+	ep_out->ep_info1	= 0;
+	ep_out->ep_info2	= cpu_to_le32(EP_TYPE(BULK_OUT_EP) | MAX_PACKET(1024) | MAX_BURST(max_burst));
+	ep_out->deq		= cpu_to_le64(xdbc.out_seg.dma | xdbc.out_ring.cycle_state);
+
+	/* Populate bulk in endpoint context: */
+	ep_in = (struct xdbc_ep_context *)&ctx->in;
+
+	ep_in->ep_info1		= 0;
+	ep_in->ep_info2		= cpu_to_le32(EP_TYPE(BULK_OUT_EP) | MAX_PACKET(1024) | MAX_BURST(max_burst));
+	ep_in->deq		= cpu_to_le64(xdbc.in_seg.dma | xdbc.in_ring.cycle_state);
+
+	/* Set DbC context and info registers: */
+	xdbc_write64(xdbc.dbcc_dma, &xdbc.xdbc_reg->dccp);
+
+	dev_info = cpu_to_le32((XDBC_VENDOR_ID << 16) | XDBC_PROTOCOL);
+	writel(dev_info, &xdbc.xdbc_reg->devinfo1);
+
+	dev_info = cpu_to_le32((XDBC_DEVICE_REV << 16) | XDBC_PRODUCT_ID);
+	writel(dev_info, &xdbc.xdbc_reg->devinfo2);
+
+	xdbc.in_buf = xdbc.out_buf + XDBC_MAX_PACKET;
+	xdbc.in_dma = xdbc.out_dma + XDBC_MAX_PACKET;
+}
+
+static void xdbc_do_reset_debug_port(u32 id, u32 count)
+{
+	void __iomem *ops_reg;
+	void __iomem *portsc;
+	u32 val, cap_length;
+	int i;
+
+	cap_length = readl(xdbc.xhci_base) & 0xff;
+	ops_reg = xdbc.xhci_base + cap_length;
+
+	id--;
+	for (i = id; i < (id + count); i++) {
+		portsc = ops_reg + 0x400 + i * 0x10;
+		val = readl(portsc);
+		if (!(val & PORT_CONNECT))
+			writel(val | PORT_RESET, portsc);
+	}
+}
+
+static void xdbc_reset_debug_port(void)
+{
+	u32 val, port_offset, port_count;
+	int offset = 0;
+
+	do {
+		offset = xhci_find_next_ext_cap(xdbc.xhci_base, offset, XHCI_EXT_CAPS_PROTOCOL);
+		if (!offset)
+			break;
+
+		val = readl(xdbc.xhci_base + offset);
+		if (XHCI_EXT_PORT_MAJOR(val) != 0x3)
+			continue;
+
+		val = readl(xdbc.xhci_base + offset + 8);
+		port_offset = XHCI_EXT_PORT_OFF(val);
+		port_count = XHCI_EXT_PORT_COUNT(val);
+
+		xdbc_do_reset_debug_port(port_offset, port_count);
+	} while (1);
+}
+
+static void
+xdbc_queue_trb(struct xdbc_ring *ring, u32 field1, u32 field2, u32 field3, u32 field4)
+{
+	struct xdbc_trb *trb, *link_trb;
+
+	trb = ring->enqueue;
+	trb->field[0] = cpu_to_le32(field1);
+	trb->field[1] = cpu_to_le32(field2);
+	trb->field[2] = cpu_to_le32(field3);
+	trb->field[3] = cpu_to_le32(field4);
+
+	++(ring->enqueue);
+	if (ring->enqueue >= &ring->segment->trbs[TRBS_PER_SEGMENT - 1]) {
+		link_trb = ring->enqueue;
+		if (ring->cycle_state)
+			link_trb->field[3] |= cpu_to_le32(TRB_CYCLE);
+		else
+			link_trb->field[3] &= cpu_to_le32(~TRB_CYCLE);
+
+		ring->enqueue = ring->segment->trbs;
+		ring->cycle_state ^= 1;
+	}
+}
+
+static void xdbc_ring_doorbell(int target)
+{
+	writel(DOOR_BELL_TARGET(target), &xdbc.xdbc_reg->doorbell);
+}
+
+static int xdbc_start(void)
+{
+	u32 ctrl, status;
+	int ret;
+
+	ctrl = readl(&xdbc.xdbc_reg->control);
+	writel(ctrl | CTRL_DBC_ENABLE | CTRL_PORT_ENABLE, &xdbc.xdbc_reg->control);
+	ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, CTRL_DBC_ENABLE, 100000, 100);
+	if (ret) {
+		xdbc_trace("failed to initialize hardware\n");
+		return ret;
+	}
+
+	/* Reset port to avoid bus hang: */
+	if (xdbc.vendor == PCI_VENDOR_ID_INTEL)
+		xdbc_reset_debug_port();
+
+	/* Wait for port connection: */
+	ret = handshake(&xdbc.xdbc_reg->portsc, PORTSC_CONN_STATUS, PORTSC_CONN_STATUS, 5000000, 100);
+	if (ret) {
+		xdbc_trace("waiting for connection timed out\n");
+		return ret;
+	}
+
+	/* Wait for debug device to be configured: */
+	ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_RUN, CTRL_DBC_RUN, 5000000, 100);
+	if (ret) {
+		xdbc_trace("waiting for device configuration timed out\n");
+		return ret;
+	}
+
+	/* Check port number: */
+	status = readl(&xdbc.xdbc_reg->status);
+	if (!DCST_DEBUG_PORT(status)) {
+		xdbc_trace("invalid root hub port number\n");
+		return -ENODEV;
+	}
+
+	xdbc.port_number = DCST_DEBUG_PORT(status);
+
+	xdbc_trace("DbC is running now, control 0x%08x port ID %d\n",
+		   readl(&xdbc.xdbc_reg->control), xdbc.port_number);
+
+	return 0;
+}
+
+static int xdbc_bulk_transfer(void *data, int size, bool read)
+{
+	struct xdbc_ring *ring;
+	struct xdbc_trb *trb;
+	u32 length, control;
+	u32 cycle;
+	u64 addr;
+
+	if (size > XDBC_MAX_PACKET) {
+		xdbc_trace("bad parameter, size %d\n", size);
+		return -EINVAL;
+	}
+
+	if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED) ||
+	    !(xdbc.flags & XDBC_FLAGS_CONFIGURED) ||
+	    (!read && (xdbc.flags & XDBC_FLAGS_OUT_STALL)) ||
+	    (read && (xdbc.flags & XDBC_FLAGS_IN_STALL))) {
+
+		xdbc_trace("connection not ready, flags %08x\n", xdbc.flags);
+		return -EIO;
+	}
+
+	ring = (read ? &xdbc.in_ring : &xdbc.out_ring);
+	trb = ring->enqueue;
+	cycle = ring->cycle_state;
+	length = TRB_LEN(size);
+	control = TRB_TYPE(TRB_NORMAL) | TRB_IOC;
+
+	if (cycle)
+		control &= cpu_to_le32(~TRB_CYCLE);
+	else
+		control |= cpu_to_le32(TRB_CYCLE);
+
+	if (read) {
+		memset(xdbc.in_buf, 0, XDBC_MAX_PACKET);
+		addr = xdbc.in_dma;
+		xdbc.flags |= XDBC_FLAGS_IN_PROCESS;
+	} else {
+		memset(xdbc.out_buf, 0, XDBC_MAX_PACKET);
+		memcpy(xdbc.out_buf, data, size);
+		addr = xdbc.out_dma;
+		xdbc.flags |= XDBC_FLAGS_OUT_PROCESS;
+	}
+
+	xdbc_queue_trb(ring, lower_32_bits(addr), upper_32_bits(addr), length, control);
+
+	/*
+	 * Add a barrier between writes of trb fields and flipping
+	 * the cycle bit:
+	 */
+	wmb();
+	if (cycle)
+		trb->field[3] |= cpu_to_le32(cycle);
+	else
+		trb->field[3] &= cpu_to_le32(~TRB_CYCLE);
+
+	xdbc_ring_doorbell(read ? IN_EP_DOORBELL : OUT_EP_DOORBELL);
+
+	return size;
+}
+
+static int xdbc_handle_external_reset(void)
+{
+	int ret = 0;
+
+	xdbc.flags = 0;
+	writel(0, &xdbc.xdbc_reg->control);
+	ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, 0, 100000, 10);
+	if (ret)
+		goto reset_out;
+
+	xdbc_mem_init();
+
+	mmiowb();
+
+	ret = xdbc_start();
+	if (ret < 0)
+		goto reset_out;
+
+	xdbc_trace("dbc recovered\n");
+
+	xdbc.flags |= XDBC_FLAGS_INITIALIZED | XDBC_FLAGS_CONFIGURED;
+
+	xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true);
+
+	return 0;
+
+reset_out:
+	xdbc_trace("failed to recover from external reset\n");
+	return ret;
+}
+
+static int __init xdbc_early_setup(void)
+{
+	int ret;
+
+	writel(0, &xdbc.xdbc_reg->control);
+	ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, 0, 100000, 100);
+	if (ret)
+		return ret;
+
+	/* Allocate the table page: */
+	xdbc.table_base = xdbc_get_page(&xdbc.table_dma);
+	if (!xdbc.table_base)
+		return -ENOMEM;
+
+	/* Get and store the transfer buffer: */
+	xdbc.out_buf = xdbc_get_page(&xdbc.out_dma);
+	if (!xdbc.out_buf)
+		return -ENOMEM;
+
+	/* Allocate the event ring: */
+	ret = xdbc_alloc_ring(&xdbc.evt_seg, &xdbc.evt_ring);
+	if (ret < 0)
+		return ret;
+
+	/* Allocate IN/OUT endpoint transfer rings: */
+	ret = xdbc_alloc_ring(&xdbc.in_seg, &xdbc.in_ring);
+	if (ret < 0)
+		return ret;
+
+	ret = xdbc_alloc_ring(&xdbc.out_seg, &xdbc.out_ring);
+	if (ret < 0)
+		return ret;
+
+	xdbc_mem_init();
+
+	mmiowb();
+
+	ret = xdbc_start();
+	if (ret < 0) {
+		writel(0, &xdbc.xdbc_reg->control);
+		return ret;
+	}
+
+	xdbc.flags |= XDBC_FLAGS_INITIALIZED | XDBC_FLAGS_CONFIGURED;
+
+	xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true);
+
+	return 0;
+}
+
+int __init early_xdbc_parse_parameter(char *s)
+{
+	unsigned long dbgp_num = 0;
+	u32 bus, dev, func, offset;
+	int ret;
+
+	if (!early_pci_allowed())
+		return -EPERM;
+
+	if (strstr(s, "keep"))
+		early_console_keep = true;
+
+	if (xdbc.xdbc_reg)
+		return 0;
+
+	if (*s && kstrtoul(s, 0, &dbgp_num))
+		dbgp_num = 0;
+
+	pr_notice("dbgp_num: %lu\n", dbgp_num);
+
+	/* Locate the host controller: */
+	ret = xdbc_find_dbgp(dbgp_num, &bus, &dev, &func);
+	if (ret) {
+		pr_notice("failed to locate xhci host\n");
+		return -ENODEV;
+	}
+
+	xdbc.vendor	= read_pci_config_16(bus, dev, func, PCI_VENDOR_ID);
+	xdbc.device	= read_pci_config_16(bus, dev, func, PCI_DEVICE_ID);
+	xdbc.bus	= bus;
+	xdbc.dev	= dev;
+	xdbc.func	= func;
+
+	/* Map the IO memory: */
+	xdbc.xhci_base = xdbc_map_pci_mmio(bus, dev, func);
+	if (!xdbc.xhci_base)
+		return -EINVAL;
+
+	/* Locate DbC registers: */
+	offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_DEBUG);
+	if (!offset) {
+		pr_notice("xhci host doesn't support debug capability\n");
+		early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
+		xdbc.xhci_base = NULL;
+		xdbc.xhci_length = 0;
+
+		return -ENODEV;
+	}
+	xdbc.xdbc_reg = (struct xdbc_regs __iomem *)(xdbc.xhci_base + offset);
+
+	return 0;
+}
+
+int __init early_xdbc_setup_hardware(void)
+{
+	int ret;
+
+	if (!xdbc.xdbc_reg)
+		return -ENODEV;
+
+	xdbc_bios_handoff();
+
+	raw_spin_lock_init(&xdbc.lock);
+
+	ret = xdbc_early_setup();
+	if (ret) {
+		pr_notice("failed to setup the connection to host\n");
+
+		xdbc_free_ring(&xdbc.evt_ring);
+		xdbc_free_ring(&xdbc.out_ring);
+		xdbc_free_ring(&xdbc.in_ring);
+
+		if (xdbc.table_dma)
+			free_bootmem(xdbc.table_dma, PAGE_SIZE);
+
+		if (xdbc.out_dma)
+			free_bootmem(xdbc.out_dma, PAGE_SIZE);
+
+		xdbc.table_base = NULL;
+		xdbc.out_buf = NULL;
+	}
+
+	return ret;
+}
+
+static void xdbc_handle_port_status(struct xdbc_trb *evt_trb)
+{
+	u32 port_reg;
+
+	port_reg = readl(&xdbc.xdbc_reg->portsc);
+	if (port_reg & PORTSC_CONN_CHANGE) {
+		xdbc_trace("connect status change event\n");
+
+		/* Check whether cable unplugged: */
+		if (!(port_reg & PORTSC_CONN_STATUS)) {
+			xdbc.flags = 0;
+			xdbc_trace("cable unplugged\n");
+		}
+	}
+
+	if (port_reg & PORTSC_RESET_CHANGE)
+		xdbc_trace("port reset change event\n");
+
+	if (port_reg & PORTSC_LINK_CHANGE)
+		xdbc_trace("port link status change event\n");
+
+	if (port_reg & PORTSC_CONFIG_CHANGE)
+		xdbc_trace("config error change\n");
+
+	/* Write back the value to clear RW1C bits: */
+	writel(port_reg, &xdbc.xdbc_reg->portsc);
+}
+
+static void xdbc_handle_tx_event(struct xdbc_trb *evt_trb)
+{
+	size_t remain_length;
+	u32 comp_code;
+	int ep_id;
+
+	comp_code	= GET_COMP_CODE(le32_to_cpu(evt_trb->field[2]));
+	remain_length	= EVENT_TRB_LEN(le32_to_cpu(evt_trb->field[2]));
+	ep_id		= TRB_TO_EP_ID(le32_to_cpu(evt_trb->field[3]));
+
+	switch (comp_code) {
+	case COMP_SUCCESS:
+		remain_length = 0;
+	case COMP_SHORT_PACKET:
+		break;
+	case COMP_TRB_ERROR:
+	case COMP_BABBLE_DETECTED_ERROR:
+	case COMP_USB_TRANSACTION_ERROR:
+	case COMP_STALL_ERROR:
+	default:
+		if (ep_id == XDBC_EPID_OUT)
+			xdbc.flags |= XDBC_FLAGS_OUT_STALL;
+		if (ep_id == XDBC_EPID_IN)
+			xdbc.flags |= XDBC_FLAGS_IN_STALL;
+
+		xdbc_trace("endpoint %d stalled\n", ep_id);
+		break;
+	}
+
+	if (ep_id == XDBC_EPID_IN) {
+		xdbc.flags &= ~XDBC_FLAGS_IN_PROCESS;
+		xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true);
+	} else if (ep_id == XDBC_EPID_OUT) {
+		xdbc.flags &= ~XDBC_FLAGS_OUT_PROCESS;
+	} else {
+		xdbc_trace("invalid endpoint id %d\n", ep_id);
+	}
+}
+
+static void xdbc_handle_events(void)
+{
+	struct xdbc_trb *evt_trb;
+	bool update_erdp = false;
+	u32 reg;
+	u8 cmd;
+
+	cmd = read_pci_config_byte(xdbc.bus, xdbc.dev, xdbc.func, PCI_COMMAND);
+	if (!(cmd & PCI_COMMAND_MASTER)) {
+		cmd |= PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY;
+		write_pci_config_byte(xdbc.bus, xdbc.dev, xdbc.func, PCI_COMMAND, cmd);
+	}
+
+	if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED))
+		return;
+
+	/* Handle external reset events: */
+	reg = readl(&xdbc.xdbc_reg->control);
+	if (!(reg & CTRL_DBC_ENABLE)) {
+		if (xdbc_handle_external_reset()) {
+			xdbc_trace("failed to recover connection\n");
+			return;
+		}
+	}
+
+	/* Handle configure-exit event: */
+	reg = readl(&xdbc.xdbc_reg->control);
+	if (reg & CTRL_DBC_RUN_CHANGE) {
+		writel(reg, &xdbc.xdbc_reg->control);
+		if (reg & CTRL_DBC_RUN)
+			xdbc.flags |= XDBC_FLAGS_CONFIGURED;
+		else
+			xdbc.flags &= ~XDBC_FLAGS_CONFIGURED;
+	}
+
+	/* Handle endpoint stall event: */
+	reg = readl(&xdbc.xdbc_reg->control);
+	if (reg & CTRL_HALT_IN_TR) {
+		xdbc.flags |= XDBC_FLAGS_IN_STALL;
+	} else {
+		xdbc.flags &= ~XDBC_FLAGS_IN_STALL;
+		if (!(xdbc.flags & XDBC_FLAGS_IN_PROCESS))
+			xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true);
+	}
+
+	if (reg & CTRL_HALT_OUT_TR)
+		xdbc.flags |= XDBC_FLAGS_OUT_STALL;
+	else
+		xdbc.flags &= ~XDBC_FLAGS_OUT_STALL;
+
+	/* Handle the events in the event ring: */
+	evt_trb = xdbc.evt_ring.dequeue;
+	while ((le32_to_cpu(evt_trb->field[3]) & TRB_CYCLE) == xdbc.evt_ring.cycle_state) {
+		/*
+		 * Add a barrier between reading the cycle flag and any
+		 * reads of the event's flags/data below:
+		 */
+		rmb();
+
+		switch ((le32_to_cpu(evt_trb->field[3]) & TRB_TYPE_BITMASK)) {
+		case TRB_TYPE(TRB_PORT_STATUS):
+			xdbc_handle_port_status(evt_trb);
+			break;
+		case TRB_TYPE(TRB_TRANSFER):
+			xdbc_handle_tx_event(evt_trb);
+			break;
+		default:
+			break;
+		}
+
+		++(xdbc.evt_ring.dequeue);
+		if (xdbc.evt_ring.dequeue == &xdbc.evt_seg.trbs[TRBS_PER_SEGMENT]) {
+			xdbc.evt_ring.dequeue = xdbc.evt_seg.trbs;
+			xdbc.evt_ring.cycle_state ^= 1;
+		}
+
+		evt_trb = xdbc.evt_ring.dequeue;
+		update_erdp = true;
+	}
+
+	/* Update event ring dequeue pointer: */
+	if (update_erdp)
+		xdbc_write64(__pa(xdbc.evt_ring.dequeue), &xdbc.xdbc_reg->erdp);
+}
+
+static int xdbc_bulk_write(const char *bytes, int size)
+{
+	int ret, timeout = 0;
+	unsigned long flags;
+
+retry:
+	if (in_nmi()) {
+		if (!raw_spin_trylock_irqsave(&xdbc.lock, flags))
+			return -EAGAIN;
+	} else {
+		raw_spin_lock_irqsave(&xdbc.lock, flags);
+	}
+
+	xdbc_handle_events();
+
+	/* Check completion of the previous request: */
+	if ((xdbc.flags & XDBC_FLAGS_OUT_PROCESS) && (timeout < 2000000)) {
+		raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+		udelay(100);
+		timeout += 100;
+		goto retry;
+	}
+
+	if (xdbc.flags & XDBC_FLAGS_OUT_PROCESS) {
+		raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+		xdbc_trace("previous transfer not completed yet\n");
+
+		return -ETIMEDOUT;
+	}
+
+	ret = xdbc_bulk_transfer((void *)bytes, size, false);
+	raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+
+	return ret;
+}
+
+static void early_xdbc_write(struct console *con, const char *str, u32 n)
+{
+	static char buf[XDBC_MAX_PACKET];
+	int chunk, ret;
+	int use_cr = 0;
+
+	if (!xdbc.xdbc_reg)
+		return;
+	memset(buf, 0, XDBC_MAX_PACKET);
+	while (n > 0) {
+		for (chunk = 0; chunk < XDBC_MAX_PACKET && n > 0; str++, chunk++, n--) {
+
+			if (!use_cr && *str == '\n') {
+				use_cr = 1;
+				buf[chunk] = '\r';
+				str--;
+				n++;
+				continue;
+			}
+
+			if (use_cr)
+				use_cr = 0;
+			buf[chunk] = *str;
+		}
+
+		if (chunk > 0) {
+			ret = xdbc_bulk_write(buf, chunk);
+			if (ret < 0)
+				xdbc_trace("missed message {%s}\n", buf);
+		}
+	}
+}
+
+static struct console early_xdbc_console = {
+	.name		= "earlyxdbc",
+	.write		= early_xdbc_write,
+	.flags		= CON_PRINTBUFFER,
+	.index		= -1,
+};
+
+void __init early_xdbc_register_console(void)
+{
+	if (early_console)
+		return;
+
+	early_console = &early_xdbc_console;
+	if (early_console_keep)
+		early_console->flags &= ~CON_BOOT;
+	else
+		early_console->flags |= CON_BOOT;
+	register_console(early_console);
+}
+
+static void xdbc_unregister_console(void)
+{
+	if (early_xdbc_console.flags & CON_ENABLED)
+		unregister_console(&early_xdbc_console);
+}
+
+static int xdbc_scrub_function(void *ptr)
+{
+	unsigned long flags;
+
+	while (true) {
+		raw_spin_lock_irqsave(&xdbc.lock, flags);
+		xdbc_handle_events();
+
+		if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED)) {
+			raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+			break;
+		}
+
+		raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+		schedule_timeout_interruptible(1);
+	}
+
+	xdbc_unregister_console();
+	writel(0, &xdbc.xdbc_reg->control);
+	xdbc_trace("dbc scrub function exits\n");
+
+	return 0;
+}
+
+static int __init xdbc_init(void)
+{
+	unsigned long flags;
+	void __iomem *base;
+	int ret = 0;
+	u32 offset;
+
+	if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED))
+		return 0;
+
+	/*
+	 * It's time to shut down the DbC, so that the debug
+	 * port can be reused by the host controller:
+	 */
+	if (early_xdbc_console.index == -1 ||
+	    (early_xdbc_console.flags & CON_BOOT)) {
+		xdbc_trace("hardware not used anymore\n");
+		goto free_and_quit;
+	}
+
+	base = ioremap_nocache(xdbc.xhci_start, xdbc.xhci_length);
+	if (!base) {
+		xdbc_trace("failed to remap the io address\n");
+		ret = -ENOMEM;
+		goto free_and_quit;
+	}
+
+	raw_spin_lock_irqsave(&xdbc.lock, flags);
+	early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
+	xdbc.xhci_base = base;
+	offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_DEBUG);
+	xdbc.xdbc_reg = (struct xdbc_regs __iomem *)(xdbc.xhci_base + offset);
+	raw_spin_unlock_irqrestore(&xdbc.lock, flags);
+
+	kthread_run(xdbc_scrub_function, NULL, "%s", "xdbc");
+
+	return 0;
+
+free_and_quit:
+	xdbc_free_ring(&xdbc.evt_ring);
+	xdbc_free_ring(&xdbc.out_ring);
+	xdbc_free_ring(&xdbc.in_ring);
+	free_bootmem(xdbc.table_dma, PAGE_SIZE);
+	free_bootmem(xdbc.out_dma, PAGE_SIZE);
+	writel(0, &xdbc.xdbc_reg->control);
+	early_iounmap(xdbc.xhci_base, xdbc.xhci_length);
+
+	return ret;
+}
+subsys_initcall(xdbc_init);
diff --git a/drivers/usb/early/xhci-dbc.h b/drivers/usb/early/xhci-dbc.h
new file mode 100644
index 000000000000..2df0f6e613fe
--- /dev/null
+++ b/drivers/usb/early/xhci-dbc.h
@@ -0,0 +1,211 @@
+/*
+ * xhci-dbc.h - xHCI debug capability early driver
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_XHCI_DBC_H
+#define __LINUX_XHCI_DBC_H
+
+#include <linux/types.h>
+#include <linux/usb/ch9.h>
+
+/*
+ * xHCI Debug Capability Register interfaces:
+ */
+struct xdbc_regs {
+	__le32	capability;
+	__le32	doorbell;
+	__le32	ersts;		/* Event Ring Segment Table Size*/
+	__le32	__reserved_0;	/* 0c~0f reserved bits */
+	__le64	erstba;		/* Event Ring Segment Table Base Address */
+	__le64	erdp;		/* Event Ring Dequeue Pointer */
+	__le32	control;
+	__le32	status;
+	__le32	portsc;		/* Port status and control */
+	__le32	__reserved_1;	/* 2b~28 reserved bits */
+	__le64	dccp;		/* Debug Capability Context Pointer */
+	__le32	devinfo1;	/* Device Descriptor Info Register 1 */
+	__le32	devinfo2;	/* Device Descriptor Info Register 2 */
+};
+
+#define DEBUG_MAX_BURST(p)	(((p) >> 16) & 0xff)
+
+#define CTRL_DBC_RUN		BIT(0)
+#define CTRL_PORT_ENABLE	BIT(1)
+#define CTRL_HALT_OUT_TR	BIT(2)
+#define CTRL_HALT_IN_TR		BIT(3)
+#define CTRL_DBC_RUN_CHANGE	BIT(4)
+#define CTRL_DBC_ENABLE		BIT(31)
+
+#define DCST_DEBUG_PORT(p)	(((p) >> 24) & 0xff)
+
+#define PORTSC_CONN_STATUS	BIT(0)
+#define PORTSC_CONN_CHANGE	BIT(17)
+#define PORTSC_RESET_CHANGE	BIT(21)
+#define PORTSC_LINK_CHANGE	BIT(22)
+#define PORTSC_CONFIG_CHANGE	BIT(23)
+
+/*
+ * xHCI Debug Capability data structures:
+ */
+struct xdbc_trb {
+	__le32 field[4];
+};
+
+struct xdbc_erst_entry {
+	__le64	seg_addr;
+	__le32	seg_size;
+	__le32	__reserved_0;
+};
+
+struct xdbc_info_context {
+	__le64	string0;
+	__le64	manufacturer;
+	__le64	product;
+	__le64	serial;
+	__le32	length;
+	__le32	__reserved_0[7];
+};
+
+struct xdbc_ep_context {
+	__le32	ep_info1;
+	__le32	ep_info2;
+	__le64	deq;
+	__le32	tx_info;
+	__le32	__reserved_0[11];
+};
+
+struct xdbc_context {
+	struct xdbc_info_context	info;
+	struct xdbc_ep_context		out;
+	struct xdbc_ep_context		in;
+};
+
+#define XDBC_INFO_CONTEXT_SIZE		48
+#define XDBC_MAX_STRING_LENGTH		64
+#define XDBC_STRING_MANUFACTURER	"Linux"
+#define XDBC_STRING_PRODUCT		"Remote GDB"
+#define XDBC_STRING_SERIAL		"0001"
+
+struct xdbc_strings {
+	char	string0[XDBC_MAX_STRING_LENGTH];
+	char	manufacturer[XDBC_MAX_STRING_LENGTH];
+	char	product[XDBC_MAX_STRING_LENGTH];
+	char	serial[XDBC_MAX_STRING_LENGTH];
+};
+
+#define XDBC_PROTOCOL		1	/* GNU Remote Debug Command Set */
+#define XDBC_VENDOR_ID		0x1d6b	/* Linux Foundation 0x1d6b */
+#define XDBC_PRODUCT_ID		0x0004	/* __le16 idProduct; device 0004 */
+#define XDBC_DEVICE_REV		0x0010	/* 0.10 */
+
+/*
+ * xHCI Debug Capability software state structures:
+ */
+struct xdbc_segment {
+	struct xdbc_trb		*trbs;
+	dma_addr_t		dma;
+};
+
+#define XDBC_TRBS_PER_SEGMENT	256
+
+struct xdbc_ring {
+	struct xdbc_segment	*segment;
+	struct xdbc_trb		*enqueue;
+	struct xdbc_trb		*dequeue;
+	u32			cycle_state;
+};
+
+#define XDBC_EPID_OUT		2
+#define XDBC_EPID_IN		3
+
+struct xdbc_state {
+	u16			vendor;
+	u16			device;
+	u32			bus;
+	u32			dev;
+	u32			func;
+	void __iomem		*xhci_base;
+	u64			xhci_start;
+	size_t			xhci_length;
+	int			port_number;
+
+	/* DbC register base */
+	struct xdbc_regs __iomem *xdbc_reg;
+
+	/* DbC table page */
+	dma_addr_t		table_dma;
+	void			*table_base;
+
+	/* event ring segment table */
+	dma_addr_t		erst_dma;
+	size_t			erst_size;
+	void			*erst_base;
+
+	/* event ring segments */
+	struct xdbc_ring	evt_ring;
+	struct xdbc_segment	evt_seg;
+
+	/* debug capability contexts */
+	dma_addr_t		dbcc_dma;
+	size_t			dbcc_size;
+	void			*dbcc_base;
+
+	/* descriptor strings */
+	dma_addr_t		string_dma;
+	size_t			string_size;
+	void			*string_base;
+
+	/* bulk OUT endpoint */
+	struct xdbc_ring	out_ring;
+	struct xdbc_segment	out_seg;
+	void			*out_buf;
+	dma_addr_t		out_dma;
+
+	/* bulk IN endpoint */
+	struct xdbc_ring	in_ring;
+	struct xdbc_segment	in_seg;
+	void			*in_buf;
+	dma_addr_t		in_dma;
+
+	u32			flags;
+
+	/* spinlock for early_xdbc_write() reentrancy */
+	raw_spinlock_t		lock;
+};
+
+#define XDBC_PCI_MAX_BUSES	256
+#define XDBC_PCI_MAX_DEVICES	32
+#define XDBC_PCI_MAX_FUNCTION	8
+
+#define XDBC_TABLE_ENTRY_SIZE	64
+#define XDBC_ERST_ENTRY_NUM	1
+#define XDBC_DBCC_ENTRY_NUM	3
+#define XDBC_STRING_ENTRY_NUM	4
+
+/* Bits definitions for xdbc_state.flags: */
+#define XDBC_FLAGS_INITIALIZED	BIT(0)
+#define XDBC_FLAGS_IN_STALL	BIT(1)
+#define XDBC_FLAGS_OUT_STALL	BIT(2)
+#define XDBC_FLAGS_IN_PROCESS	BIT(3)
+#define XDBC_FLAGS_OUT_PROCESS	BIT(4)
+#define XDBC_FLAGS_CONFIGURED	BIT(5)
+
+#define XDBC_MAX_PACKET		1024
+
+/* Door bell target: */
+#define OUT_EP_DOORBELL		0
+#define IN_EP_DOORBELL		1
+#define DOOR_BELL_TARGET(p)	(((p) & 0xff) << 8)
+
+#define xdbc_read64(regs)	xhci_read_64(NULL, (regs))
+#define xdbc_write64(val, regs)	xhci_write_64(NULL, (val), (regs))
+
+#endif /* __LINUX_XHCI_DBC_H */
diff --git a/include/linux/usb/xhci-dbgp.h b/include/linux/usb/xhci-dbgp.h
new file mode 100644
index 000000000000..80c1cca1f529
--- /dev/null
+++ b/include/linux/usb/xhci-dbgp.h
@@ -0,0 +1,29 @@
+/*
+ * Standalone xHCI debug capability driver
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_XHCI_DBGP_H
+#define __LINUX_XHCI_DBGP_H
+
+#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
+int __init early_xdbc_parse_parameter(char *s);
+int __init early_xdbc_setup_hardware(void);
+void __init early_xdbc_register_console(void);
+#else
+static inline int __init early_xdbc_setup_hardware(void)
+{
+	return -ENODEV;
+}
+static inline void __init early_xdbc_register_console(void)
+{
+}
+#endif /* CONFIG_EARLY_PRINTK_USB_XDBC */
+#endif /* __LINUX_XHCI_DBGP_H */
-- 
cgit v1.3-6-gb490


From 4875253fddd7b6d322f028ad023d44b6efb7f73b Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Mar 2017 08:56:07 -0700
Subject: blk-stat: move BLK_RQ_STAT_BATCH definition to blk-stat.c

This is an implementation detail that no-one outside of blk-stat.c uses.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-stat.c          | 2 ++
 include/linux/blk_types.h | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-stat.c b/block/blk-stat.c
index f80582be5344..4681c488c262 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -9,6 +9,8 @@
 #include "blk-stat.h"
 #include "blk-mq.h"
 
+#define BLK_RQ_STAT_BATCH	64
+
 static void blk_stat_flush_batch(struct blk_rq_stat *stat)
 {
 	const s32 nr_batch = READ_ONCE(stat->nr_batch);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d703acb55d0f..e213c5e7500b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -287,8 +287,6 @@ struct blk_issue_stat {
 	u64 time;
 };
 
-#define BLK_RQ_STAT_BATCH	64
-
 struct blk_rq_stat {
 	s64 mean;
 	u64 min;
-- 
cgit v1.3-6-gb490


From 34dbad5d26e2f4b88e60f0e9ad03f99480802812 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 21 Mar 2017 08:56:08 -0700
Subject: blk-stat: convert to callback-based statistics reporting

Currently, statistics are gathered in ~0.13s windows, and users grab the
statistics whenever they need them. This is not ideal for both in-tree
users:

1. Writeback throttling wants its own dynamically sized window of
   statistics. Since the blk-stats statistics are reset after every
   window and the wbt windows don't line up with the blk-stats windows,
   wbt doesn't see every I/O.
2. Polling currently grabs the statistics on every I/O. Again, depending
   on how the window lines up, we may miss some I/Os. It's also
   unnecessary overhead to get the statistics on every I/O; the hybrid
   polling heuristic would be just as happy with the statistics from the
   previous full window.

This reworks the blk-stats infrastructure to be callback-based: users
register a callback that they want called at a given time with all of
the statistics from the window during which the callback was active.
Users can dynamically bucketize the statistics. wbt and polling both
currently use read vs. write, but polling can be extended to further
subdivide based on request size.

The callbacks are kept on an RCU list, and each callback has percpu
stats buffers. There will only be a few users, so the overhead on the
I/O completion side is low. The stats flushing is also simplified
considerably: since the timer function is responsible for clearing the
statistics, we don't have to worry about stale statistics.

wbt is a trivial conversion. After the conversion, the windowing problem
mentioned above is fixed.

For polling, we register an extra callback that caches the previous
window's statistics in the struct request_queue for the hybrid polling
heuristic to use.

Since we no longer have a single stats buffer for the request queue,
this also removes the sysfs and debugfs stats entries. To replace those,
we add a debugfs entry for the poll statistics.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c          |   6 +-
 block/blk-mq-debugfs.c    |  99 +++++++--------
 block/blk-mq.c            |  76 +++++++----
 block/blk-mq.h            |   1 -
 block/blk-stat.c          | 311 ++++++++++++++++++++++------------------------
 block/blk-stat.h          | 181 +++++++++++++++++++++++++--
 block/blk-sysfs.c         |  31 +----
 block/blk-wbt.c           |  51 +++-----
 block/blk-wbt.h           |   2 +-
 include/linux/blk_types.h |   1 -
 include/linux/blkdev.h    |  10 +-
 11 files changed, 449 insertions(+), 320 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index e8a9bc0d4bbb..78d04ddededc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -852,6 +852,10 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
 
 int blk_init_allocated_queue(struct request_queue *q)
 {
+	q->stats = blk_alloc_queue_stats();
+	if (!q->stats)
+		return -ENOMEM;
+
 	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
 	if (!q->fq)
 		return -ENOMEM;
@@ -2698,7 +2702,7 @@ void blk_finish_request(struct request *req, int error)
 	struct request_queue *q = req->q;
 
 	if (req->rq_flags & RQF_STATS)
-		blk_stat_add(&q->rq_stats[rq_data_dir(req)], req);
+		blk_stat_add(req);
 
 	if (req->rq_flags & RQF_QUEUED)
 		blk_queue_end_tag(q, req);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 48c88723944a..4b3f962a9c7a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -43,6 +43,42 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file,
 	return ret;
 }
 
+static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
+{
+	if (stat->nr_samples) {
+		seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+			   stat->nr_samples, stat->mean, stat->min, stat->max);
+	} else {
+		seq_puts(m, "samples=0");
+	}
+}
+
+static int queue_poll_stat_show(struct seq_file *m, void *v)
+{
+	struct request_queue *q = m->private;
+
+	seq_puts(m, "read: ");
+	print_stat(m, &q->poll_stat[READ]);
+	seq_puts(m, "\n");
+
+	seq_puts(m, "write: ");
+	print_stat(m, &q->poll_stat[WRITE]);
+	seq_puts(m, "\n");
+	return 0;
+}
+
+static int queue_poll_stat_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, queue_poll_stat_show, inode->i_private);
+}
+
+static const struct file_operations queue_poll_stat_fops = {
+	.open		= queue_poll_stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int hctx_state_show(struct seq_file *m, void *v)
 {
 	struct blk_mq_hw_ctx *hctx = m->private;
@@ -322,60 +358,6 @@ static const struct file_operations hctx_io_poll_fops = {
 	.release	= single_release,
 };
 
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
-	seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
-		   stat->nr_samples, stat->mean, stat->min, stat->max);
-}
-
-static int hctx_stats_show(struct seq_file *m, void *v)
-{
-	struct blk_mq_hw_ctx *hctx = m->private;
-	struct blk_rq_stat stat[2];
-
-	blk_stat_init(&stat[READ]);
-	blk_stat_init(&stat[WRITE]);
-
-	blk_hctx_stat_get(hctx, stat);
-
-	seq_puts(m, "read: ");
-	print_stat(m, &stat[READ]);
-	seq_puts(m, "\n");
-
-	seq_puts(m, "write: ");
-	print_stat(m, &stat[WRITE]);
-	seq_puts(m, "\n");
-	return 0;
-}
-
-static int hctx_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, hctx_stats_show, inode->i_private);
-}
-
-static ssize_t hctx_stats_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *ppos)
-{
-	struct seq_file *m = file->private_data;
-	struct blk_mq_hw_ctx *hctx = m->private;
-	struct blk_mq_ctx *ctx;
-	int i;
-
-	hctx_for_each_ctx(hctx, ctx, i) {
-		blk_stat_init(&ctx->stat[READ]);
-		blk_stat_init(&ctx->stat[WRITE]);
-	}
-	return count;
-}
-
-static const struct file_operations hctx_stats_fops = {
-	.open		= hctx_stats_open,
-	.read		= seq_read,
-	.write		= hctx_stats_write,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
 static int hctx_dispatched_show(struct seq_file *m, void *v)
 {
 	struct blk_mq_hw_ctx *hctx = m->private;
@@ -636,6 +618,11 @@ static const struct file_operations ctx_completed_fops = {
 	.release	= single_release,
 };
 
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
+	{"poll_stat", 0400, &queue_poll_stat_fops},
+	{},
+};
+
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"state", 0400, &hctx_state_fops},
 	{"flags", 0400, &hctx_flags_fops},
@@ -646,7 +633,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"sched_tags", 0400, &hctx_sched_tags_fops},
 	{"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops},
 	{"io_poll", 0600, &hctx_io_poll_fops},
-	{"stats", 0600, &hctx_stats_fops},
 	{"dispatched", 0600, &hctx_dispatched_fops},
 	{"queued", 0600, &hctx_queued_fops},
 	{"run", 0600, &hctx_run_fops},
@@ -753,6 +739,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
 	if (!q->mq_debugfs_dir)
 		goto err;
 
+	if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs))
+		goto err;
+
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (blk_mq_debugfs_register_hctx(q, hctx))
 			goto err;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 559e5363bb2c..5ff66f203cd0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -39,6 +39,9 @@
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
 
+static void blk_mq_poll_stats_start(struct request_queue *q);
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
+
 /*
  * Check if any of the ctx's have pending work in this hardware queue
  */
@@ -432,15 +435,8 @@ static void blk_mq_ipi_complete_request(struct request *rq)
 static void blk_mq_stat_add(struct request *rq)
 {
 	if (rq->rq_flags & RQF_STATS) {
-		/*
-		 * We could rq->mq_ctx here, but there's less of a risk
-		 * of races if we have the completion event add the stats
-		 * to the local software queue.
-		 */
-		struct blk_mq_ctx *ctx;
-
-		ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
-		blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
+		blk_mq_poll_stats_start(rq->q);
+		blk_stat_add(rq);
 	}
 }
 
@@ -2040,8 +2036,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		spin_lock_init(&__ctx->lock);
 		INIT_LIST_HEAD(&__ctx->rq_list);
 		__ctx->queue = q;
-		blk_stat_init(&__ctx->stat[READ]);
-		blk_stat_init(&__ctx->stat[WRITE]);
 
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
 		if (!cpu_online(i))
@@ -2339,6 +2333,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
+	q->stats = blk_alloc_queue_stats();
+	if (!q->stats)
+		goto err_exit;
+
+	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
+					     blk_stat_rq_ddir, 2, q);
+	if (!q->poll_cb)
+		goto err_exit;
+
 	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
 	if (!q->queue_ctx)
 		goto err_exit;
@@ -2740,27 +2743,52 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
 }
 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
 
+/* Enable polling stats and return whether they were already enabled. */
+static bool blk_poll_stats_enable(struct request_queue *q)
+{
+	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+	    test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+		return true;
+	blk_stat_add_callback(q, q->poll_cb);
+	return false;
+}
+
+static void blk_mq_poll_stats_start(struct request_queue *q)
+{
+	/*
+	 * We don't arm the callback if polling stats are not enabled or the
+	 * callback is already active.
+	 */
+	if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+	    blk_stat_is_active(q->poll_cb))
+		return;
+
+	blk_stat_activate_msecs(q->poll_cb, 100);
+}
+
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
+{
+	struct request_queue *q = cb->data;
+
+	if (cb->stat[READ].nr_samples)
+		q->poll_stat[READ] = cb->stat[READ];
+	if (cb->stat[WRITE].nr_samples)
+		q->poll_stat[WRITE] = cb->stat[WRITE];
+}
+
 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx,
 				       struct request *rq)
 {
-	struct blk_rq_stat stat[2];
 	unsigned long ret = 0;
 
 	/*
 	 * If stats collection isn't on, don't sleep but turn it on for
 	 * future users
 	 */
-	if (!blk_stat_enable(q))
+	if (!blk_poll_stats_enable(q))
 		return 0;
 
-	/*
-	 * We don't have to do this once per IO, should optimize this
-	 * to just use the current window of stats until it changes
-	 */
-	memset(&stat, 0, sizeof(stat));
-	blk_hctx_stat_get(hctx, stat);
-
 	/*
 	 * As an optimistic guess, use half of the mean service time
 	 * for this type of request. We can (and should) make this smarter.
@@ -2769,10 +2797,10 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
 	 * important on devices where the completion latencies are longer
 	 * than ~10 usec.
 	 */
-	if (req_op(rq) == REQ_OP_READ && stat[READ].nr_samples)
-		ret = (stat[READ].mean + 1) / 2;
-	else if (req_op(rq) == REQ_OP_WRITE && stat[WRITE].nr_samples)
-		ret = (stat[WRITE].mean + 1) / 2;
+	if (req_op(rq) == REQ_OP_READ && q->poll_stat[READ].nr_samples)
+		ret = (q->poll_stat[READ].mean + 1) / 2;
+	else if (req_op(rq) == REQ_OP_WRITE && q->poll_stat[WRITE].nr_samples)
+		ret = (q->poll_stat[WRITE].mean + 1) / 2;
 
 	return ret;
 }
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b79f9a7d8cf6..8d49c06fc520 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -20,7 +20,6 @@ struct blk_mq_ctx {
 
 	/* incremented at completion time */
 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
-	struct blk_rq_stat	stat[2];
 
 	struct request_queue	*queue;
 	struct kobject		kobj;
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 4681c488c262..0d8721a60db9 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2016 Jens Axboe
  */
 #include <linux/kernel.h>
+#include <linux/rculist.h>
 #include <linux/blk-mq.h>
 
 #include "blk-stat.h"
@@ -11,6 +12,24 @@
 
 #define BLK_RQ_STAT_BATCH	64
 
+struct blk_queue_stats {
+	struct list_head callbacks;
+	spinlock_t lock;
+};
+
+unsigned int blk_stat_rq_ddir(const struct request *rq)
+{
+	return rq_data_dir(rq);
+}
+EXPORT_SYMBOL_GPL(blk_stat_rq_ddir);
+
+static void blk_stat_init(struct blk_rq_stat *stat)
+{
+	stat->min = -1ULL;
+	stat->max = stat->nr_samples = stat->mean = 0;
+	stat->batch = stat->nr_batch = 0;
+}
+
 static void blk_stat_flush_batch(struct blk_rq_stat *stat)
 {
 	const s32 nr_batch = READ_ONCE(stat->nr_batch);
@@ -50,207 +69,171 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
 	dst->nr_samples += src->nr_samples;
 }
 
-static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
 {
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-	uint64_t latest = 0;
-	int i, j, nr;
-
-	blk_stat_init(&dst[READ]);
-	blk_stat_init(&dst[WRITE]);
-
-	nr = 0;
-	do {
-		uint64_t newest = 0;
-
-		queue_for_each_hw_ctx(q, hctx, i) {
-			hctx_for_each_ctx(hctx, ctx, j) {
-				blk_stat_flush_batch(&ctx->stat[READ]);
-				blk_stat_flush_batch(&ctx->stat[WRITE]);
-
-				if (!ctx->stat[READ].nr_samples &&
-				    !ctx->stat[WRITE].nr_samples)
-					continue;
-				if (ctx->stat[READ].time > newest)
-					newest = ctx->stat[READ].time;
-				if (ctx->stat[WRITE].time > newest)
-					newest = ctx->stat[WRITE].time;
-			}
-		}
+	stat->min = min(stat->min, value);
+	stat->max = max(stat->max, value);
 
-		/*
-		 * No samples
-		 */
-		if (!newest)
-			break;
-
-		if (newest > latest)
-			latest = newest;
-
-		queue_for_each_hw_ctx(q, hctx, i) {
-			hctx_for_each_ctx(hctx, ctx, j) {
-				if (ctx->stat[READ].time == newest) {
-					blk_stat_sum(&dst[READ],
-						     &ctx->stat[READ]);
-					nr++;
-				}
-				if (ctx->stat[WRITE].time == newest) {
-					blk_stat_sum(&dst[WRITE],
-						     &ctx->stat[WRITE]);
-					nr++;
-				}
-			}
-		}
-		/*
-		 * If we race on finding an entry, just loop back again.
-		 * Should be very rare.
-		 */
-	} while (!nr);
+	if (stat->batch + value < stat->batch ||
+	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+		blk_stat_flush_batch(stat);
 
-	dst[READ].time = dst[WRITE].time = latest;
+	stat->batch += value;
+	stat->nr_batch++;
 }
 
-void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+void blk_stat_add(struct request *rq)
 {
-	if (q->mq_ops)
-		blk_mq_stat_get(q, dst);
-	else {
-		blk_stat_flush_batch(&q->rq_stats[READ]);
-		blk_stat_flush_batch(&q->rq_stats[WRITE]);
-		memcpy(&dst[READ], &q->rq_stats[READ],
-		       sizeof(struct blk_rq_stat));
-		memcpy(&dst[WRITE], &q->rq_stats[WRITE],
-		       sizeof(struct blk_rq_stat));
+	struct request_queue *q = rq->q;
+	struct blk_stat_callback *cb;
+	struct blk_rq_stat *stat;
+	int bucket;
+	s64 now, value;
+
+	now = __blk_stat_time(ktime_to_ns(ktime_get()));
+	if (now < blk_stat_time(&rq->issue_stat))
+		return;
+
+	value = now - blk_stat_time(&rq->issue_stat);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
+		if (blk_stat_is_active(cb)) {
+			bucket = cb->bucket_fn(rq);
+			stat = &this_cpu_ptr(cb->cpu_stat)[bucket];
+			__blk_stat_add(stat, value);
+		}
 	}
+	rcu_read_unlock();
 }
 
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+static void blk_stat_timer_fn(unsigned long data)
 {
-	struct blk_mq_ctx *ctx;
-	unsigned int i, nr;
+	struct blk_stat_callback *cb = (void *)data;
+	unsigned int bucket;
+	int cpu;
 
-	nr = 0;
-	do {
-		uint64_t newest = 0;
+	for (bucket = 0; bucket < cb->buckets; bucket++)
+		blk_stat_init(&cb->stat[bucket]);
 
-		hctx_for_each_ctx(hctx, ctx, i) {
-			blk_stat_flush_batch(&ctx->stat[READ]);
-			blk_stat_flush_batch(&ctx->stat[WRITE]);
+	for_each_online_cpu(cpu) {
+		struct blk_rq_stat *cpu_stat;
 
-			if (!ctx->stat[READ].nr_samples &&
-			    !ctx->stat[WRITE].nr_samples)
-				continue;
-
-			if (ctx->stat[READ].time > newest)
-				newest = ctx->stat[READ].time;
-			if (ctx->stat[WRITE].time > newest)
-				newest = ctx->stat[WRITE].time;
+		cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+		for (bucket = 0; bucket < cb->buckets; bucket++) {
+			blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+			blk_stat_init(&cpu_stat[bucket]);
 		}
+	}
 
-		if (!newest)
-			break;
-
-		hctx_for_each_ctx(hctx, ctx, i) {
-			if (ctx->stat[READ].time == newest) {
-				blk_stat_sum(&dst[READ], &ctx->stat[READ]);
-				nr++;
-			}
-			if (ctx->stat[WRITE].time == newest) {
-				blk_stat_sum(&dst[WRITE], &ctx->stat[WRITE]);
-				nr++;
-			}
-		}
-		/*
-		 * If we race on finding an entry, just loop back again.
-		 * Should be very rare, as the window is only updated
-		 * occasionally
-		 */
-	} while (!nr);
+	cb->timer_fn(cb);
 }
 
-static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+			unsigned int (*bucket_fn)(const struct request *),
+			unsigned int buckets, void *data)
 {
-	stat->min = -1ULL;
-	stat->max = stat->nr_samples = stat->mean = 0;
-	stat->batch = stat->nr_batch = 0;
-	stat->time = time_now & BLK_STAT_NSEC_MASK;
-}
+	struct blk_stat_callback *cb;
 
-void blk_stat_init(struct blk_rq_stat *stat)
-{
-	__blk_stat_init(stat, ktime_to_ns(ktime_get()));
-}
+	cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+	if (!cb)
+		return NULL;
 
-static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
-{
-	return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
-}
+	cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat),
+				 GFP_KERNEL);
+	if (!cb->stat) {
+		kfree(cb);
+		return NULL;
+	}
+	cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
+				      __alignof__(struct blk_rq_stat));
+	if (!cb->cpu_stat) {
+		kfree(cb->stat);
+		kfree(cb);
+		return NULL;
+	}
 
-bool blk_stat_is_current(struct blk_rq_stat *stat)
-{
-	return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+	cb->timer_fn = timer_fn;
+	cb->bucket_fn = bucket_fn;
+	cb->data = data;
+	cb->buckets = buckets;
+	setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+
+	return cb;
 }
+EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
 
-void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+void blk_stat_add_callback(struct request_queue *q,
+			   struct blk_stat_callback *cb)
 {
-	s64 now, value;
+	unsigned int bucket;
+	int cpu;
 
-	now = __blk_stat_time(ktime_to_ns(ktime_get()));
-	if (now < blk_stat_time(&rq->issue_stat))
-		return;
+	for_each_possible_cpu(cpu) {
+		struct blk_rq_stat *cpu_stat;
 
-	if (!__blk_stat_is_current(stat, now))
-		__blk_stat_init(stat, now);
+		cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+		for (bucket = 0; bucket < cb->buckets; bucket++)
+			blk_stat_init(&cpu_stat[bucket]);
+	}
 
-	value = now - blk_stat_time(&rq->issue_stat);
-	if (value > stat->max)
-		stat->max = value;
-	if (value < stat->min)
-		stat->min = value;
+	spin_lock(&q->stats->lock);
+	list_add_tail_rcu(&cb->list, &q->stats->callbacks);
+	set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+	spin_unlock(&q->stats->lock);
+}
+EXPORT_SYMBOL_GPL(blk_stat_add_callback);
 
-	if (stat->batch + value < stat->batch ||
-	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
-		blk_stat_flush_batch(stat);
+void blk_stat_remove_callback(struct request_queue *q,
+			      struct blk_stat_callback *cb)
+{
+	spin_lock(&q->stats->lock);
+	list_del_rcu(&cb->list);
+	if (list_empty(&q->stats->callbacks))
+		clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+	spin_unlock(&q->stats->lock);
 
-	stat->batch += value;
-	stat->nr_batch++;
+	del_timer_sync(&cb->timer);
 }
+EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
 
-void blk_stat_clear(struct request_queue *q)
+static void blk_stat_free_callback_rcu(struct rcu_head *head)
 {
-	if (q->mq_ops) {
-		struct blk_mq_hw_ctx *hctx;
-		struct blk_mq_ctx *ctx;
-		int i, j;
-
-		queue_for_each_hw_ctx(q, hctx, i) {
-			hctx_for_each_ctx(hctx, ctx, j) {
-				blk_stat_init(&ctx->stat[READ]);
-				blk_stat_init(&ctx->stat[WRITE]);
-			}
-		}
-	} else {
-		blk_stat_init(&q->rq_stats[READ]);
-		blk_stat_init(&q->rq_stats[WRITE]);
-	}
+	struct blk_stat_callback *cb;
+
+	cb = container_of(head, struct blk_stat_callback, rcu);
+	free_percpu(cb->cpu_stat);
+	kfree(cb->stat);
+	kfree(cb);
 }
 
-void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+void blk_stat_free_callback(struct blk_stat_callback *cb)
 {
-	stat->time = (stat->time & BLK_STAT_MASK) |
-			(ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+	call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
 }
+EXPORT_SYMBOL_GPL(blk_stat_free_callback);
 
-/*
- * Enable stat tracking, return whether it was enabled
- */
-bool blk_stat_enable(struct request_queue *q)
+struct blk_queue_stats *blk_alloc_queue_stats(void)
 {
-	if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
-		return false;
-	}
+	struct blk_queue_stats *stats;
+
+	stats = kmalloc(sizeof(*stats), GFP_KERNEL);
+	if (!stats)
+		return NULL;
+
+	INIT_LIST_HEAD(&stats->callbacks);
+	spin_lock_init(&stats->lock);
+
+	return stats;
+}
+
+void blk_free_queue_stats(struct blk_queue_stats *stats)
+{
+	if (!stats)
+		return;
+
+	WARN_ON(!list_empty(&stats->callbacks));
 
-	return true;
+	kfree(stats);
 }
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 34384328b46b..6ad5b8c59a79 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -1,11 +1,11 @@
 #ifndef BLK_STAT_H
 #define BLK_STAT_H
 
-/*
- * ~0.13s window as a power-of-2 (2^27 nsecs)
- */
-#define BLK_STAT_NSEC		134217728ULL
-#define BLK_STAT_NSEC_MASK	~(BLK_STAT_NSEC - 1)
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/ktime.h>
+#include <linux/rcupdate.h>
+#include <linux/timer.h>
 
 /*
  * Upper 3 bits can be used elsewhere
@@ -15,14 +15,69 @@
 #define BLK_STAT_TIME_MASK	((1ULL << BLK_STAT_SHIFT) - 1)
 #define BLK_STAT_MASK		~BLK_STAT_TIME_MASK
 
-void blk_stat_add(struct blk_rq_stat *, struct request *);
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
-void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
-void blk_stat_clear(struct request_queue *);
-void blk_stat_init(struct blk_rq_stat *);
-bool blk_stat_is_current(struct blk_rq_stat *);
-void blk_stat_set_issue_time(struct blk_issue_stat *);
-bool blk_stat_enable(struct request_queue *);
+/**
+ * struct blk_stat_callback - Block statistics callback.
+ *
+ * A &struct blk_stat_callback is associated with a &struct request_queue. While
+ * @timer is active, that queue's request completion latencies are sorted into
+ * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the
+ * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked.
+ */
+struct blk_stat_callback {
+	/*
+	 * @list: RCU list of callbacks for a &struct request_queue.
+	 */
+	struct list_head list;
+
+	/**
+	 * @timer: Timer for the next callback invocation.
+	 */
+	struct timer_list timer;
+
+	/**
+	 * @cpu_stat: Per-cpu statistics buckets.
+	 */
+	struct blk_rq_stat __percpu *cpu_stat;
+
+	/**
+	 * @bucket_fn: Given a request, returns which statistics bucket it
+	 * should be accounted under.
+	 */
+	unsigned int (*bucket_fn)(const struct request *);
+
+	/**
+	 * @buckets: Number of statistics buckets.
+	 */
+	unsigned int buckets;
+
+	/**
+	 * @stat: Array of statistics buckets.
+	 */
+	struct blk_rq_stat *stat;
+
+	/**
+	 * @fn: Callback function.
+	 */
+	void (*timer_fn)(struct blk_stat_callback *);
+
+	/**
+	 * @data: Private pointer for the user.
+	 */
+	void *data;
+
+	struct rcu_head rcu;
+};
+
+struct blk_queue_stats *blk_alloc_queue_stats(void);
+void blk_free_queue_stats(struct blk_queue_stats *);
+
+void blk_stat_add(struct request *);
+
+static inline void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+{
+	stat->time = ((stat->time & BLK_STAT_MASK) |
+		      (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK));
+}
 
 static inline u64 __blk_stat_time(u64 time)
 {
@@ -34,4 +89,104 @@ static inline u64 blk_stat_time(struct blk_issue_stat *stat)
 	return __blk_stat_time(stat->time);
 }
 
+/*
+ * blk_stat_rq_ddir() - Bucket callback function for the request data direction.
+ * @rq: Request.
+ *
+ * This is the same as rq_data_dir() but as a function so it can be used as
+ * @bucket_fn for blk_stat_alloc_callback().
+ *
+ * Return: Data direction of the request, either READ or WRITE.
+ */
+unsigned int blk_stat_rq_ddir(const struct request *rq);
+
+/**
+ * blk_stat_alloc_callback() - Allocate a block statistics callback.
+ * @timer_fn: Timer callback function.
+ * @bucket_fn: Bucket callback function.
+ * @buckets: Number of statistics buckets.
+ * @data: Value for the @data field of the &struct blk_stat_callback.
+ *
+ * See &struct blk_stat_callback for details on the callback functions.
+ *
+ * Return: &struct blk_stat_callback on success or NULL on ENOMEM.
+ */
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+			unsigned int (*bucket_fn)(const struct request *),
+			unsigned int buckets, void *data);
+
+/**
+ * blk_stat_add_callback() - Add a block statistics callback to be run on a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * Note that a single &struct blk_stat_callback can only be added to a single
+ * &struct request_queue.
+ */
+void blk_stat_add_callback(struct request_queue *q,
+			   struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_remove_callback() - Remove a block statistics callback from a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * When this returns, the callback is not running on any CPUs and will not be
+ * called again unless readded.
+ */
+void blk_stat_remove_callback(struct request_queue *q,
+			      struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_free_callback() - Free a block statistics callback.
+ * @cb: The callback.
+ *
+ * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must
+ * not be associated with a request queue. I.e., if it was previously added with
+ * blk_stat_add_callback(), it must also have been removed since then with
+ * blk_stat_remove_callback().
+ */
+void blk_stat_free_callback(struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_is_active() - Check if a block statistics callback is currently
+ * gathering statistics.
+ * @cb: The callback.
+ */
+static inline bool blk_stat_is_active(struct blk_stat_callback *cb)
+{
+	return timer_pending(&cb->timer);
+}
+
+/**
+ * blk_stat_activate_nsecs() - Gather block statistics during a time window in
+ * nanoseconds.
+ * @cb: The callback.
+ * @nsecs: Number of nanoseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires.
+ */
+static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
+					   u64 nsecs)
+{
+	mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
+}
+
+/**
+ * blk_stat_activate_msecs() - Gather block statistics during a time window in
+ * milliseconds.
+ * @cb: The callback.
+ * @msecs: Number of milliseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires.
+ */
+static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
+					   unsigned int msecs)
+{
+	mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
+}
+
 #endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index fdb45fd0db0b..fa831cb2fc30 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
 	return queue_var_show(blk_queue_dax(q), page);
 }
 
-static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
-{
-	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
-			pre, (long long) stat->nr_samples,
-			(long long) stat->mean, (long long) stat->min,
-			(long long) stat->max);
-}
-
-static ssize_t queue_stats_show(struct request_queue *q, char *page)
-{
-	struct blk_rq_stat stat[2];
-	ssize_t ret;
-
-	blk_queue_stat_get(q, stat);
-
-	ret = print_stat(page, &stat[READ], "read :");
-	ret += print_stat(page + ret, &stat[WRITE], "write:");
-	return ret;
-}
-
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -691,11 +671,6 @@ static struct queue_sysfs_entry queue_dax_entry = {
 	.show = queue_dax_show,
 };
 
-static struct queue_sysfs_entry queue_stats_entry = {
-	.attr = {.name = "stats", .mode = S_IRUGO },
-	.show = queue_stats_show,
-};
-
 static struct queue_sysfs_entry queue_wb_lat_entry = {
 	.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_wb_lat_show,
@@ -733,7 +708,6 @@ static struct attribute *default_attrs[] = {
 	&queue_poll_entry.attr,
 	&queue_wc_entry.attr,
 	&queue_dax_entry.attr,
-	&queue_stats_entry.attr,
 	&queue_wb_lat_entry.attr,
 	&queue_poll_delay_entry.attr,
 	NULL,
@@ -811,6 +785,9 @@ static void blk_release_queue(struct kobject *kobj)
 		container_of(kobj, struct request_queue, kobj);
 
 	wbt_exit(q);
+	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+		blk_stat_remove_callback(q, q->poll_cb);
+	blk_stat_free_callback(q->poll_cb);
 	bdi_put(q->backing_dev_info);
 	blkcg_exit_queue(q);
 
@@ -819,6 +796,8 @@ static void blk_release_queue(struct kobject *kobj)
 		elevator_exit(q->elevator);
 	}
 
+	blk_free_queue_stats(q->stats);
+
 	blk_exit_rl(&q->root_rl);
 
 	if (q->queue_tags)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index aafe5b551224..ffa80e11cf14 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -277,7 +277,7 @@ enum {
 	LAT_EXCEEDED,
 };
 
-static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 {
 	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
 	u64 thislat;
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 		 * waited or still has writes in flights, consider us doing
 		 * just writes as well.
 		 */
-		if ((stat[WRITE].nr_samples && blk_stat_is_current(stat)) ||
-		    wb_recent_wait(rwb) || wbt_inflight(rwb))
+		if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
+		    wbt_inflight(rwb))
 			return LAT_UNKNOWN_WRITES;
 		return LAT_UNKNOWN;
 	}
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
 	return LAT_OK;
 }
 
-static int latency_exceeded(struct rq_wb *rwb)
-{
-	struct blk_rq_stat stat[2];
-
-	blk_queue_stat_get(rwb->queue, stat);
-	return __latency_exceeded(rwb, stat);
-}
-
 static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
 {
 	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb)
 
 	rwb->scale_step--;
 	rwb->unknown_cnt = 0;
-	blk_stat_clear(rwb->queue);
 
 	rwb->scaled_max = calc_wb_limits(rwb);
 
@@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
 
 	rwb->scaled_max = false;
 	rwb->unknown_cnt = 0;
-	blk_stat_clear(rwb->queue);
 	calc_wb_limits(rwb);
 	rwb_trace_step(rwb, "step down");
 }
 
 static void rwb_arm_timer(struct rq_wb *rwb)
 {
-	unsigned long expires;
-
 	if (rwb->scale_step > 0) {
 		/*
 		 * We should speed this up, using some variant of a fast
@@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
 		rwb->cur_win_nsec = rwb->win_nsec;
 	}
 
-	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
-	mod_timer(&rwb->window_timer, expires);
+	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
 }
 
-static void wb_timer_fn(unsigned long data)
+static void wb_timer_fn(struct blk_stat_callback *cb)
 {
-	struct rq_wb *rwb = (struct rq_wb *) data;
+	struct rq_wb *rwb = cb->data;
 	unsigned int inflight = wbt_inflight(rwb);
 	int status;
 
-	status = latency_exceeded(rwb);
+	status = latency_exceeded(rwb, cb->stat);
 
 	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
 			inflight);
@@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 
 	__wbt_wait(rwb, bio->bi_opf, lock);
 
-	if (!timer_pending(&rwb->window_timer))
+	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);
 
 	if (current_is_kswapd())
@@ -675,7 +662,7 @@ void wbt_disable_default(struct request_queue *q)
 	struct rq_wb *rwb = q->rq_wb;
 
 	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
-		del_timer_sync(&rwb->window_timer);
+		blk_stat_remove_callback(q, rwb->cb);
 		rwb->win_nsec = rwb->min_lat_nsec = 0;
 		wbt_update_limits(rwb);
 	}
@@ -699,24 +686,23 @@ int wbt_init(struct request_queue *q)
 	struct rq_wb *rwb;
 	int i;
 
-	/*
-	 * For now, we depend on the stats window being larger than
-	 * our monitoring window. Ensure that this isn't inadvertently
-	 * violated.
-	 */
-	BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
 	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
 
 	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
 	if (!rwb)
 		return -ENOMEM;
 
+	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, blk_stat_rq_ddir, 2, rwb);
+	if (!rwb->cb) {
+		kfree(rwb);
+		return -ENOMEM;
+	}
+
 	for (i = 0; i < WBT_NUM_RWQ; i++) {
 		atomic_set(&rwb->rq_wait[i].inflight, 0);
 		init_waitqueue_head(&rwb->rq_wait[i].wait);
 	}
 
-	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
 	rwb->wc = 1;
 	rwb->queue_depth = RWB_DEF_DEPTH;
 	rwb->last_comp = rwb->last_issue = jiffies;
@@ -726,10 +712,10 @@ int wbt_init(struct request_queue *q)
 	wbt_update_limits(rwb);
 
 	/*
-	 * Assign rwb, and turn on stats tracking for this queue
+	 * Assign rwb and add the stats callback.
 	 */
 	q->rq_wb = rwb;
-	blk_stat_enable(q);
+	blk_stat_add_callback(q, rwb->cb);
 
 	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
 
@@ -744,7 +730,8 @@ void wbt_exit(struct request_queue *q)
 	struct rq_wb *rwb = q->rq_wb;
 
 	if (rwb) {
-		del_timer_sync(&rwb->window_timer);
+		blk_stat_remove_callback(q, rwb->cb);
+		blk_stat_free_callback(rwb->cb);
 		q->rq_wb = NULL;
 		kfree(rwb);
 	}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 65f1de519f67..591ff2f4b2ee 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -81,7 +81,7 @@ struct rq_wb {
 	u64 win_nsec;				/* default window size */
 	u64 cur_win_nsec;			/* current window size */
 
-	struct timer_list window_timer;
+	struct blk_stat_callback *cb;
 
 	s64 sync_issue;
 	void *sync_cookie;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e213c5e7500b..270119a501fb 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -294,7 +294,6 @@ struct blk_rq_stat {
 	s32 nr_samples;
 	s32 nr_batch;
 	u64 batch;
-	s64 time;
 };
 
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5a7da607ca04..1a7dc42a8918 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -40,6 +40,8 @@ struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
 struct rq_wb;
+struct blk_queue_stats;
+struct blk_stat_callback;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -388,6 +390,7 @@ struct request_queue {
 	int			nr_rqs[2];	/* # allocated [a]sync rqs */
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
+	struct blk_queue_stats	*stats;
 	struct rq_wb		*rq_wb;
 
 	/*
@@ -505,8 +508,6 @@ struct request_queue {
 	unsigned int		nr_sorted;
 	unsigned int		in_flight[2];
 
-	struct blk_rq_stat	rq_stats[2];
-
 	/*
 	 * Number of active block driver functions for which blk_drain_queue()
 	 * must wait. Must be incremented around functions that unlock the
@@ -516,6 +517,10 @@ struct request_queue {
 
 	unsigned int		rq_timeout;
 	int			poll_nsec;
+
+	struct blk_stat_callback	*poll_cb;
+	struct blk_rq_stat	poll_stat[2];
+
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
 	struct list_head	timeout_list;
@@ -611,6 +616,7 @@ struct request_queue {
 #define QUEUE_FLAG_DAX         26	/* device supports DAX */
 #define QUEUE_FLAG_STATS       27	/* track rq completion times */
 #define QUEUE_FLAG_RESTART     28	/* queue needs restart at completion */
+#define QUEUE_FLAG_POLL_STATS  29	/* collecting stats for hybrid polling */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
-- 
cgit v1.3-6-gb490


From b7eaf1aab9f8bd2e49fceed77ebc66c1b5800718 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 22 Mar 2017 00:08:50 +0100
Subject: cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely

The way the schedutil governor uses the PELT metric causes it to
underestimate the CPU utilization in some cases.

That can be easily demonstrated by running kernel compilation on
a Sandy Bridge Intel processor, running turbostat in parallel with
it and looking at the values written to the MSR_IA32_PERF_CTL
register.  Namely, the expected result would be that when all CPUs
were 100% busy, all of them would be requested to run in the maximum
P-state, but observation shows that this clearly isn't the case.
The CPUs run in the maximum P-state for a while and then are
requested to run slower and go back to the maximum P-state after
a while again.  That causes the actual frequency of the processor to
visibly oscillate below the sustainable maximum in a jittery fashion
which clearly is not desirable.

That has been attributed to CPU utilization metric updates on task
migration that cause the total utilization value for the CPU to be
reduced by the utilization of the migrated task.  If that happens,
the schedutil governor may see a CPU utilization reduction and will
attempt to reduce the CPU frequency accordingly right away.  That
may be premature, though, for example if the system is generally
busy and there are other runnable tasks waiting to be run on that
CPU already.

This is unlikely to be an issue on systems where cpufreq policies are
shared between multiple CPUs, because in those cases the policy
utilization is computed as the maximum of the CPU utilization values
over the whole policy and if that turns out to be low, reducing the
frequency for the policy most likely is a good idea anyway.  On
systems with one CPU per policy, however, it may affect performance
adversely and even lead to increased energy consumption in some cases.

On those systems it may be addressed by taking another utilization
metric into consideration, like whether or not the CPU whose
frequency is about to be reduced has been idle recently, because if
that's not the case, the CPU is likely to be busy in the near future
and its frequency should not be reduced.

To that end, use the counter of idle calls in the timekeeping code.
Namely, make the schedutil governor look at that counter for the
current CPU every time before its frequency is about to be reduced.
If the counter has not changed since the previous iteration of the
governor computations for that CPU, the CPU has been busy for all
that time and its frequency should not be decreased, so if the new
frequency would be lower than the one set previously, the governor
will skip the frequency update.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Joel Fernandes <joelaf@google.com>
---
 include/linux/tick.h             |  1 +
 kernel/sched/cpufreq_schedutil.c | 27 +++++++++++++++++++++++++++
 kernel/time/tick-sched.c         | 12 ++++++++++++
 3 files changed, 40 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index a04fea19676f..fe01e68bf520 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -117,6 +117,7 @@ extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
 extern ktime_t tick_nohz_get_sleep_length(void);
+extern unsigned long tick_nohz_get_idle_calls(void);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 #else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f5ffe241812e..c1ffb5dc8af6 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -61,6 +61,11 @@ struct sugov_cpu {
 	unsigned long util;
 	unsigned long max;
 	unsigned int flags;
+
+	/* The field below is for single-CPU policies only. */
+#ifdef CONFIG_NO_HZ_COMMON
+	unsigned long saved_idle_calls;
+#endif
 };
 
 static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -192,6 +197,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
 	sg_cpu->iowait_boost >>= 1;
 }
 
+#ifdef CONFIG_NO_HZ_COMMON
+static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+{
+	unsigned long idle_calls = tick_nohz_get_idle_calls();
+	bool ret = idle_calls == sg_cpu->saved_idle_calls;
+
+	sg_cpu->saved_idle_calls = idle_calls;
+	return ret;
+}
+#else
+static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+#endif /* CONFIG_NO_HZ_COMMON */
+
 static void sugov_update_single(struct update_util_data *hook, u64 time,
 				unsigned int flags)
 {
@@ -200,6 +218,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 	struct cpufreq_policy *policy = sg_policy->policy;
 	unsigned long util, max;
 	unsigned int next_f;
+	bool busy;
 
 	sugov_set_iowait_boost(sg_cpu, time, flags);
 	sg_cpu->last_update = time;
@@ -207,12 +226,20 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 	if (!sugov_should_update_freq(sg_policy, time))
 		return;
 
+	busy = sugov_cpu_is_busy(sg_cpu);
+
 	if (flags & SCHED_CPUFREQ_RT_DL) {
 		next_f = policy->cpuinfo.max_freq;
 	} else {
 		sugov_get_util(&util, &max);
 		sugov_iowait_boost(sg_cpu, &util, &max);
 		next_f = get_next_freq(sg_policy, util, max);
+		/*
+		 * Do not reduce the frequency if the CPU has not been idle
+		 * recently, as the reduction is likely to be premature then.
+		 */
+		if (busy && next_f < sg_policy->next_freq)
+			next_f = sg_policy->next_freq;
 	}
 	sugov_update_commit(sg_policy, time, next_f);
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7fe53be86077..64c97fc130c4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void)
 	return ts->sleep_length;
 }
 
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+	return ts->idle_calls;
+}
+
 static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
 {
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-- 
cgit v1.3-6-gb490


From b7d680d7bf584bce6023343304b819009a7c3336 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Mar 2017 01:36:54 +0100
Subject: bdi: Mark congested->bdi as internal

congested->bdi pointer is used only to be able to remove congested
structure from bdi->cgwb_congested_tree on structure release. Moreover
the pointer can become NULL when we unregister the bdi. Rename the field
to __bdi and add a comment to make it more explicit this is internal
stuff of memcg writeback code and people should not use the field as
such use will be likely race prone.

We do not bother with converting congested->bdi to a proper refcounted
reference. It will be slightly ugly to special-case bdi->wb.congested to
avoid effectively a cyclic reference of bdi to itself and the reference
gets cleared from bdi_unregister() making it impossible to reference
a freed bdi.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h |  4 +++-
 mm/backing-dev.c                 | 10 +++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ad955817916d..8fb3dcdebc80 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -54,7 +54,9 @@ struct bdi_writeback_congested {
 	atomic_t refcnt;		/* nr of attached wb's and blkg */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
-	struct backing_dev_info *bdi;	/* the associated bdi */
+	struct backing_dev_info *__bdi;	/* the associated bdi, set to NULL
+					 * on bdi unregistration. For memcg-wb
+					 * internal use only! */
 	int blkcg_id;			/* ID of the associated blkcg */
 	struct rb_node rb_node;		/* on bdi->cgwb_congestion_tree */
 #endif
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c6f2a37028c2..12408f86783c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -438,7 +438,7 @@ retry:
 		return NULL;
 
 	atomic_set(&new_congested->refcnt, 0);
-	new_congested->bdi = bdi;
+	new_congested->__bdi = bdi;
 	new_congested->blkcg_id = blkcg_id;
 	goto retry;
 
@@ -466,10 +466,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
 	}
 
 	/* bdi might already have been destroyed leaving @congested unlinked */
-	if (congested->bdi) {
+	if (congested->__bdi) {
 		rb_erase(&congested->rb_node,
-			 &congested->bdi->cgwb_congested_tree);
-		congested->bdi = NULL;
+			 &congested->__bdi->cgwb_congested_tree);
+		congested->__bdi = NULL;
 	}
 
 	spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -752,7 +752,7 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi)
 			rb_entry(rbn, struct bdi_writeback_congested, rb_node);
 
 		rb_erase(rbn, &bdi->cgwb_congested_tree);
-		congested->bdi = NULL;	/* mark @congested unlinked */
+		congested->__bdi = NULL;	/* mark @congested unlinked */
 	}
 	spin_unlock_irq(&cgwb_lock);
 }
-- 
cgit v1.3-6-gb490


From 5318ce7d46866e1dbc20ab9349b93753edba0b3e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Mar 2017 01:36:57 +0100
Subject: bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy()

Currently we waited for all cgwbs to get freed in cgwb_bdi_destroy()
which also means that writeback has been shutdown on them. Since this
wait is going away, directly shutdown writeback on cgwbs from
cgwb_bdi_destroy() to avoid live writeback structures after
bdi_unregister() has finished. To make that safe with concurrent
shutdown from cgwb_release_workfn(), we also have to make sure
wb_shutdown() returns only after the bdi_writeback structure is really
shutdown.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h |  1 +
 mm/backing-dev.c                 | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 8fb3dcdebc80..8af720f22a2d 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -21,6 +21,7 @@ struct dentry;
  */
 enum wb_state {
 	WB_registered,		/* bdi_register() was done */
+	WB_shutting_down,	/* wb_shutdown() in progress */
 	WB_writeback_running,	/* Writeback is in progress */
 	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
 };
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e3d56dba4da8..b67be4fc12c4 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -356,8 +356,15 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	spin_lock_bh(&wb->work_lock);
 	if (!test_and_clear_bit(WB_registered, &wb->state)) {
 		spin_unlock_bh(&wb->work_lock);
+		/*
+		 * Wait for wb shutdown to finish if someone else is just
+		 * running wb_shutdown(). Otherwise we could proceed to wb /
+		 * bdi destruction before wb_shutdown() is finished.
+		 */
+		wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
 		return;
 	}
+	set_bit(WB_shutting_down, &wb->state);
 	spin_unlock_bh(&wb->work_lock);
 
 	cgwb_remove_from_bdi_list(wb);
@@ -369,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb)
 	mod_delayed_work(bdi_wq, &wb->dwork, 0);
 	flush_delayed_work(&wb->dwork);
 	WARN_ON(!list_empty(&wb->work_list));
+	/*
+	 * Make sure bit gets cleared after shutdown is finished. Matches with
+	 * the barrier provided by test_and_clear_bit() above.
+	 */
+	smp_wmb();
+	clear_bit(WB_shutting_down, &wb->state);
 }
 
 static void wb_exit(struct bdi_writeback *wb)
@@ -699,12 +712,21 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
 {
 	struct radix_tree_iter iter;
 	void **slot;
+	struct bdi_writeback *wb;
 
 	WARN_ON(test_bit(WB_registered, &bdi->wb.state));
 
 	spin_lock_irq(&cgwb_lock);
 	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
 		cgwb_kill(*slot);
+
+	while (!list_empty(&bdi->wb_list)) {
+		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+				      bdi_node);
+		spin_unlock_irq(&cgwb_lock);
+		wb_shutdown(wb);
+		spin_lock_irq(&cgwb_lock);
+	}
 	spin_unlock_irq(&cgwb_lock);
 
 	/*
-- 
cgit v1.3-6-gb490


From 4514451e79ae5baabb85d22ba3523602e59d5218 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Mar 2017 01:36:58 +0100
Subject: bdi: Do not wait for cgwbs release in bdi_unregister()

Currently we wait for all cgwbs to get released in cgwb_bdi_destroy()
(called from bdi_unregister()). That is however unnecessary now when
cgwb->bdi is a proper refcounted reference (thus bdi cannot get
released before all cgwbs are released) and when cgwb_bdi_destroy()
shuts down writeback directly.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev-defs.h |  1 -
 mm/backing-dev.c                 | 22 +---------------------
 2 files changed, 1 insertion(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 8af720f22a2d..e66d4722db8e 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -164,7 +164,6 @@ struct backing_dev_info {
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
-	atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
 #else
 	struct bdi_writeback_congested *wb_congested;
 #endif
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b67be4fc12c4..8c30b1a7aae5 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -406,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb)
 /*
  * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
  * blkcg->cgwb_list, and memcg->cgwb_list.  bdi->cgwb_tree is also RCU
- * protected.  cgwb_release_wait is used to wait for the completion of cgwb
- * releases from bdi destruction path.
+ * protected.
  */
 static DEFINE_SPINLOCK(cgwb_lock);
-static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
 
 /**
  * wb_congested_get_create - get or create a wb_congested
@@ -505,7 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work)
 {
 	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
 						release_work);
-	struct backing_dev_info *bdi = wb->bdi;
 
 	wb_shutdown(wb);
 
@@ -516,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work)
 	percpu_ref_exit(&wb->refcnt);
 	wb_exit(wb);
 	kfree_rcu(wb, rcu);
-
-	if (atomic_dec_and_test(&bdi->usage_cnt))
-		wake_up_all(&cgwb_release_wait);
 }
 
 static void cgwb_release(struct percpu_ref *refcnt)
@@ -608,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
 		/* we might have raced another instance of this function */
 		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
 		if (!ret) {
-			atomic_inc(&bdi->usage_cnt);
 			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
 			list_add(&wb->memcg_node, memcg_cgwb_list);
 			list_add(&wb->blkcg_node, blkcg_cgwb_list);
@@ -698,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
-	atomic_set(&bdi->usage_cnt, 1);
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {
@@ -728,18 +720,6 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
 		spin_lock_irq(&cgwb_lock);
 	}
 	spin_unlock_irq(&cgwb_lock);
-
-	/*
-	 * All cgwb's must be shutdown and released before returning.  Drain
-	 * the usage counter to wait for all cgwb's ever created on @bdi.
-	 */
-	atomic_dec(&bdi->usage_cnt);
-	wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
-	/*
-	 * Grab back our reference so that we hold it when @bdi gets
-	 * re-registered.
-	 */
-	atomic_inc(&bdi->usage_cnt);
 }
 
 /**
-- 
cgit v1.3-6-gb490


From f759741d9d913eb57784a94b9bca78b376fc26a9 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Mar 2017 01:37:00 +0100
Subject: block: Fix oops in locked_inode_to_wb_and_lock_list()

When block device is closed, we call inode_detach_wb() in __blkdev_put()
which sets inode->i_wb to NULL. That is contrary to expectations that
inode->i_wb stays valid once set during the whole inode's lifetime and
leads to oops in wb_get() in locked_inode_to_wb_and_lock_list() because
inode_to_wb() returned NULL.

The reason why we called inode_detach_wb() is not valid anymore though.
BDI is guaranteed to stay along until we call bdi_put() from
bdev_evict_inode() so we can postpone calling inode_detach_wb() to that
moment.

Also add a warning to catch if someone uses inode_detach_wb() in a
dangerous way.

Reported-by: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/block_dev.c            | 8 ++------
 include/linux/writeback.h | 1 +
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 53e2389ae4d4..f2d59f143ef4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -885,6 +885,8 @@ static void bdev_evict_inode(struct inode *inode)
 	spin_lock(&bdev_lock);
 	list_del_init(&bdev->bd_list);
 	spin_unlock(&bdev_lock);
+	/* Detach inode from wb early as bdi_put() may free bdi->wb */
+	inode_detach_wb(inode);
 	if (bdev->bd_bdi != &noop_backing_dev_info) {
 		bdi_put(bdev->bd_bdi);
 		bdev->bd_bdi = &noop_backing_dev_info;
@@ -1875,12 +1877,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 		kill_bdev(bdev);
 
 		bdev_write_inode(bdev);
-		/*
-		 * Detaching bdev inode from its wb in __destroy_inode()
-		 * is too late: the queue which embeds its bdi (along with
-		 * root wb) can be gone as soon as we put_disk() below.
-		 */
-		inode_detach_wb(bdev->bd_inode);
 	}
 	if (bdev->bd_contains == bdev) {
 		if (disk->fops->release)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a3c0cbd7c888..d5815794416c 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page)
 static inline void inode_detach_wb(struct inode *inode)
 {
 	if (inode->i_wb) {
+		WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
 		wb_put(inode->i_wb);
 		inode->i_wb = NULL;
 	}
-- 
cgit v1.3-6-gb490


From c70c176ff8c3ff0ac6ef9a831cd591ea9a66bd1a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Mar 2017 01:37:01 +0100
Subject: kobject: Export kobject_get_unless_zero()

Make the function available for outside use and fortify it against NULL
kobject.

CC: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/kobject.h | 2 ++
 lib/kobject.c           | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index e6284591599e..ca85cb80e99a 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name);
 extern int __must_check kobject_move(struct kobject *, struct kobject *);
 
 extern struct kobject *kobject_get(struct kobject *kobj);
+extern struct kobject * __must_check kobject_get_unless_zero(
+						struct kobject *kobj);
 extern void kobject_put(struct kobject *kobj);
 
 extern const void *kobject_namespace(struct kobject *kobj);
diff --git a/lib/kobject.c b/lib/kobject.c
index 445dcaeb0f56..763d70a18941 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj)
 }
 EXPORT_SYMBOL(kobject_get);
 
-static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
+struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
 {
+	if (!kobj)
+		return NULL;
 	if (!kref_get_unless_zero(&kobj->kref))
 		kobj = NULL;
 	return kobj;
 }
+EXPORT_SYMBOL(kobject_get_unless_zero);
 
 /*
  * kobject_cleanup - free kobject resources.
-- 
cgit v1.3-6-gb490


From 7642747d674aff1f7cfe74ad9af7e9b12ab1d5ee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 22 Mar 2017 15:01:49 -0400
Subject: blk-mq: remove BLK_MQ_F_DEFER_ISSUE

This flag was never used since it was introduced.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 8 +-------
 include/linux/blk-mq.h | 1 -
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5ff66f203cd0..998c041358f2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1531,13 +1531,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	plug = current->plug;
-	/*
-	 * If the driver supports defer issued based on 'last', then
-	 * queue it up like normal since we can potentially save some
-	 * CPU this way.
-	 */
-	if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
-	    !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
+	if (((plug && !blk_queue_nomerges(q)) || is_sync)) {
 		struct request *old_rq = NULL;
 
 		blk_mq_bio_to_request(rq, bio);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b296a9006117..5b3e201c8d4f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -152,7 +152,6 @@ enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_TAG_SHARED	= 1 << 1,
 	BLK_MQ_F_SG_MERGE	= 1 << 2,
-	BLK_MQ_F_DEFER_ISSUE	= 1 << 4,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
-- 
cgit v1.3-6-gb490


From a9ebf306f52c756c4f9e50ee9a60cd6389d71344 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 1 Feb 2017 16:39:38 +0100
Subject: locking/atomic: Introduce atomic_try_cmpxchg()

Add a new cmpxchg interface:

  bool try_cmpxchg(u{8,16,32,64} *ptr, u{8,16,32,64} *val, u{8,16,32,64} new);

Where the boolean returns the result of the compare; and thus if the
exchange happened; and in case of failure, the new value of *ptr is
returned in *val.

This allows simplification/improvement of loops like:

	for (;;) {
		new = val $op $imm;
		old = cmpxchg(ptr, val, new);
		if (old == val)
			break;
		val = old;
	}

into:

	do {
	} while (!try_cmpxchg(ptr, &val, val $op $imm));

while also generating better code (GCC6 and onwards).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/atomic.h      |  6 ++++
 arch/x86/include/asm/atomic64_64.h |  6 ++++
 arch/x86/include/asm/cmpxchg.h     | 69 ++++++++++++++++++++++++++++++++++++++
 include/linux/atomic.h             | 42 +++++++++++++++++++++++
 4 files changed, 123 insertions(+)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 14635c5ea025..8410377c6869 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -186,6 +186,12 @@ static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 	return cmpxchg(&v->counter, old, new);
 }
 
+#define atomic_try_cmpxchg atomic_try_cmpxchg
+static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new)
+{
+	return try_cmpxchg(&v->counter, old, new);
+}
+
 static inline int atomic_xchg(atomic_t *v, int new)
 {
 	return xchg(&v->counter, new);
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 89ed2f6ae2f7..12fb57413732 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -176,6 +176,12 @@ static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
 	return cmpxchg(&v->counter, old, new);
 }
 
+#define atomic64_try_cmpxchg atomic64_try_cmpxchg
+static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, long *old, long new)
+{
+	return try_cmpxchg(&v->counter, old, new);
+}
+
 static inline long atomic64_xchg(atomic64_t *v, long new)
 {
 	return xchg(&v->counter, new);
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 97848cdfcb1a..fb961db51a2a 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -153,6 +153,75 @@ extern void __add_wrong_size(void)
 #define cmpxchg_local(ptr, old, new)					\
 	__cmpxchg_local(ptr, old, new, sizeof(*(ptr)))
 
+
+#define __raw_try_cmpxchg(_ptr, _pold, _new, size, lock)		\
+({									\
+	bool success;							\
+	__typeof__(_ptr) _old = (_pold);				\
+	__typeof__(*(_ptr)) __old = *_old;				\
+	__typeof__(*(_ptr)) __new = (_new);				\
+	switch (size) {							\
+	case __X86_CASE_B:						\
+	{								\
+		volatile u8 *__ptr = (volatile u8 *)(_ptr);		\
+		asm volatile(lock "cmpxchgb %[new], %[ptr]"		\
+			     CC_SET(z)					\
+			     : CC_OUT(z) (success),			\
+			       [ptr] "+m" (*__ptr),			\
+			       [old] "+a" (__old)			\
+			     : [new] "q" (__new)			\
+			     : "memory");				\
+		break;							\
+	}								\
+	case __X86_CASE_W:						\
+	{								\
+		volatile u16 *__ptr = (volatile u16 *)(_ptr);		\
+		asm volatile(lock "cmpxchgw %[new], %[ptr]"		\
+			     CC_SET(z)					\
+			     : CC_OUT(z) (success),			\
+			       [ptr] "+m" (*__ptr),			\
+			       [old] "+a" (__old)			\
+			     : [new] "r" (__new)			\
+			     : "memory");				\
+		break;							\
+	}								\
+	case __X86_CASE_L:						\
+	{								\
+		volatile u32 *__ptr = (volatile u32 *)(_ptr);		\
+		asm volatile(lock "cmpxchgl %[new], %[ptr]"		\
+			     CC_SET(z)					\
+			     : CC_OUT(z) (success),			\
+			       [ptr] "+m" (*__ptr),			\
+			       [old] "+a" (__old)			\
+			     : [new] "r" (__new)			\
+			     : "memory");				\
+		break;							\
+	}								\
+	case __X86_CASE_Q:						\
+	{								\
+		volatile u64 *__ptr = (volatile u64 *)(_ptr);		\
+		asm volatile(lock "cmpxchgq %[new], %[ptr]"		\
+			     CC_SET(z)					\
+			     : CC_OUT(z) (success),			\
+			       [ptr] "+m" (*__ptr),			\
+			       [old] "+a" (__old)			\
+			     : [new] "r" (__new)			\
+			     : "memory");				\
+		break;							\
+	}								\
+	default:							\
+		__cmpxchg_wrong_size();					\
+	}								\
+	*_old = __old;							\
+	success;							\
+})
+
+#define __try_cmpxchg(ptr, pold, new, size)				\
+	__raw_try_cmpxchg((ptr), (pold), (new), (size), LOCK_PREFIX)
+
+#define try_cmpxchg(ptr, pold, new)					\
+	__try_cmpxchg((ptr), (pold), (new), sizeof(*(ptr)))
+
 /*
  * xadd() adds "inc" to "*ptr" and atomically returns the previous
  * value of "*ptr".
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index e71835bf60a9..aae5953817d6 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -423,6 +423,27 @@
 #endif
 #endif /* atomic_cmpxchg_relaxed */
 
+#ifndef atomic_try_cmpxchg
+
+#define __atomic_try_cmpxchg(type, _p, _po, _n)				\
+({									\
+	typeof(_po) __po = (_po);					\
+	typeof(*(_po)) __o = *__po;					\
+	*__po = atomic_cmpxchg##type((_p), __o, (_n));			\
+	(*__po == __o);							\
+})
+
+#define atomic_try_cmpxchg(_p, _po, _n)		__atomic_try_cmpxchg(, _p, _po, _n)
+#define atomic_try_cmpxchg_relaxed(_p, _po, _n)	__atomic_try_cmpxchg(_relaxed, _p, _po, _n)
+#define atomic_try_cmpxchg_acquire(_p, _po, _n)	__atomic_try_cmpxchg(_acquire, _p, _po, _n)
+#define atomic_try_cmpxchg_release(_p, _po, _n)	__atomic_try_cmpxchg(_release, _p, _po, _n)
+
+#else /* atomic_try_cmpxchg */
+#define atomic_try_cmpxchg_relaxed	atomic_try_cmpxchg
+#define atomic_try_cmpxchg_acquire	atomic_try_cmpxchg
+#define atomic_try_cmpxchg_release	atomic_try_cmpxchg
+#endif /* atomic_try_cmpxchg */
+
 /* cmpxchg_relaxed */
 #ifndef cmpxchg_relaxed
 #define  cmpxchg_relaxed		cmpxchg
@@ -996,6 +1017,27 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #endif
 #endif /* atomic64_cmpxchg_relaxed */
 
+#ifndef atomic64_try_cmpxchg
+
+#define __atomic64_try_cmpxchg(type, _p, _po, _n)			\
+({									\
+	typeof(_po) __po = (_po);					\
+	typeof(*(_po)) __o = *__po;					\
+	*__po = atomic64_cmpxchg##type((_p), __o, (_n));		\
+	(*__po == __o);							\
+})
+
+#define atomic64_try_cmpxchg(_p, _po, _n)		__atomic64_try_cmpxchg(, _p, _po, _n)
+#define atomic64_try_cmpxchg_relaxed(_p, _po, _n)	__atomic64_try_cmpxchg(_relaxed, _p, _po, _n)
+#define atomic64_try_cmpxchg_acquire(_p, _po, _n)	__atomic64_try_cmpxchg(_acquire, _p, _po, _n)
+#define atomic64_try_cmpxchg_release(_p, _po, _n)	__atomic64_try_cmpxchg(_release, _p, _po, _n)
+
+#else /* atomic64_try_cmpxchg */
+#define atomic64_try_cmpxchg_relaxed	atomic64_try_cmpxchg
+#define atomic64_try_cmpxchg_acquire	atomic64_try_cmpxchg
+#define atomic64_try_cmpxchg_release	atomic64_try_cmpxchg
+#endif /* atomic64_try_cmpxchg */
+
 #ifndef atomic64_andnot
 static inline void atomic64_andnot(long long i, atomic64_t *v)
 {
-- 
cgit v1.3-6-gb490


From 7a88fa191944589b2ed795bbed32ca6e9e2df31f Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 23 Mar 2017 13:24:55 +0300
Subject: block: make nr_iovecs unsigned in bio_alloc_bioset()

There isn't a bug here, but Smatch is not smart enough to know that
"nr_iovecs" can't be negative so it complains about underflows.
Really, it's slightly cleaner to make this parameter unsigned.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c         | 3 ++-
 include/linux/bio.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index e75878f8b14a..6194a8cf2aab 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -427,7 +427,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
  *   RETURNS:
  *   Pointer to new bio on success, NULL on failure.
  */
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
+			     struct bio_set *bs)
 {
 	gfp_t saved_gfp = gfp_mask;
 	unsigned front_pad;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc..4931756d86d9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
 extern mempool_t *biovec_create_pool(int pool_entries);
 
-extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
 extern void bio_put(struct bio *);
 
 extern void __bio_clone_fast(struct bio *, struct bio *);
-- 
cgit v1.3-6-gb490


From 0695bd99c0d22bef4d9d4c72cf537b722ba98531 Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Mon, 6 Feb 2017 22:12:04 +0100
Subject: clockevents: Make clockevents_config() static

A clockevent device's rate should be configured before or at registration
and changed afterwards through clockevents_update_freq() only.

For the configuration at registration, we already have
clockevents_config_and_register().

Right now, there are no clockevents_config() users outside of the
clockevents core.

To mitigiate the risk of drivers errorneously reconfiguring their rates
through clockevents_config() *after* device registration, make
clockevents_config() static.

Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/clockchips.h | 1 -
 kernel/time/clockevents.c  | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 5d3053c34fb3..eef1569e5cd0 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -182,7 +182,6 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *e
 extern void clockevents_register_device(struct clock_event_device *dev);
 extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu);
 
-extern void clockevents_config(struct clock_event_device *dev, u32 freq);
 extern void clockevents_config_and_register(struct clock_event_device *dev,
 					    u32 freq, unsigned long min_delta,
 					    unsigned long max_delta);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 97ac0951f164..4237e0744e26 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 }
 EXPORT_SYMBOL_GPL(clockevents_register_device);
 
-void clockevents_config(struct clock_event_device *dev, u32 freq)
+static void clockevents_config(struct clock_event_device *dev, u32 freq)
 {
 	u64 sec;
 
-- 
cgit v1.3-6-gb490


From 8f48cfabac57977338f5c828ed3e12fc34373c7d Mon Sep 17 00:00:00 2001
From: Geliang Tang <geliangtang@gmail.com>
Date: Fri, 24 Mar 2017 22:13:35 +0800
Subject: cgroup: drop duplicate header nsproxy.h

Drop duplicate header nsproxy.h from linux/cgroup.h.

Signed-off-by: Geliang Tang <geliangtang@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 44129793c7b8..34b4a298e52e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -17,7 +17,6 @@
 #include <linux/seq_file.h>
 #include <linux/kernfs.h>
 #include <linux/jump_label.h>
-#include <linux/nsproxy.h>
 #include <linux/types.h>
 #include <linux/ns_common.h>
 #include <linux/nsproxy.h>
-- 
cgit v1.3-6-gb490


From 869ab90f0ae0002ce6e9d3a5c75156ae8de48ffc Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Fri, 24 Mar 2017 18:03:48 -0700
Subject: block: constify struct blk_integrity_profile

blk_integrity_profile's are never modified, so mark them 'const' so that
they are placed in .rodata and benefit from memory protection.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-integrity.c  |  2 +-
 block/t10-pi.c         |  8 ++++----
 include/linux/genhd.h  | 10 +++++-----
 include/linux/t10-pi.h |  8 ++++----
 4 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 9f0ff5ba4f84..b3622cb00fc2 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
 	return 0;
 }
 
-static struct blk_integrity_profile nop_profile = {
+static const struct blk_integrity_profile nop_profile = {
 	.name = "nop",
 	.generate_fn = blk_integrity_nop_fn,
 	.verify_fn = blk_integrity_nop_fn,
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 2c97912335a9..680c6d636298 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
 	return t10_pi_verify(iter, t10_pi_ip_fn, 3);
 }
 
-struct blk_integrity_profile t10_pi_type1_crc = {
+const struct blk_integrity_profile t10_pi_type1_crc = {
 	.name			= "T10-DIF-TYPE1-CRC",
 	.generate_fn		= t10_pi_type1_generate_crc,
 	.verify_fn		= t10_pi_type1_verify_crc,
 };
 EXPORT_SYMBOL(t10_pi_type1_crc);
 
-struct blk_integrity_profile t10_pi_type1_ip = {
+const struct blk_integrity_profile t10_pi_type1_ip = {
 	.name			= "T10-DIF-TYPE1-IP",
 	.generate_fn		= t10_pi_type1_generate_ip,
 	.verify_fn		= t10_pi_type1_verify_ip,
 };
 EXPORT_SYMBOL(t10_pi_type1_ip);
 
-struct blk_integrity_profile t10_pi_type3_crc = {
+const struct blk_integrity_profile t10_pi_type3_crc = {
 	.name			= "T10-DIF-TYPE3-CRC",
 	.generate_fn		= t10_pi_type3_generate_crc,
 	.verify_fn		= t10_pi_type3_verify_crc,
 };
 EXPORT_SYMBOL(t10_pi_type3_crc);
 
-struct blk_integrity_profile t10_pi_type3_ip = {
+const struct blk_integrity_profile t10_pi_type3_ip = {
 	.name			= "T10-DIF-TYPE3-IP",
 	.generate_fn		= t10_pi_type3_generate_ip,
 	.verify_fn		= t10_pi_type3_verify_ip,
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 76f39754e7b0..9e11082c7f9b 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -159,11 +159,11 @@ struct badblocks;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
 struct blk_integrity {
-	struct blk_integrity_profile	*profile;
-	unsigned char			flags;
-	unsigned char			tuple_size;
-	unsigned char			interval_exp;
-	unsigned char			tag_size;
+	const struct blk_integrity_profile	*profile;
+	unsigned char				flags;
+	unsigned char				tuple_size;
+	unsigned char				interval_exp;
+	unsigned char				tag_size;
 };
 
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 9fba9dd33544..9375d23a24e7 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -34,9 +34,9 @@ struct t10_pi_tuple {
 };
 
 
-extern struct blk_integrity_profile t10_pi_type1_crc;
-extern struct blk_integrity_profile t10_pi_type1_ip;
-extern struct blk_integrity_profile t10_pi_type3_crc;
-extern struct blk_integrity_profile t10_pi_type3_ip;
+extern const struct blk_integrity_profile t10_pi_type1_crc;
+extern const struct blk_integrity_profile t10_pi_type1_ip;
+extern const struct blk_integrity_profile t10_pi_type3_crc;
+extern const struct blk_integrity_profile t10_pi_type3_ip;
 
 #endif
-- 
cgit v1.3-6-gb490


From 8ce371f9846ef1e8b3cc8f6865766cb5c1f17e40 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 20 Mar 2017 12:26:55 +0100
Subject: lockdep: Fix per-cpu static objects

Since commit 383776fa7527 ("locking/lockdep: Handle statically initialized
PER_CPU locks properly") we try to collapse per-cpu locks into a single
class by giving them all the same key. For this key we choose the canonical
address of the per-cpu object, which would be the offset into the per-cpu
area.

This has two problems:

 - there is a case where we run !0 lock->key through static_obj() and
   expect this to pass; it doesn't for canonical pointers.

 - 0 is a valid canonical address.

Cure both issues by redefining the canonical address as the address of the
per-cpu variable on the boot CPU.

Since I didn't want to rely on CPU0 being the boot-cpu, or even existing at
all, track the boot CPU in a variable.

Fixes: 383776fa7527 ("locking/lockdep: Handle statically initialized PER_CPU locks properly")
Reported-by: kernel test robot <fengguang.wu@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Borislav Petkov <bp@suse.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: linux-mm@kvack.org
Cc: wfg@linux.intel.com
Cc: kernel test robot <fengguang.wu@intel.com>
Cc: LKP <lkp@01.org>
Link: http://lkml.kernel.org/r/20170320114108.kbvcsuepem45j5cr@hirez.programming.kicks-ass.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/smp.h | 12 ++++++++++++
 kernel/cpu.c        |  6 ++++++
 kernel/module.c     |  6 +++++-
 mm/percpu.c         |  5 ++++-
 4 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 8e0cb7a0f836..68123c1fe549 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
 extern void __init setup_nr_cpu_ids(void);
 extern void __init smp_init(void);
 
+extern int __boot_cpu_id;
+
+static inline int get_boot_cpu_id(void)
+{
+	return __boot_cpu_id;
+}
+
 #else /* !SMP */
 
 static inline void smp_send_stop(void) { }
@@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
 static inline void smp_init(void) { }
 #endif
 
+static inline int get_boot_cpu_id(void)
+{
+	return 0;
+}
+
 #endif /* !SMP */
 
 /*
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f7c063239fa5..a795725774c5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
 
 #endif /* CONFIG_PM_SLEEP_SMP */
 
+int __boot_cpu_id;
+
 #endif /* CONFIG_SMP */
 
 /* Boot processor state steps */
@@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void)
 	set_cpu_active(cpu, true);
 	set_cpu_present(cpu, true);
 	set_cpu_possible(cpu, true);
+
+#ifdef CONFIG_SMP
+	__boot_cpu_id = cpu;
+#endif
 }
 
 /*
diff --git a/kernel/module.c b/kernel/module.c
index 5ef618133849..6d9988031c5b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -682,8 +682,12 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
 			void *va = (void *)addr;
 
 			if (va >= start && va < start + mod->percpu_size) {
-				if (can_addr)
+				if (can_addr) {
 					*can_addr = (unsigned long) (va - start);
+					*can_addr += (unsigned long)
+						per_cpu_ptr(mod->percpu,
+							    get_boot_cpu_id());
+				}
 				preempt_enable();
 				return true;
 			}
diff --git a/mm/percpu.c b/mm/percpu.c
index 7d3b728c0254..bd7416752819 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1293,8 +1293,11 @@ bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
 		void *va = (void *)addr;
 
 		if (va >= start && va < start + static_size) {
-			if (can_addr)
+			if (can_addr) {
 				*can_addr = (unsigned long) (va - start);
+				*can_addr += (unsigned long)
+					per_cpu_ptr(base, get_boot_cpu_id());
+			}
 			return true;
 		}
 	}
-- 
cgit v1.3-6-gb490


From 591a3d7c09fa08baff48ad86c2347dbd28a52753 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 24 Mar 2017 14:13:05 +0300
Subject: mm: Fix false-positive VM_BUG_ON() in
 page_cache_{get,add}_speculative()

0day testing by Fengguang Wu triggered this crash while running Trinity:

  kernel BUG at include/linux/pagemap.h:151!
  ...
  CPU: 0 PID: 458 Comm: trinity-c0 Not tainted 4.11.0-rc2-00251-g2947ba0 #1
  ...
  Call Trace:
   __get_user_pages_fast()
   get_user_pages_fast()
   get_futex_key()
   futex_requeue()
   do_futex()
   SyS_futex()
   do_syscall_64()
   entry_SYSCALL64_slow_path()

It' VM_BUG_ON() due to false-negative in_atomic(). We call
page_cache_get_speculative() with disabled local interrupts.
It should be atomic enough.

So let's check for disabled interrupts in the VM_BUG_ON() condition
too, to resolve this.

( This got triggered by the conversion of the x86 GUP code to the
  generic GUP code. )

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: LKP <lkp@01.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/20170324114709.pcytvyb3d6ajux33@black.fi.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/pagemap.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 84943e8057ef..316a19f6b635 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page)
 
 #ifdef CONFIG_TINY_RCU
 # ifdef CONFIG_PREEMPT_COUNT
-	VM_BUG_ON(!in_atomic());
+	VM_BUG_ON(!in_atomic() && !irqs_disabled());
 # endif
 	/*
 	 * Preempt must be disabled here - we rely on rcu_read_lock doing
@@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
 
 #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
 # ifdef CONFIG_PREEMPT_COUNT
-	VM_BUG_ON(!in_atomic());
+	VM_BUG_ON(!in_atomic() && !irqs_disabled());
 # endif
 	VM_BUG_ON_PAGE(page_count(page) == 0, page);
 	page_ref_add(page, count);
-- 
cgit v1.3-6-gb490


From 011d8261117249eab97bc86a8e1ac7731e03e319 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 27 Mar 2017 11:33:02 +0200
Subject: RAS: Add a Corrected Errors Collector

Introduce a simple data structure for collecting correctable errors
along with accessors. More detailed description in the code itself.

The error decoding is done with the decoding chain now and
mce_first_notifier() gets to see the error first and the CEC decides
whether to log it and then the rest of the chain doesn't hear about it -
basically the main reason for the CE collector - or to continue running
the notifiers.

When the CEC hits the action threshold, it will try to soft-offine the
page containing the ECC and then the whole decoding chain gets to see
the error.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170327093304.10683-5-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |   6 +
 arch/x86/include/asm/mce.h                      |   9 +-
 arch/x86/kernel/cpu/mcheck/mce.c                | 191 +++++----
 arch/x86/ras/Kconfig                            |  14 +
 drivers/ras/Makefile                            |   3 +-
 drivers/ras/cec.c                               | 532 ++++++++++++++++++++++++
 drivers/ras/debugfs.c                           |   2 +-
 drivers/ras/debugfs.h                           |   8 +
 drivers/ras/ras.c                               |  11 +
 include/linux/ras.h                             |  13 +-
 10 files changed, 706 insertions(+), 83 deletions(-)
 create mode 100644 drivers/ras/cec.c
 create mode 100644 drivers/ras/debugfs.h

(limited to 'include/linux')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2ba45caabada..927b8b8854cd 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3172,6 +3172,12 @@
 	ramdisk_size=	[RAM] Sizes of RAM disks in kilobytes
 			See Documentation/blockdev/ramdisk.txt.
 
+	ras=option[,option,...]	[KNL] RAS-specific options
+
+		cec_disable	[X86]
+				Disable the Correctable Errors Collector,
+				see CONFIG_RAS_CEC help text.
+
 	rcu_nocbs=	[KNL]
 			The argument is a cpu list, as described above.
 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 0512dcc11750..c5ae545d27d8 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -191,10 +191,11 @@ extern struct mca_config mca_cfg;
 extern struct mca_msr_regs msr_ops;
 
 enum mce_notifier_prios {
-	MCE_PRIO_SRAO		= INT_MAX,
-	MCE_PRIO_EXTLOG		= INT_MAX - 1,
-	MCE_PRIO_NFIT		= INT_MAX - 2,
-	MCE_PRIO_EDAC		= INT_MAX - 3,
+	MCE_PRIO_FIRST		= INT_MAX,
+	MCE_PRIO_SRAO		= INT_MAX - 1,
+	MCE_PRIO_EXTLOG		= INT_MAX - 2,
+	MCE_PRIO_NFIT		= INT_MAX - 3,
+	MCE_PRIO_EDAC		= INT_MAX - 4,
 	MCE_PRIO_LOWEST		= 0,
 };
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8ada093d4e40..4a907758a516 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -35,6 +35,7 @@
 #include <linux/poll.h>
 #include <linux/nmi.h>
 #include <linux/cpu.h>
+#include <linux/ras.h>
 #include <linux/smp.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
@@ -160,47 +161,8 @@ static struct mce_log_buffer mcelog_buf = {
 
 void mce_log(struct mce *m)
 {
-	unsigned next, entry;
-
-	/* Emit the trace record: */
-	trace_mce_record(m);
-
 	if (!mce_gen_pool_add(m))
 		irq_work_queue(&mce_irq_work);
-
-	wmb();
-	for (;;) {
-		entry = mce_log_get_idx_check(mcelog_buf.next);
-		for (;;) {
-
-			/*
-			 * When the buffer fills up discard new entries.
-			 * Assume that the earlier errors are the more
-			 * interesting ones:
-			 */
-			if (entry >= MCE_LOG_LEN) {
-				set_bit(MCE_OVERFLOW,
-					(unsigned long *)&mcelog_buf.flags);
-				return;
-			}
-			/* Old left over entry. Skip: */
-			if (mcelog_buf.entry[entry].finished) {
-				entry++;
-				continue;
-			}
-			break;
-		}
-		smp_rmb();
-		next = entry + 1;
-		if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
-			break;
-	}
-	memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
-	wmb();
-	mcelog_buf.entry[entry].finished = 1;
-	wmb();
-
-	set_bit(0, &mce_need_notify);
 }
 
 void mce_inject_log(struct mce *m)
@@ -213,6 +175,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
 
 static struct notifier_block mce_srao_nb;
 
+/*
+ * We run the default notifier if we have only the SRAO, the first and the
+ * default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
+ * notifiers registered on the chain.
+ */
+#define NUM_DEFAULT_NOTIFIERS	3
 static atomic_t num_notifiers;
 
 void mce_register_decode_chain(struct notifier_block *nb)
@@ -522,7 +490,6 @@ static void mce_schedule_work(void)
 
 static void mce_irq_work_cb(struct irq_work *entry)
 {
-	mce_notify_irq();
 	mce_schedule_work();
 }
 
@@ -565,6 +532,111 @@ static int mce_usable_address(struct mce *m)
 	return 1;
 }
 
+static bool memory_error(struct mce *m)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (c->x86_vendor == X86_VENDOR_AMD) {
+		/* ErrCodeExt[20:16] */
+		u8 xec = (m->status >> 16) & 0x1f;
+
+		return (xec == 0x0 || xec == 0x8);
+	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
+		/*
+		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
+		 *
+		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
+		 * indicating a memory error. Bit 8 is used for indicating a
+		 * cache hierarchy error. The combination of bit 2 and bit 3
+		 * is used for indicating a `generic' cache hierarchy error
+		 * But we can't just blindly check the above bits, because if
+		 * bit 11 is set, then it is a bus/interconnect error - and
+		 * either way the above bits just gives more detail on what
+		 * bus/interconnect error happened. Note that bit 12 can be
+		 * ignored, as it's the "filter" bit.
+		 */
+		return (m->status & 0xef80) == BIT(7) ||
+		       (m->status & 0xef00) == BIT(8) ||
+		       (m->status & 0xeffc) == 0xc;
+	}
+
+	return false;
+}
+
+static bool cec_add_mce(struct mce *m)
+{
+	if (!m)
+		return false;
+
+	/* We eat only correctable DRAM errors with usable addresses. */
+	if (memory_error(m) &&
+	    !(m->status & MCI_STATUS_UC) &&
+	    mce_usable_address(m))
+		if (!cec_add_elem(m->addr >> PAGE_SHIFT))
+			return true;
+
+	return false;
+}
+
+static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
+			      void *data)
+{
+	struct mce *m = (struct mce *)data;
+	unsigned int next, entry;
+
+	if (!m)
+		return NOTIFY_DONE;
+
+	if (cec_add_mce(m))
+		return NOTIFY_STOP;
+
+	/* Emit the trace record: */
+	trace_mce_record(m);
+
+	wmb();
+	for (;;) {
+		entry = mce_log_get_idx_check(mcelog_buf.next);
+		for (;;) {
+
+			/*
+			 * When the buffer fills up discard new entries.
+			 * Assume that the earlier errors are the more
+			 * interesting ones:
+			 */
+			if (entry >= MCE_LOG_LEN) {
+				set_bit(MCE_OVERFLOW,
+					(unsigned long *)&mcelog_buf.flags);
+				return NOTIFY_DONE;
+			}
+			/* Old left over entry. Skip: */
+			if (mcelog_buf.entry[entry].finished) {
+				entry++;
+				continue;
+			}
+			break;
+		}
+		smp_rmb();
+		next = entry + 1;
+		if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
+			break;
+	}
+	memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
+	wmb();
+	mcelog_buf.entry[entry].finished = 1;
+	wmb();
+
+	set_bit(0, &mce_need_notify);
+
+	mce_notify_irq();
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block first_nb = {
+	.notifier_call	= mce_first_notifier,
+	.priority	= MCE_PRIO_FIRST,
+};
+
 static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
 				void *data)
 {
@@ -594,11 +666,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
 	if (!m)
 		return NOTIFY_DONE;
 
-	/*
-	 * Run the default notifier if we have only the SRAO
-	 * notifier and us registered.
-	 */
-	if (atomic_read(&num_notifiers) > 2)
+	if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
 		return NOTIFY_DONE;
 
 	/* Don't print when mcelog is running */
@@ -655,37 +723,6 @@ static void mce_read_aux(struct mce *m, int i)
 	}
 }
 
-static bool memory_error(struct mce *m)
-{
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	if (c->x86_vendor == X86_VENDOR_AMD) {
-		/* ErrCodeExt[20:16] */
-		u8 xec = (m->status >> 16) & 0x1f;
-
-		return (xec == 0x0 || xec == 0x8);
-	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
-		/*
-		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
-		 *
-		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
-		 * indicating a memory error. Bit 8 is used for indicating a
-		 * cache hierarchy error. The combination of bit 2 and bit 3
-		 * is used for indicating a `generic' cache hierarchy error
-		 * But we can't just blindly check the above bits, because if
-		 * bit 11 is set, then it is a bus/interconnect error - and
-		 * either way the above bits just gives more detail on what
-		 * bus/interconnect error happened. Note that bit 12 can be
-		 * ignored, as it's the "filter" bit.
-		 */
-		return (m->status & 0xef80) == BIT(7) ||
-		       (m->status & 0xef00) == BIT(8) ||
-		       (m->status & 0xeffc) == 0xc;
-	}
-
-	return false;
-}
-
 DEFINE_PER_CPU(unsigned, mce_poll_count);
 
 /*
@@ -2167,6 +2204,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
+	mce_register_decode_chain(&first_nb);
 	mce_register_decode_chain(&mce_srao_nb);
 	mce_register_decode_chain(&mce_default_nb);
 	mcheck_vendor_init_severity();
@@ -2716,6 +2754,7 @@ static int __init mcheck_late_init(void)
 		static_branch_inc(&mcsafe_key);
 
 	mcheck_debugfs_init();
+	cec_init();
 
 	/*
 	 * Flush out everything that has been logged during early boot, now that
diff --git a/arch/x86/ras/Kconfig b/arch/x86/ras/Kconfig
index 0bc60a308730..2a2d89d39af6 100644
--- a/arch/x86/ras/Kconfig
+++ b/arch/x86/ras/Kconfig
@@ -7,3 +7,17 @@ config MCE_AMD_INJ
 	  aspects of the MCE handling code.
 
 	  WARNING: Do not even assume this interface is staying stable!
+
+config RAS_CEC
+	bool "Correctable Errors Collector"
+	depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
+	---help---
+	  This is a small cache which collects correctable memory errors per 4K
+	  page PFN and counts their repeated occurrence. Once the counter for a
+	  PFN overflows, we try to soft-offline that page as we take it to mean
+	  that it has reached a relatively high error count and would probably
+	  be best if we don't use it anymore.
+
+	  Bear in mind that this is absolutely useless if your platform doesn't
+	  have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
+
diff --git a/drivers/ras/Makefile b/drivers/ras/Makefile
index d7f73341ced3..7b26dd3aa5d0 100644
--- a/drivers/ras/Makefile
+++ b/drivers/ras/Makefile
@@ -1 +1,2 @@
-obj-$(CONFIG_RAS) += ras.o debugfs.o
+obj-$(CONFIG_RAS)	+= ras.o debugfs.o
+obj-$(CONFIG_RAS_CEC)	+= cec.o
diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c
new file mode 100644
index 000000000000..6aab46d91d33
--- /dev/null
+++ b/drivers/ras/cec.c
@@ -0,0 +1,532 @@
+#include <linux/mm.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
+
+#include <asm/mce.h>
+
+#include "debugfs.h"
+
+/*
+ * RAS Correctable Errors Collector
+ *
+ * This is a simple gadget which collects correctable errors and counts their
+ * occurrence per physical page address.
+ *
+ * We've opted for possibly the simplest data structure to collect those - an
+ * array of the size of a memory page. It stores 512 u64's with the following
+ * structure:
+ *
+ * [63 ... PFN ... 12 | 11 ... generation ... 10 | 9 ... count ... 0]
+ *
+ * The generation in the two highest order bits is two bits which are set to 11b
+ * on every insertion. During the course of each entry's existence, the
+ * generation field gets decremented during spring cleaning to 10b, then 01b and
+ * then 00b.
+ *
+ * This way we're employing the natural numeric ordering to make sure that newly
+ * inserted/touched elements have higher 12-bit counts (which we've manufactured)
+ * and thus iterating over the array initially won't kick out those elements
+ * which were inserted last.
+ *
+ * Spring cleaning is what we do when we reach a certain number CLEAN_ELEMS of
+ * elements entered into the array, during which, we're decaying all elements.
+ * If, after decay, an element gets inserted again, its generation is set to 11b
+ * to make sure it has higher numerical count than other, older elements and
+ * thus emulate an an LRU-like behavior when deleting elements to free up space
+ * in the page.
+ *
+ * When an element reaches it's max count of count_threshold, we try to poison
+ * it by assuming that errors triggered count_threshold times in a single page
+ * are excessive and that page shouldn't be used anymore. count_threshold is
+ * initialized to COUNT_MASK which is the maximum.
+ *
+ * That error event entry causes cec_add_elem() to return !0 value and thus
+ * signal to its callers to log the error.
+ *
+ * To the question why we've chosen a page and moving elements around with
+ * memmove(), it is because it is a very simple structure to handle and max data
+ * movement is 4K which on highly optimized modern CPUs is almost unnoticeable.
+ * We wanted to avoid the pointer traversal of more complex structures like a
+ * linked list or some sort of a balancing search tree.
+ *
+ * Deleting an element takes O(n) but since it is only a single page, it should
+ * be fast enough and it shouldn't happen all too often depending on error
+ * patterns.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) "RAS: " fmt
+
+/*
+ * We use DECAY_BITS bits of PAGE_SHIFT bits for counting decay, i.e., how long
+ * elements have stayed in the array without having been accessed again.
+ */
+#define DECAY_BITS		2
+#define DECAY_MASK		((1ULL << DECAY_BITS) - 1)
+#define MAX_ELEMS		(PAGE_SIZE / sizeof(u64))
+
+/*
+ * Threshold amount of inserted elements after which we start spring
+ * cleaning.
+ */
+#define CLEAN_ELEMS		(MAX_ELEMS >> DECAY_BITS)
+
+/* Bits which count the number of errors happened in this 4K page. */
+#define COUNT_BITS		(PAGE_SHIFT - DECAY_BITS)
+#define COUNT_MASK		((1ULL << COUNT_BITS) - 1)
+#define FULL_COUNT_MASK		(PAGE_SIZE - 1)
+
+/*
+ * u64: [ 63 ... 12 | DECAY_BITS | COUNT_BITS ]
+ */
+
+#define PFN(e)			((e) >> PAGE_SHIFT)
+#define DECAY(e)		(((e) >> COUNT_BITS) & DECAY_MASK)
+#define COUNT(e)		((unsigned int)(e) & COUNT_MASK)
+#define FULL_COUNT(e)		((e) & (PAGE_SIZE - 1))
+
+static struct ce_array {
+	u64 *array;			/* container page */
+	unsigned int n;			/* number of elements in the array */
+
+	unsigned int decay_count;	/*
+					 * number of element insertions/increments
+					 * since the last spring cleaning.
+					 */
+
+	u64 pfns_poisoned;		/*
+					 * number of PFNs which got poisoned.
+					 */
+
+	u64 ces_entered;		/*
+					 * The number of correctable errors
+					 * entered into the collector.
+					 */
+
+	u64 decays_done;		/*
+					 * Times we did spring cleaning.
+					 */
+
+	union {
+		struct {
+			__u32	disabled : 1,	/* cmdline disabled */
+			__resv   : 31;
+		};
+		__u32 flags;
+	};
+} ce_arr;
+
+static DEFINE_MUTEX(ce_mutex);
+static u64 dfs_pfn;
+
+/* Amount of errors after which we offline */
+static unsigned int count_threshold = COUNT_MASK;
+
+/*
+ * The timer "decays" element count each timer_interval which is 24hrs by
+ * default.
+ */
+
+#define CEC_TIMER_DEFAULT_INTERVAL	24 * 60 * 60	/* 24 hrs */
+#define CEC_TIMER_MIN_INTERVAL		 1 * 60 * 60	/* 1h */
+#define CEC_TIMER_MAX_INTERVAL	   30 *	24 * 60 * 60	/* one month */
+static struct timer_list cec_timer;
+static u64 timer_interval = CEC_TIMER_DEFAULT_INTERVAL;
+
+/*
+ * Decrement decay value. We're using DECAY_BITS bits to denote decay of an
+ * element in the array. On insertion and any access, it gets reset to max.
+ */
+static void do_spring_cleaning(struct ce_array *ca)
+{
+	int i;
+
+	for (i = 0; i < ca->n; i++) {
+		u8 decay = DECAY(ca->array[i]);
+
+		if (!decay)
+			continue;
+
+		decay--;
+
+		ca->array[i] &= ~(DECAY_MASK << COUNT_BITS);
+		ca->array[i] |= (decay << COUNT_BITS);
+	}
+	ca->decay_count = 0;
+	ca->decays_done++;
+}
+
+/*
+ * @interval in seconds
+ */
+static void cec_mod_timer(struct timer_list *t, unsigned long interval)
+{
+	unsigned long iv;
+
+	iv = interval * HZ + jiffies;
+
+	mod_timer(t, round_jiffies(iv));
+}
+
+static void cec_timer_fn(unsigned long data)
+{
+	struct ce_array *ca = (struct ce_array *)data;
+
+	do_spring_cleaning(ca);
+
+	cec_mod_timer(&cec_timer, timer_interval);
+}
+
+/*
+ * @to: index of the smallest element which is >= then @pfn.
+ *
+ * Return the index of the pfn if found, otherwise negative value.
+ */
+static int __find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
+{
+	u64 this_pfn;
+	int min = 0, max = ca->n;
+
+	while (min < max) {
+		int tmp = (max + min) >> 1;
+
+		this_pfn = PFN(ca->array[tmp]);
+
+		if (this_pfn < pfn)
+			min = tmp + 1;
+		else if (this_pfn > pfn)
+			max = tmp;
+		else {
+			min = tmp;
+			break;
+		}
+	}
+
+	if (to)
+		*to = min;
+
+	this_pfn = PFN(ca->array[min]);
+
+	if (this_pfn == pfn)
+		return min;
+
+	return -ENOKEY;
+}
+
+static int find_elem(struct ce_array *ca, u64 pfn, unsigned int *to)
+{
+	WARN_ON(!to);
+
+	if (!ca->n) {
+		*to = 0;
+		return -ENOKEY;
+	}
+	return __find_elem(ca, pfn, to);
+}
+
+static void del_elem(struct ce_array *ca, int idx)
+{
+	/* Save us a function call when deleting the last element. */
+	if (ca->n - (idx + 1))
+		memmove((void *)&ca->array[idx],
+			(void *)&ca->array[idx + 1],
+			(ca->n - (idx + 1)) * sizeof(u64));
+
+	ca->n--;
+}
+
+static u64 del_lru_elem_unlocked(struct ce_array *ca)
+{
+	unsigned int min = FULL_COUNT_MASK;
+	int i, min_idx = 0;
+
+	for (i = 0; i < ca->n; i++) {
+		unsigned int this = FULL_COUNT(ca->array[i]);
+
+		if (min > this) {
+			min = this;
+			min_idx = i;
+		}
+	}
+
+	del_elem(ca, min_idx);
+
+	return PFN(ca->array[min_idx]);
+}
+
+/*
+ * We return the 0th pfn in the error case under the assumption that it cannot
+ * be poisoned and excessive CEs in there are a serious deal anyway.
+ */
+static u64 __maybe_unused del_lru_elem(void)
+{
+	struct ce_array *ca = &ce_arr;
+	u64 pfn;
+
+	if (!ca->n)
+		return 0;
+
+	mutex_lock(&ce_mutex);
+	pfn = del_lru_elem_unlocked(ca);
+	mutex_unlock(&ce_mutex);
+
+	return pfn;
+}
+
+
+int cec_add_elem(u64 pfn)
+{
+	struct ce_array *ca = &ce_arr;
+	unsigned int to;
+	int count, ret = 0;
+
+	/*
+	 * We can be called very early on the identify_cpu() path where we are
+	 * not initialized yet. We ignore the error for simplicity.
+	 */
+	if (!ce_arr.array || ce_arr.disabled)
+		return -ENODEV;
+
+	ca->ces_entered++;
+
+	mutex_lock(&ce_mutex);
+
+	if (ca->n == MAX_ELEMS)
+		WARN_ON(!del_lru_elem_unlocked(ca));
+
+	ret = find_elem(ca, pfn, &to);
+	if (ret < 0) {
+		/*
+		 * Shift range [to-end] to make room for one more element.
+		 */
+		memmove((void *)&ca->array[to + 1],
+			(void *)&ca->array[to],
+			(ca->n - to) * sizeof(u64));
+
+		ca->array[to] = (pfn << PAGE_SHIFT) |
+				(DECAY_MASK << COUNT_BITS) | 1;
+
+		ca->n++;
+
+		ret = 0;
+
+		goto decay;
+	}
+
+	count = COUNT(ca->array[to]);
+
+	if (count < count_threshold) {
+		ca->array[to] |= (DECAY_MASK << COUNT_BITS);
+		ca->array[to]++;
+
+		ret = 0;
+	} else {
+		u64 pfn = ca->array[to] >> PAGE_SHIFT;
+
+		if (!pfn_valid(pfn)) {
+			pr_warn("CEC: Invalid pfn: 0x%llx\n", pfn);
+		} else {
+			/* We have reached max count for this page, soft-offline it. */
+			pr_err("Soft-offlining pfn: 0x%llx\n", pfn);
+			memory_failure_queue(pfn, 0, MF_SOFT_OFFLINE);
+			ca->pfns_poisoned++;
+		}
+
+		del_elem(ca, to);
+
+		/*
+		 * Return a >0 value to denote that we've reached the offlining
+		 * threshold.
+		 */
+		ret = 1;
+
+		goto unlock;
+	}
+
+decay:
+	ca->decay_count++;
+
+	if (ca->decay_count >= CLEAN_ELEMS)
+		do_spring_cleaning(ca);
+
+unlock:
+	mutex_unlock(&ce_mutex);
+
+	return ret;
+}
+
+static int u64_get(void *data, u64 *val)
+{
+	*val = *(u64 *)data;
+
+	return 0;
+}
+
+static int pfn_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+
+	return cec_add_elem(val);
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(pfn_ops, u64_get, pfn_set, "0x%llx\n");
+
+static int decay_interval_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+
+	if (val < CEC_TIMER_MIN_INTERVAL)
+		return -EINVAL;
+
+	if (val > CEC_TIMER_MAX_INTERVAL)
+		return -EINVAL;
+
+	timer_interval = val;
+
+	cec_mod_timer(&cec_timer, timer_interval);
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(decay_interval_ops, u64_get, decay_interval_set, "%lld\n");
+
+static int count_threshold_set(void *data, u64 val)
+{
+	*(u64 *)data = val;
+
+	if (val > COUNT_MASK)
+		val = COUNT_MASK;
+
+	count_threshold = val;
+
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(count_threshold_ops, u64_get, count_threshold_set, "%lld\n");
+
+static int array_dump(struct seq_file *m, void *v)
+{
+	struct ce_array *ca = &ce_arr;
+	u64 prev = 0;
+	int i;
+
+	mutex_lock(&ce_mutex);
+
+	seq_printf(m, "{ n: %d\n", ca->n);
+	for (i = 0; i < ca->n; i++) {
+		u64 this = PFN(ca->array[i]);
+
+		seq_printf(m, " %03d: [%016llx|%03llx]\n", i, this, FULL_COUNT(ca->array[i]));
+
+		WARN_ON(prev > this);
+
+		prev = this;
+	}
+
+	seq_printf(m, "}\n");
+
+	seq_printf(m, "Stats:\nCEs: %llu\nofflined pages: %llu\n",
+		   ca->ces_entered, ca->pfns_poisoned);
+
+	seq_printf(m, "Flags: 0x%x\n", ca->flags);
+
+	seq_printf(m, "Timer interval: %lld seconds\n", timer_interval);
+	seq_printf(m, "Decays: %lld\n", ca->decays_done);
+
+	seq_printf(m, "Action threshold: %d\n", count_threshold);
+
+	mutex_unlock(&ce_mutex);
+
+	return 0;
+}
+
+static int array_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, array_dump, NULL);
+}
+
+static const struct file_operations array_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = array_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,
+};
+
+static int __init create_debugfs_nodes(void)
+{
+	struct dentry *d, *pfn, *decay, *count, *array;
+
+	d = debugfs_create_dir("cec", ras_debugfs_dir);
+	if (!d) {
+		pr_warn("Error creating cec debugfs node!\n");
+		return -1;
+	}
+
+	pfn = debugfs_create_file("pfn", S_IRUSR | S_IWUSR, d, &dfs_pfn, &pfn_ops);
+	if (!pfn) {
+		pr_warn("Error creating pfn debugfs node!\n");
+		goto err;
+	}
+
+	array = debugfs_create_file("array", S_IRUSR, d, NULL, &array_ops);
+	if (!array) {
+		pr_warn("Error creating array debugfs node!\n");
+		goto err;
+	}
+
+	decay = debugfs_create_file("decay_interval", S_IRUSR | S_IWUSR, d,
+				    &timer_interval, &decay_interval_ops);
+	if (!decay) {
+		pr_warn("Error creating decay_interval debugfs node!\n");
+		goto err;
+	}
+
+	count = debugfs_create_file("count_threshold", S_IRUSR | S_IWUSR, d,
+				    &count_threshold, &count_threshold_ops);
+	if (!decay) {
+		pr_warn("Error creating count_threshold debugfs node!\n");
+		goto err;
+	}
+
+
+	return 0;
+
+err:
+	debugfs_remove_recursive(d);
+
+	return 1;
+}
+
+void __init cec_init(void)
+{
+	if (ce_arr.disabled)
+		return;
+
+	ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!ce_arr.array) {
+		pr_err("Error allocating CE array page!\n");
+		return;
+	}
+
+	if (create_debugfs_nodes())
+		return;
+
+	setup_timer(&cec_timer, cec_timer_fn, (unsigned long)&ce_arr);
+	cec_mod_timer(&cec_timer, CEC_TIMER_DEFAULT_INTERVAL);
+
+	pr_info("Correctable Errors collector initialized.\n");
+}
+
+int __init parse_cec_param(char *str)
+{
+	if (!str)
+		return 0;
+
+	if (*str == '=')
+		str++;
+
+	if (!strncmp(str, "cec_disable", 7))
+		ce_arr.disabled = 1;
+	else
+		return 0;
+
+	return 1;
+}
diff --git a/drivers/ras/debugfs.c b/drivers/ras/debugfs.c
index 0322acf67ea5..501603057dff 100644
--- a/drivers/ras/debugfs.c
+++ b/drivers/ras/debugfs.c
@@ -1,6 +1,6 @@
 #include <linux/debugfs.h>
 
-static struct dentry *ras_debugfs_dir;
+struct dentry *ras_debugfs_dir;
 
 static atomic_t trace_count = ATOMIC_INIT(0);
 
diff --git a/drivers/ras/debugfs.h b/drivers/ras/debugfs.h
new file mode 100644
index 000000000000..db72e4513191
--- /dev/null
+++ b/drivers/ras/debugfs.h
@@ -0,0 +1,8 @@
+#ifndef __RAS_DEBUGFS_H__
+#define __RAS_DEBUGFS_H__
+
+#include <linux/debugfs.h>
+
+extern struct dentry *ras_debugfs_dir;
+
+#endif /* __RAS_DEBUGFS_H__ */
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index b67dd362b7b6..94f8038864b4 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -27,3 +27,14 @@ subsys_initcall(ras_init);
 EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
 #endif
 EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
+
+
+int __init parse_ras_param(char *str)
+{
+#ifdef CONFIG_RAS_CEC
+	parse_cec_param(str);
+#endif
+
+	return 1;
+}
+__setup("ras", parse_ras_param);
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 2aceeafd6fe5..ffb147185e8d 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -1,14 +1,25 @@
 #ifndef __RAS_H__
 #define __RAS_H__
 
+#include <asm/errno.h>
+
 #ifdef CONFIG_DEBUG_FS
 int ras_userspace_consumers(void);
 void ras_debugfs_init(void);
 int ras_add_daemon_trace(void);
 #else
 static inline int ras_userspace_consumers(void) { return 0; }
-static inline void ras_debugfs_init(void) { return; }
+static inline void ras_debugfs_init(void) { }
 static inline int ras_add_daemon_trace(void) { return 0; }
 #endif
 
+#ifdef CONFIG_RAS_CEC
+void __init cec_init(void);
+int __init parse_cec_param(char *str);
+int cec_add_elem(u64 pfn);
+#else
+static inline void __init cec_init(void)	{ }
+static inline int cec_add_elem(u64 pfn)		{ return -ENODEV; }
 #endif
+
+#endif /* __RAS_H__ */
-- 
cgit v1.3-6-gb490


From 4f9ab0c1570800002e77515888600ca2e3dce4a9 Mon Sep 17 00:00:00 2001
From: Sebastian Reichel <sre@kernel.org>
Date: Fri, 24 Mar 2017 09:47:31 +0100
Subject: mfd: cpcap: Add missing include dependencies

This fixes compilation for files, that try to include the
cpcap header in alphabetically sorted #include lists.

Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
Signed-off-by: Lee Jones <lee.jones@linaro.org>
---
 include/linux/mfd/motorola-cpcap.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/motorola-cpcap.h b/include/linux/mfd/motorola-cpcap.h
index b4031c2b2214..53758a7d7c32 100644
--- a/include/linux/mfd/motorola-cpcap.h
+++ b/include/linux/mfd/motorola-cpcap.h
@@ -14,6 +14,9 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/device.h>
+#include <linux/regmap.h>
+
 #define CPCAP_VENDOR_ST		0
 #define CPCAP_VENDOR_TI		1
 
-- 
cgit v1.3-6-gb490


From 9e234eeafbe17e85908584392f249f0b329b8e1b Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 27 Mar 2017 10:51:41 -0700
Subject: blk-throttle: add a simple idle detection

A cgroup gets assigned a low limit, but the cgroup could never dispatch
enough IO to cross the low limit. In such case, the queue state machine
will remain in LIMIT_LOW state and all other cgroups will be throttled
according to low limit. This is unfair for other cgroups. We should
treat the cgroup idle and upgrade the state machine to lower state.

We also have a downgrade logic. If the state machine upgrades because of
cgroup idle (real idle), the state machine will downgrade soon as the
cgroup is below its low limit. This isn't what we want. A more
complicated case is cgroup isn't idle when queue is in LIMIT_LOW. But
when queue gets upgraded to lower state, other cgroups could dispatch
more IO and this cgroup can't dispatch enough IO, so the cgroup is below
its low limit and looks like idle (fake idle). In this case, the queue
should downgrade soon. The key to determine if we should do downgrade is
to detect if cgroup is truely idle.

Unfortunately it's very hard to determine if a cgroup is real idle. This
patch uses the 'think time check' idea from CFQ for the purpose. Please
note, the idea doesn't work for all workloads. For example, a workload
with io depth 8 has disk utilization 100%, hence think time is 0, eg,
not idle. But the workload can run higher bandwidth with io depth 16.
Compared to io depth 16, the io depth 8 workload is idle. We use the
idea to roughly determine if a cgroup is idle.

We treat a cgroup idle if its think time is above a threshold (by
default 1ms for SSD and 100ms for HD). The idea is think time above the
threshold will start to harm performance. HD is much slower so a longer
think time is ok.

The patch (and the latter patches) uses 'unsigned long' to track time.
We convert 'ns' to 'us' with 'ns >> 10'. This is fast but loses
precision, should not a big deal.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c               |  2 ++
 block/blk-throttle.c      | 82 ++++++++++++++++++++++++++++++++++++++++++++++-
 block/blk.h               |  3 ++
 include/linux/blk_types.h |  3 ++
 4 files changed, 89 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index 6194a8cf2aab..f1857c0f0826 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -30,6 +30,7 @@
 #include <linux/cgroup.h>
 
 #include <trace/events/block.h>
+#include "blk.h"
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -1845,6 +1846,7 @@ again:
 		goto again;
 	}
 
+	blk_throtl_bio_endio(bio);
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio);
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 62984fc92015..6300f3ed70d2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -22,6 +22,9 @@ static int throtl_quantum = 32;
 #define DFL_THROTL_SLICE_HD (HZ / 10)
 #define DFL_THROTL_SLICE_SSD (HZ / 50)
 #define MAX_THROTL_SLICE (HZ)
+#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
+#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
+#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
 
 static struct blkcg_policy blkcg_policy_throtl;
 
@@ -154,6 +157,11 @@ struct throtl_grp {
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
+
+	unsigned long last_finish_time; /* ns / 1024 */
+	unsigned long checked_last_finish_time; /* ns / 1024 */
+	unsigned long avg_idletime; /* ns / 1024 */
+	unsigned long idletime_threshold; /* us */
 };
 
 struct throtl_data
@@ -468,6 +476,11 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
+
+	if (blk_queue_nonrot(td->queue))
+		tg->idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+	else
+		tg->idletime_threshold = DFL_IDLE_THRESHOLD_HD;
 }
 
 /*
@@ -1644,6 +1657,21 @@ static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
 	return ret;
 }
 
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+	/*
+	 * cgroup is idle if:
+	 * - single idle is too long, longer than a fixed value (in case user
+	 *   configure a too big threshold) or 4 times of slice
+	 * - average think time is more than threshold
+	 */
+	unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+
+	time = min_t(unsigned long, MAX_IDLE_TIME, time);
+	return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+	       tg->avg_idletime > tg->idletime_threshold;
+}
+
 static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
@@ -1843,6 +1871,19 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
 	tg->last_io_disp[WRITE] = 0;
 }
 
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+	unsigned long now = ktime_get_ns() >> 10;
+	unsigned long last_finish_time = tg->last_finish_time;
+
+	if (now <= last_finish_time || last_finish_time == 0 ||
+	    last_finish_time == tg->checked_last_finish_time)
+		return;
+
+	tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
+	tg->checked_last_finish_time = last_finish_time;
+}
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1851,6 +1892,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
 	bool throttled = false;
+	int ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
@@ -1863,6 +1905,13 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
+	ret = bio_associate_current(bio);
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+	if (ret == 0 || ret == -EBUSY)
+		bio->bi_cg_private = tg;
+#endif
+	blk_throtl_update_idletime(tg);
+
 	sq = &tg->service_queue;
 
 again:
@@ -1923,7 +1972,6 @@ again:
 
 	tg->last_low_overflow_time[rw] = jiffies;
 
-	bio_associate_current(bio);
 	tg->td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
@@ -1952,6 +2000,20 @@ out:
 	return throttled;
 }
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+void blk_throtl_bio_endio(struct bio *bio)
+{
+	struct throtl_grp *tg;
+
+	tg = bio->bi_cg_private;
+	if (!tg)
+		return;
+	bio->bi_cg_private = NULL;
+
+	tg->last_finish_time = ktime_get_ns() >> 10;
+}
+#endif
+
 /*
  * Dispatch all bios from all children tg's queued on @parent_sq.  On
  * return, @parent_sq is guaranteed to not have any active children tg's
@@ -2035,6 +2097,7 @@ int blk_throtl_init(struct request_queue *q)
 	td->limit_index = LIMIT_MAX;
 	td->low_upgrade_time = jiffies;
 	td->low_downgrade_time = jiffies;
+
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
 	if (ret)
@@ -2053,6 +2116,8 @@ void blk_throtl_exit(struct request_queue *q)
 void blk_throtl_register_queue(struct request_queue *q)
 {
 	struct throtl_data *td;
+	struct cgroup_subsys_state *pos_css;
+	struct blkcg_gq *blkg;
 
 	td = q->td;
 	BUG_ON(!td);
@@ -2065,6 +2130,21 @@ void blk_throtl_register_queue(struct request_queue *q)
 	/* if no low limit, use previous default */
 	td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
+
+	/*
+	 * some tg are created before queue is fully initialized, eg, nonrot
+	 * isn't initialized yet
+	 */
+	rcu_read_lock();
+	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+		struct throtl_grp *tg = blkg_to_tg(blkg);
+
+		if (blk_queue_nonrot(q))
+			tg->idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+		else
+			tg->idletime_threshold = DFL_IDLE_THRESHOLD_HD;
+	}
+	rcu_read_unlock();
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk.h b/block/blk.h
index 13070c325858..3ac833ec2adb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -330,6 +330,9 @@ static inline void blk_throtl_register_queue(struct request_queue *q) { }
 extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
 extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
 	const char *page, size_t count);
+extern void blk_throtl_bio_endio(struct bio *bio);
+#else
+static inline void blk_throtl_bio_endio(struct bio *bio) { }
 #endif
 
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 270119a501fb..07a9e9607909 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -58,6 +58,9 @@ struct bio {
 	 */
 	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+	void			*bi_cg_private;
+#endif
 #endif
 	union {
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
-- 
cgit v1.3-6-gb490


From 88eeca495ba7de749ff253376ec6be19bb05368d Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 27 Mar 2017 15:19:41 -0700
Subject: block: track request size in blk_issue_stat

Currently there is no way to know the request size when the request is
finished. Next patch will need this info. We could add extra field to
record the size, but blk_issue_stat has enough space to record it, so
this patch just overloads blk_issue_stat. With this, we will have 49bits
to track time, which still is very long time.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c          |  2 +-
 block/blk-mq.c            |  2 +-
 block/blk-stat.h          | 41 ++++++++++++++++++++++++++++++-----------
 block/blk-wbt.h           | 10 +++++-----
 include/linux/blk_types.h |  2 +-
 5 files changed, 38 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index ad388d5e309a..4234332aa23c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2483,7 +2483,7 @@ void blk_start_request(struct request *req)
 	blk_dequeue_request(req);
 
 	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
-		blk_stat_set_issue_time(&req->issue_stat);
+		blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
 		req->rq_flags |= RQF_STATS;
 		wbt_issue(req->q->rq_wb, &req->issue_stat);
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 45b9bebf8436..caef6ee08b04 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -488,7 +488,7 @@ void blk_mq_start_request(struct request *rq)
 	trace_block_rq_issue(q, rq);
 
 	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		blk_stat_set_issue_time(&rq->issue_stat);
+		blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
 		rq->rq_flags |= RQF_STATS;
 		wbt_issue(q->rq_wb, &rq->issue_stat);
 	}
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 6ad5b8c59a79..ee47f816d5bd 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -8,12 +8,19 @@
 #include <linux/timer.h>
 
 /*
- * Upper 3 bits can be used elsewhere
+ * from upper:
+ * 3 bits: reserved for other usage
+ * 12 bits: size
+ * 49 bits: time
  */
 #define BLK_STAT_RES_BITS	3
-#define BLK_STAT_SHIFT		(64 - BLK_STAT_RES_BITS)
-#define BLK_STAT_TIME_MASK	((1ULL << BLK_STAT_SHIFT) - 1)
-#define BLK_STAT_MASK		~BLK_STAT_TIME_MASK
+#define BLK_STAT_SIZE_BITS	12
+#define BLK_STAT_RES_SHIFT	(64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_SIZE_SHIFT	(BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
+#define BLK_STAT_TIME_MASK	((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
+#define BLK_STAT_SIZE_MASK	\
+	(((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
+#define BLK_STAT_RES_MASK	(~((1ULL << BLK_STAT_RES_SHIFT) - 1))
 
 /**
  * struct blk_stat_callback - Block statistics callback.
@@ -73,12 +80,6 @@ void blk_free_queue_stats(struct blk_queue_stats *);
 
 void blk_stat_add(struct request *);
 
-static inline void blk_stat_set_issue_time(struct blk_issue_stat *stat)
-{
-	stat->time = ((stat->time & BLK_STAT_MASK) |
-		      (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK));
-}
-
 static inline u64 __blk_stat_time(u64 time)
 {
 	return time & BLK_STAT_TIME_MASK;
@@ -86,7 +87,25 @@ static inline u64 __blk_stat_time(u64 time)
 
 static inline u64 blk_stat_time(struct blk_issue_stat *stat)
 {
-	return __blk_stat_time(stat->time);
+	return __blk_stat_time(stat->stat);
+}
+
+static inline sector_t blk_capped_size(sector_t size)
+{
+	return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1);
+}
+
+static inline sector_t blk_stat_size(struct blk_issue_stat *stat)
+{
+	return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT;
+}
+
+static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
+	sector_t size)
+{
+	stat->stat = (stat->stat & BLK_STAT_RES_MASK) |
+		(ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) |
+		(((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
 }
 
 /*
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 591ff2f4b2ee..ad6c78507c3a 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -32,27 +32,27 @@ enum {
 
 static inline void wbt_clear_state(struct blk_issue_stat *stat)
 {
-	stat->time &= BLK_STAT_TIME_MASK;
+	stat->stat &= ~BLK_STAT_RES_MASK;
 }
 
 static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
 {
-	return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
+	return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
 }
 
 static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
 {
-	stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
+	stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
 }
 
 static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
 {
-	return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
+	return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
 }
 
 static inline bool wbt_is_read(struct blk_issue_stat *stat)
 {
-	return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
+	return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
 }
 
 struct rq_wait {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 07a9e9607909..3ad567347671 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -287,7 +287,7 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
 }
 
 struct blk_issue_stat {
-	u64 time;
+	u64 stat;
 };
 
 struct blk_rq_stat {
-- 
cgit v1.3-6-gb490


From b9147dd1bae2b15d6931ecd42f8606c775fecbc9 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 27 Mar 2017 15:19:42 -0700
Subject: blk-throttle: add a mechanism to estimate IO latency

User configures latency target, but the latency threshold for each
request size isn't fixed. For a SSD, the IO latency highly depends on
request size. To calculate latency threshold, we sample some data, eg,
average latency for request size 4k, 8k, 16k, 32k .. 1M. The latency
threshold of each request size will be the sample latency (I'll call it
base latency) plus latency target. For example, the base latency for
request size 4k is 80us and user configures latency target 60us. The 4k
latency threshold will be 80 + 60 = 140us.

To sample data, we calculate the order base 2 of rounded up IO sectors.
If the IO size is bigger than 1M, it will be accounted as 1M. Since the
calculation does round up, the base latency will be slightly smaller
than actual value. Also if there isn't any IO dispatched for a specific
IO size, we will use the base latency of smaller IO size for this IO
size.

But we shouldn't sample data at any time. The base latency is supposed
to be latency where disk isn't congested, because we use latency
threshold to schedule IOs between cgroups. If disk is congested, the
latency is higher, using it for scheduling is meaningless. Hence we only
do the sampling when block throttling is in the LOW limit, with
assumption disk isn't congested in such state. If the assumption isn't
true, eg, low limit is too high, calculated latency threshold will be
higher.

Hard disk is completely different. Latency depends on spindle seek
instead of request size. Currently this feature is SSD only, we probably
can use a fixed threshold like 4ms for hard disk though.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-stat.c          |  15 ++++-
 block/blk-stat.h          |   3 +
 block/blk-throttle.c      | 166 ++++++++++++++++++++++++++++++++++++++++++++--
 block/blk.h               |   2 +
 include/linux/blk_types.h |   9 +--
 5 files changed, 185 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-stat.c b/block/blk-stat.c
index 188b535cf4d6..e77ec52f5bb5 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -9,12 +9,14 @@
 
 #include "blk-stat.h"
 #include "blk-mq.h"
+#include "blk.h"
 
 #define BLK_RQ_STAT_BATCH	64
 
 struct blk_queue_stats {
 	struct list_head callbacks;
 	spinlock_t lock;
+	bool enable_accounting;
 };
 
 unsigned int blk_stat_rq_ddir(const struct request *rq)
@@ -96,6 +98,8 @@ void blk_stat_add(struct request *rq)
 
 	value = now - blk_stat_time(&rq->issue_stat);
 
+	blk_throtl_stat_add(rq, value);
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
 		if (blk_stat_is_active(cb)) {
@@ -190,7 +194,7 @@ void blk_stat_remove_callback(struct request_queue *q,
 {
 	spin_lock(&q->stats->lock);
 	list_del_rcu(&cb->list);
-	if (list_empty(&q->stats->callbacks))
+	if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
 		clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
 	spin_unlock(&q->stats->lock);
 
@@ -215,6 +219,14 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
 }
 EXPORT_SYMBOL_GPL(blk_stat_free_callback);
 
+void blk_stat_enable_accounting(struct request_queue *q)
+{
+	spin_lock(&q->stats->lock);
+	q->stats->enable_accounting = true;
+	set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+	spin_unlock(&q->stats->lock);
+}
+
 struct blk_queue_stats *blk_alloc_queue_stats(void)
 {
 	struct blk_queue_stats *stats;
@@ -225,6 +237,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void)
 
 	INIT_LIST_HEAD(&stats->callbacks);
 	spin_lock_init(&stats->lock);
+	stats->enable_accounting = false;
 
 	return stats;
 }
diff --git a/block/blk-stat.h b/block/blk-stat.h
index ee47f816d5bd..53f08a63bf15 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -108,6 +108,9 @@ static inline void blk_stat_set_issue(struct blk_issue_stat *stat,
 		(((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT);
 }
 
+/* record time/size info in request but not add a callback */
+void blk_stat_enable_accounting(struct request_queue *q);
+
 /*
  * blk_stat_rq_ddir() - Bucket callback function for the request data direction.
  * @rq: Request.
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 6e1c29860eec..140da29f5800 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -28,6 +28,8 @@ static int throtl_quantum = 32;
 /* default latency target is 0, eg, guarantee IO latency by default */
 #define DFL_LATENCY_TARGET (0)
 
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
+
 static struct blkcg_policy blkcg_policy_throtl;
 
 /* A workqueue to queue throttle related work */
@@ -165,6 +167,19 @@ struct throtl_grp {
 	unsigned long idletime_threshold; /* us */
 };
 
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+	unsigned long total_latency; /* ns / 1024 */
+	int samples;
+};
+
+struct avg_latency_bucket {
+	unsigned long latency; /* ns / 1024 */
+	bool valid;
+};
+
 struct throtl_data
 {
 	/* service tree for active throtl groups */
@@ -188,6 +203,13 @@ struct throtl_data
 	unsigned long low_downgrade_time;
 
 	unsigned int scale;
+
+	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+	struct latency_bucket __percpu *latency_buckets;
+	unsigned long last_calculate_time;
+
+	bool track_bio_latency;
 };
 
 static void throtl_pending_timer_fn(unsigned long arg);
@@ -306,6 +328,9 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 	return ret;
 }
 
+#define request_bucket_index(sectors) \
+	clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
+
 /**
  * throtl_log - log debug message via blktrace
  * @sq: the service_queue being reported
@@ -1931,6 +1956,73 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
 	tg->checked_last_finish_time = last_finish_time;
 }
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+	int i, cpu;
+	unsigned long last_latency = 0;
+	unsigned long latency;
+
+	if (!blk_queue_nonrot(td->queue))
+		return;
+	if (time_before(jiffies, td->last_calculate_time + HZ))
+		return;
+	td->last_calculate_time = jiffies;
+
+	memset(avg_latency, 0, sizeof(avg_latency));
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+		for_each_possible_cpu(cpu) {
+			struct latency_bucket *bucket;
+
+			/* this isn't race free, but ok in practice */
+			bucket = per_cpu_ptr(td->latency_buckets, cpu);
+			tmp->total_latency += bucket[i].total_latency;
+			tmp->samples += bucket[i].samples;
+			bucket[i].total_latency = 0;
+			bucket[i].samples = 0;
+		}
+
+		if (tmp->samples >= 32) {
+			int samples = tmp->samples;
+
+			latency = tmp->total_latency;
+
+			tmp->total_latency = 0;
+			tmp->samples = 0;
+			latency /= samples;
+			if (latency == 0)
+				continue;
+			avg_latency[i].latency = latency;
+		}
+	}
+
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+		if (!avg_latency[i].latency) {
+			if (td->avg_buckets[i].latency < last_latency)
+				td->avg_buckets[i].latency = last_latency;
+			continue;
+		}
+
+		if (!td->avg_buckets[i].valid)
+			latency = avg_latency[i].latency;
+		else
+			latency = (td->avg_buckets[i].latency * 7 +
+				avg_latency[i].latency) >> 3;
+
+		td->avg_buckets[i].latency = max(latency, last_latency);
+		td->avg_buckets[i].valid = true;
+		last_latency = td->avg_buckets[i].latency;
+	}
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -1939,6 +2031,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	struct throtl_service_queue *sq;
 	bool rw = bio_data_dir(bio);
 	bool throttled = false;
+	struct throtl_data *td = tg->td;
 	int ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
@@ -1949,6 +2042,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	spin_lock_irq(q->queue_lock);
 
+	throtl_update_latency_buckets(td);
+
 	if (unlikely(blk_queue_bypass(q)))
 		goto out_unlock;
 
@@ -1956,6 +2051,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	if (ret == 0 || ret == -EBUSY)
 		bio->bi_cg_private = tg;
+	blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
 #endif
 	blk_throtl_update_idletime(tg);
 
@@ -1974,8 +2070,8 @@ again:
 		/* if above limits, break to queue */
 		if (!tg_may_dispatch(tg, bio, NULL)) {
 			tg->last_low_overflow_time[rw] = jiffies;
-			if (throtl_can_upgrade(tg->td, tg)) {
-				throtl_upgrade_state(tg->td);
+			if (throtl_can_upgrade(td, tg)) {
+				throtl_upgrade_state(td);
 				goto again;
 			}
 			break;
@@ -2019,7 +2115,7 @@ again:
 
 	tg->last_low_overflow_time[rw] = jiffies;
 
-	tg->td->nr_queued[rw]++;
+	td->nr_queued[rw]++;
 	throtl_add_bio_tg(bio, qn, tg);
 	throttled = true;
 
@@ -2044,20 +2140,67 @@ out:
 	 */
 	if (!throttled)
 		bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+	if (throttled || !td->track_bio_latency)
+		bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
 	return throttled;
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+	int op, unsigned long time)
+{
+	struct latency_bucket *latency;
+	int index;
+
+	if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+	    !blk_queue_nonrot(td->queue))
+		return;
+
+	index = request_bucket_index(size);
+
+	latency = get_cpu_ptr(td->latency_buckets);
+	latency[index].total_latency += time;
+	latency[index].samples++;
+	put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+	struct request_queue *q = rq->q;
+	struct throtl_data *td = q->td;
+
+	throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+		req_op(rq), time_ns >> 10);
+}
+
 void blk_throtl_bio_endio(struct bio *bio)
 {
 	struct throtl_grp *tg;
+	u64 finish_time_ns;
+	unsigned long finish_time;
+	unsigned long start_time;
+	unsigned long lat;
 
 	tg = bio->bi_cg_private;
 	if (!tg)
 		return;
 	bio->bi_cg_private = NULL;
 
-	tg->last_finish_time = ktime_get_ns() >> 10;
+	finish_time_ns = ktime_get_ns();
+	tg->last_finish_time = finish_time_ns >> 10;
+
+	start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+	finish_time = __blk_stat_time(finish_time_ns) >> 10;
+	/* this is only for bio based driver */
+	if (start_time && finish_time > start_time &&
+	    !(bio->bi_issue_stat.stat & SKIP_LATENCY)) {
+		lat = finish_time - start_time;
+		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+			bio_op(bio), lat);
+	}
 }
 #endif
 
@@ -2133,6 +2276,12 @@ int blk_throtl_init(struct request_queue *q)
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
 		return -ENOMEM;
+	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+		LATENCY_BUCKET_SIZE, __alignof__(u64));
+	if (!td->latency_buckets) {
+		kfree(td);
+		return -ENOMEM;
+	}
 
 	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
 	throtl_service_queue_init(&td->service_queue);
@@ -2147,8 +2296,10 @@ int blk_throtl_init(struct request_queue *q)
 
 	/* activate policy */
 	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
-	if (ret)
+	if (ret) {
+		free_percpu(td->latency_buckets);
 		kfree(td);
+	}
 	return ret;
 }
 
@@ -2157,6 +2308,7 @@ void blk_throtl_exit(struct request_queue *q)
 	BUG_ON(!q->td);
 	throtl_shutdown_wq(q);
 	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+	free_percpu(q->td->latency_buckets);
 	kfree(q->td);
 }
 
@@ -2181,6 +2333,10 @@ void blk_throtl_register_queue(struct request_queue *q)
 	td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
 
+	td->track_bio_latency = !q->mq_ops && !q->request_fn;
+	if (!td->track_bio_latency)
+		blk_stat_enable_accounting(q);
+
 	/*
 	 * some tg are created before queue is fully initialized, eg, nonrot
 	 * isn't initialized yet
diff --git a/block/blk.h b/block/blk.h
index 3ac833ec2adb..07d375183f31 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -331,8 +331,10 @@ extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
 extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
 	const char *page, size_t count);
 extern void blk_throtl_bio_endio(struct bio *bio);
+extern void blk_throtl_stat_add(struct request *rq, u64 time);
 #else
 static inline void blk_throtl_bio_endio(struct bio *bio) { }
+static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
 #endif
 
 #endif /* BLK_INTERNAL_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3ad567347671..67bcf8a5326e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -17,6 +17,10 @@ struct io_context;
 struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
 
+struct blk_issue_stat {
+	u64 stat;
+};
+
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
  * stacking drivers)
@@ -60,6 +64,7 @@ struct bio {
 	struct cgroup_subsys_state *bi_css;
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	void			*bi_cg_private;
+	struct blk_issue_stat	bi_issue_stat;
 #endif
 #endif
 	union {
@@ -286,10 +291,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
 	return (cookie & BLK_QC_T_INTERNAL) != 0;
 }
 
-struct blk_issue_stat {
-	u64 stat;
-};
-
 struct blk_rq_stat {
 	s64 mean;
 	u64 min;
-- 
cgit v1.3-6-gb490


From dbc049eee73004db996cc8f63754f8dd5f86d0f7 Mon Sep 17 00:00:00 2001
From: Anup Patel <anup.patel@broadcom.com>
Date: Wed, 15 Mar 2017 12:10:00 +0530
Subject: mailbox: Add driver for Broadcom FlexRM ring manager

Some of the Broadcom iProc SoCs have FlexRM ring manager
which provides a ring-based programming interface to various
offload engines (e.g. RAID, Crypto, etc).

This patch adds a common mailbox driver for Broadcom FlexRM
ring manager which can be shared by various offload engine
drivers (implemented as mailbox clients).

Reviewed-by: Ray Jui <ray.jui@broadcom.com>
Reviewed-by: Scott Branden <scott.branden@broadcom.com>
Reviewed-by: Pramod KUMAR <pramod.kumar@broadcom.com>
Signed-off-by: Anup Patel <anup.patel@broadcom.com>
Signed-off-by: Jassi Brar <jaswinder.singh@linaro.org>
---
 drivers/mailbox/Kconfig              |   11 +
 drivers/mailbox/Makefile             |    2 +
 drivers/mailbox/bcm-flexrm-mailbox.c | 1595 ++++++++++++++++++++++++++++++++++
 include/linux/mailbox/brcm-message.h |   14 +-
 4 files changed, 1618 insertions(+), 4 deletions(-)
 create mode 100644 drivers/mailbox/bcm-flexrm-mailbox.c

(limited to 'include/linux')

diff --git a/drivers/mailbox/Kconfig b/drivers/mailbox/Kconfig
index 91eafb56568e..0b6f25e26c7c 100644
--- a/drivers/mailbox/Kconfig
+++ b/drivers/mailbox/Kconfig
@@ -151,4 +151,15 @@ config BCM_PDC_MBOX
 	  Mailbox implementation for the Broadcom FlexSparx DMA ring manager,
 	  which provides access to various offload engines on Broadcom
 	  SoCs, including FA2/FA+ on Northstar Plus and PDC on Northstar 2.
+
+config BCM_FLEXRM_MBOX
+	tristate "Broadcom FlexRM Mailbox"
+	depends on ARM64 || COMPILE_TEST
+	depends on HAS_DMA
+	select GENERIC_MSI_IRQ_DOMAIN
+	default ARCH_BCM_IPROC
+	help
+	  Mailbox implementation of the Broadcom FlexRM ring manager,
+	  which provides access to various offload engines on Broadcom
+	  SoCs. Say Y here if you want to use the Broadcom FlexRM.
 endif
diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile
index 7dde4f609ae8..e2bcb03cd35b 100644
--- a/drivers/mailbox/Makefile
+++ b/drivers/mailbox/Makefile
@@ -30,4 +30,6 @@ obj-$(CONFIG_HI6220_MBOX)	+= hi6220-mailbox.o
 
 obj-$(CONFIG_BCM_PDC_MBOX)	+= bcm-pdc-mailbox.o
 
+obj-$(CONFIG_BCM_FLEXRM_MBOX)	+= bcm-flexrm-mailbox.o
+
 obj-$(CONFIG_TEGRA_HSP_MBOX)	+= tegra-hsp.o
diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c
new file mode 100644
index 000000000000..da67882caa7b
--- /dev/null
+++ b/drivers/mailbox/bcm-flexrm-mailbox.c
@@ -0,0 +1,1595 @@
+/* Broadcom FlexRM Mailbox Driver
+ *
+ * Copyright (C) 2017 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Each Broadcom FlexSparx4 offload engine is implemented as an
+ * extension to Broadcom FlexRM ring manager. The FlexRM ring
+ * manager provides a set of rings which can be used to submit
+ * work to a FlexSparx4 offload engine.
+ *
+ * This driver creates a mailbox controller using a set of FlexRM
+ * rings where each mailbox channel represents a separate FlexRM ring.
+ */
+
+#include <asm/barrier.h>
+#include <asm/byteorder.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmapool.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/mailbox_controller.h>
+#include <linux/mailbox_client.h>
+#include <linux/mailbox/brcm-message.h>
+#include <linux/module.h>
+#include <linux/msi.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+
+/* ====== FlexRM register defines ===== */
+
+/* FlexRM configuration */
+#define RING_REGS_SIZE					0x10000
+#define RING_DESC_SIZE					8
+#define RING_DESC_INDEX(offset)				\
+			((offset) / RING_DESC_SIZE)
+#define RING_DESC_OFFSET(index)				\
+			((index) * RING_DESC_SIZE)
+#define RING_MAX_REQ_COUNT				1024
+#define RING_BD_ALIGN_ORDER				12
+#define RING_BD_ALIGN_CHECK(addr)			\
+			(!((addr) & ((0x1 << RING_BD_ALIGN_ORDER) - 1)))
+#define RING_BD_TOGGLE_INVALID(offset)			\
+			(((offset) >> RING_BD_ALIGN_ORDER) & 0x1)
+#define RING_BD_TOGGLE_VALID(offset)			\
+			(!RING_BD_TOGGLE_INVALID(offset))
+#define RING_BD_DESC_PER_REQ				32
+#define RING_BD_DESC_COUNT				\
+			(RING_MAX_REQ_COUNT * RING_BD_DESC_PER_REQ)
+#define RING_BD_SIZE					\
+			(RING_BD_DESC_COUNT * RING_DESC_SIZE)
+#define RING_CMPL_ALIGN_ORDER				13
+#define RING_CMPL_DESC_COUNT				RING_MAX_REQ_COUNT
+#define RING_CMPL_SIZE					\
+			(RING_CMPL_DESC_COUNT * RING_DESC_SIZE)
+#define RING_VER_MAGIC					0x76303031
+
+/* Per-Ring register offsets */
+#define RING_VER					0x000
+#define RING_BD_START_ADDR				0x004
+#define RING_BD_READ_PTR				0x008
+#define RING_BD_WRITE_PTR				0x00c
+#define RING_BD_READ_PTR_DDR_LS				0x010
+#define RING_BD_READ_PTR_DDR_MS				0x014
+#define RING_CMPL_START_ADDR				0x018
+#define RING_CMPL_WRITE_PTR				0x01c
+#define RING_NUM_REQ_RECV_LS				0x020
+#define RING_NUM_REQ_RECV_MS				0x024
+#define RING_NUM_REQ_TRANS_LS				0x028
+#define RING_NUM_REQ_TRANS_MS				0x02c
+#define RING_NUM_REQ_OUTSTAND				0x030
+#define RING_CONTROL					0x034
+#define RING_FLUSH_DONE					0x038
+#define RING_MSI_ADDR_LS				0x03c
+#define RING_MSI_ADDR_MS				0x040
+#define RING_MSI_CONTROL				0x048
+#define RING_BD_READ_PTR_DDR_CONTROL			0x04c
+#define RING_MSI_DATA_VALUE				0x064
+
+/* Register RING_BD_START_ADDR fields */
+#define BD_LAST_UPDATE_HW_SHIFT				28
+#define BD_LAST_UPDATE_HW_MASK				0x1
+#define BD_START_ADDR_VALUE(pa)				\
+	((u32)((((dma_addr_t)(pa)) >> RING_BD_ALIGN_ORDER) & 0x0fffffff))
+#define BD_START_ADDR_DECODE(val)			\
+	((dma_addr_t)((val) & 0x0fffffff) << RING_BD_ALIGN_ORDER)
+
+/* Register RING_CMPL_START_ADDR fields */
+#define CMPL_START_ADDR_VALUE(pa)			\
+	((u32)((((u64)(pa)) >> RING_CMPL_ALIGN_ORDER) & 0x03ffffff))
+
+/* Register RING_CONTROL fields */
+#define CONTROL_MASK_DISABLE_CONTROL			12
+#define CONTROL_FLUSH_SHIFT				5
+#define CONTROL_ACTIVE_SHIFT				4
+#define CONTROL_RATE_ADAPT_MASK				0xf
+#define CONTROL_RATE_DYNAMIC				0x0
+#define CONTROL_RATE_FAST				0x8
+#define CONTROL_RATE_MEDIUM				0x9
+#define CONTROL_RATE_SLOW				0xa
+#define CONTROL_RATE_IDLE				0xb
+
+/* Register RING_FLUSH_DONE fields */
+#define FLUSH_DONE_MASK					0x1
+
+/* Register RING_MSI_CONTROL fields */
+#define MSI_TIMER_VAL_SHIFT				16
+#define MSI_TIMER_VAL_MASK				0xffff
+#define MSI_ENABLE_SHIFT				15
+#define MSI_ENABLE_MASK					0x1
+#define MSI_COUNT_SHIFT					0
+#define MSI_COUNT_MASK					0x3ff
+
+/* Register RING_BD_READ_PTR_DDR_CONTROL fields */
+#define BD_READ_PTR_DDR_TIMER_VAL_SHIFT			16
+#define BD_READ_PTR_DDR_TIMER_VAL_MASK			0xffff
+#define BD_READ_PTR_DDR_ENABLE_SHIFT			15
+#define BD_READ_PTR_DDR_ENABLE_MASK			0x1
+
+/* ====== FlexRM ring descriptor defines ===== */
+
+/* Completion descriptor format */
+#define CMPL_OPAQUE_SHIFT			0
+#define CMPL_OPAQUE_MASK			0xffff
+#define CMPL_ENGINE_STATUS_SHIFT		16
+#define CMPL_ENGINE_STATUS_MASK			0xffff
+#define CMPL_DME_STATUS_SHIFT			32
+#define CMPL_DME_STATUS_MASK			0xffff
+#define CMPL_RM_STATUS_SHIFT			48
+#define CMPL_RM_STATUS_MASK			0xffff
+
+/* Completion DME status code */
+#define DME_STATUS_MEM_COR_ERR			BIT(0)
+#define DME_STATUS_MEM_UCOR_ERR			BIT(1)
+#define DME_STATUS_FIFO_UNDERFLOW		BIT(2)
+#define DME_STATUS_FIFO_OVERFLOW		BIT(3)
+#define DME_STATUS_RRESP_ERR			BIT(4)
+#define DME_STATUS_BRESP_ERR			BIT(5)
+#define DME_STATUS_ERROR_MASK			(DME_STATUS_MEM_COR_ERR | \
+						 DME_STATUS_MEM_UCOR_ERR | \
+						 DME_STATUS_FIFO_UNDERFLOW | \
+						 DME_STATUS_FIFO_OVERFLOW | \
+						 DME_STATUS_RRESP_ERR | \
+						 DME_STATUS_BRESP_ERR)
+
+/* Completion RM status code */
+#define RM_STATUS_CODE_SHIFT			0
+#define RM_STATUS_CODE_MASK			0x3ff
+#define RM_STATUS_CODE_GOOD			0x0
+#define RM_STATUS_CODE_AE_TIMEOUT		0x3ff
+
+/* General descriptor format */
+#define DESC_TYPE_SHIFT				60
+#define DESC_TYPE_MASK				0xf
+#define DESC_PAYLOAD_SHIFT			0
+#define DESC_PAYLOAD_MASK			0x0fffffffffffffff
+
+/* Null descriptor format  */
+#define NULL_TYPE				0
+#define NULL_TOGGLE_SHIFT			58
+#define NULL_TOGGLE_MASK			0x1
+
+/* Header descriptor format */
+#define HEADER_TYPE				1
+#define HEADER_TOGGLE_SHIFT			58
+#define HEADER_TOGGLE_MASK			0x1
+#define HEADER_ENDPKT_SHIFT			57
+#define HEADER_ENDPKT_MASK			0x1
+#define HEADER_STARTPKT_SHIFT			56
+#define HEADER_STARTPKT_MASK			0x1
+#define HEADER_BDCOUNT_SHIFT			36
+#define HEADER_BDCOUNT_MASK			0x1f
+#define HEADER_BDCOUNT_MAX			HEADER_BDCOUNT_MASK
+#define HEADER_FLAGS_SHIFT			16
+#define HEADER_FLAGS_MASK			0xffff
+#define HEADER_OPAQUE_SHIFT			0
+#define HEADER_OPAQUE_MASK			0xffff
+
+/* Source (SRC) descriptor format */
+#define SRC_TYPE				2
+#define SRC_LENGTH_SHIFT			44
+#define SRC_LENGTH_MASK				0xffff
+#define SRC_ADDR_SHIFT				0
+#define SRC_ADDR_MASK				0x00000fffffffffff
+
+/* Destination (DST) descriptor format */
+#define DST_TYPE				3
+#define DST_LENGTH_SHIFT			44
+#define DST_LENGTH_MASK				0xffff
+#define DST_ADDR_SHIFT				0
+#define DST_ADDR_MASK				0x00000fffffffffff
+
+/* Immediate (IMM) descriptor format */
+#define IMM_TYPE				4
+#define IMM_DATA_SHIFT				0
+#define IMM_DATA_MASK				0x0fffffffffffffff
+
+/* Next pointer (NPTR) descriptor format */
+#define NPTR_TYPE				5
+#define NPTR_TOGGLE_SHIFT			58
+#define NPTR_TOGGLE_MASK			0x1
+#define NPTR_ADDR_SHIFT				0
+#define NPTR_ADDR_MASK				0x00000fffffffffff
+
+/* Mega source (MSRC) descriptor format */
+#define MSRC_TYPE				6
+#define MSRC_LENGTH_SHIFT			44
+#define MSRC_LENGTH_MASK			0xffff
+#define MSRC_ADDR_SHIFT				0
+#define MSRC_ADDR_MASK				0x00000fffffffffff
+
+/* Mega destination (MDST) descriptor format */
+#define MDST_TYPE				7
+#define MDST_LENGTH_SHIFT			44
+#define MDST_LENGTH_MASK			0xffff
+#define MDST_ADDR_SHIFT				0
+#define MDST_ADDR_MASK				0x00000fffffffffff
+
+/* Source with tlast (SRCT) descriptor format */
+#define SRCT_TYPE				8
+#define SRCT_LENGTH_SHIFT			44
+#define SRCT_LENGTH_MASK			0xffff
+#define SRCT_ADDR_SHIFT				0
+#define SRCT_ADDR_MASK				0x00000fffffffffff
+
+/* Destination with tlast (DSTT) descriptor format */
+#define DSTT_TYPE				9
+#define DSTT_LENGTH_SHIFT			44
+#define DSTT_LENGTH_MASK			0xffff
+#define DSTT_ADDR_SHIFT				0
+#define DSTT_ADDR_MASK				0x00000fffffffffff
+
+/* Immediate with tlast (IMMT) descriptor format */
+#define IMMT_TYPE				10
+#define IMMT_DATA_SHIFT				0
+#define IMMT_DATA_MASK				0x0fffffffffffffff
+
+/* Descriptor helper macros */
+#define DESC_DEC(_d, _s, _m)			(((_d) >> (_s)) & (_m))
+#define DESC_ENC(_d, _v, _s, _m)		\
+			do { \
+				(_d) &= ~((u64)(_m) << (_s)); \
+				(_d) |= (((u64)(_v) & (_m)) << (_s)); \
+			} while (0)
+
+/* ====== FlexRM data structures ===== */
+
+struct flexrm_ring {
+	/* Unprotected members */
+	int num;
+	struct flexrm_mbox *mbox;
+	void __iomem *regs;
+	bool irq_requested;
+	unsigned int irq;
+	unsigned int msi_timer_val;
+	unsigned int msi_count_threshold;
+	struct ida requests_ida;
+	struct brcm_message *requests[RING_MAX_REQ_COUNT];
+	void *bd_base;
+	dma_addr_t bd_dma_base;
+	u32 bd_write_offset;
+	void *cmpl_base;
+	dma_addr_t cmpl_dma_base;
+	/* Protected members */
+	spinlock_t lock;
+	struct brcm_message *last_pending_msg;
+	u32 cmpl_read_offset;
+};
+
+struct flexrm_mbox {
+	struct device *dev;
+	void __iomem *regs;
+	u32 num_rings;
+	struct flexrm_ring *rings;
+	struct dma_pool *bd_pool;
+	struct dma_pool *cmpl_pool;
+	struct mbox_controller controller;
+};
+
+/* ====== FlexRM ring descriptor helper routines ===== */
+
+static u64 flexrm_read_desc(void *desc_ptr)
+{
+	return le64_to_cpu(*((u64 *)desc_ptr));
+}
+
+static void flexrm_write_desc(void *desc_ptr, u64 desc)
+{
+	*((u64 *)desc_ptr) = cpu_to_le64(desc);
+}
+
+static u32 flexrm_cmpl_desc_to_reqid(u64 cmpl_desc)
+{
+	return (u32)(cmpl_desc & CMPL_OPAQUE_MASK);
+}
+
+static int flexrm_cmpl_desc_to_error(u64 cmpl_desc)
+{
+	u32 status;
+
+	status = DESC_DEC(cmpl_desc, CMPL_DME_STATUS_SHIFT,
+			  CMPL_DME_STATUS_MASK);
+	if (status & DME_STATUS_ERROR_MASK)
+		return -EIO;
+
+	status = DESC_DEC(cmpl_desc, CMPL_RM_STATUS_SHIFT,
+			  CMPL_RM_STATUS_MASK);
+	status &= RM_STATUS_CODE_MASK;
+	if (status == RM_STATUS_CODE_AE_TIMEOUT)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+
+static bool flexrm_is_next_table_desc(void *desc_ptr)
+{
+	u64 desc = flexrm_read_desc(desc_ptr);
+	u32 type = DESC_DEC(desc, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+
+	return (type == NPTR_TYPE) ? true : false;
+}
+
+static u64 flexrm_next_table_desc(u32 toggle, dma_addr_t next_addr)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, NPTR_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, toggle, NPTR_TOGGLE_SHIFT, NPTR_TOGGLE_MASK);
+	DESC_ENC(desc, next_addr, NPTR_ADDR_SHIFT, NPTR_ADDR_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_null_desc(u32 toggle)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, NULL_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, toggle, NULL_TOGGLE_SHIFT, NULL_TOGGLE_MASK);
+
+	return desc;
+}
+
+static u32 flexrm_estimate_header_desc_count(u32 nhcnt)
+{
+	u32 hcnt = nhcnt / HEADER_BDCOUNT_MAX;
+
+	if (!(nhcnt % HEADER_BDCOUNT_MAX))
+		hcnt += 1;
+
+	return hcnt;
+}
+
+static void flexrm_flip_header_toogle(void *desc_ptr)
+{
+	u64 desc = flexrm_read_desc(desc_ptr);
+
+	if (desc & ((u64)0x1 << HEADER_TOGGLE_SHIFT))
+		desc &= ~((u64)0x1 << HEADER_TOGGLE_SHIFT);
+	else
+		desc |= ((u64)0x1 << HEADER_TOGGLE_SHIFT);
+
+	flexrm_write_desc(desc_ptr, desc);
+}
+
+static u64 flexrm_header_desc(u32 toggle, u32 startpkt, u32 endpkt,
+			       u32 bdcount, u32 flags, u32 opaque)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, HEADER_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, toggle, HEADER_TOGGLE_SHIFT, HEADER_TOGGLE_MASK);
+	DESC_ENC(desc, startpkt, HEADER_STARTPKT_SHIFT, HEADER_STARTPKT_MASK);
+	DESC_ENC(desc, endpkt, HEADER_ENDPKT_SHIFT, HEADER_ENDPKT_MASK);
+	DESC_ENC(desc, bdcount, HEADER_BDCOUNT_SHIFT, HEADER_BDCOUNT_MASK);
+	DESC_ENC(desc, flags, HEADER_FLAGS_SHIFT, HEADER_FLAGS_MASK);
+	DESC_ENC(desc, opaque, HEADER_OPAQUE_SHIFT, HEADER_OPAQUE_MASK);
+
+	return desc;
+}
+
+static void flexrm_enqueue_desc(u32 nhpos, u32 nhcnt, u32 reqid,
+				 u64 desc, void **desc_ptr, u32 *toggle,
+				 void *start_desc, void *end_desc)
+{
+	u64 d;
+	u32 nhavail, _toggle, _startpkt, _endpkt, _bdcount;
+
+	/* Sanity check */
+	if (nhcnt <= nhpos)
+		return;
+
+	/*
+	 * Each request or packet start with a HEADER descriptor followed
+	 * by one or more non-HEADER descriptors (SRC, SRCT, MSRC, DST,
+	 * DSTT, MDST, IMM, and IMMT). The number of non-HEADER descriptors
+	 * following a HEADER descriptor is represented by BDCOUNT field
+	 * of HEADER descriptor. The max value of BDCOUNT field is 31 which
+	 * means we can only have 31 non-HEADER descriptors following one
+	 * HEADER descriptor.
+	 *
+	 * In general use, number of non-HEADER descriptors can easily go
+	 * beyond 31. To tackle this situation, we have packet (or request)
+	 * extenstion bits (STARTPKT and ENDPKT) in the HEADER descriptor.
+	 *
+	 * To use packet extension, the first HEADER descriptor of request
+	 * (or packet) will have STARTPKT=1 and ENDPKT=0. The intermediate
+	 * HEADER descriptors will have STARTPKT=0 and ENDPKT=0. The last
+	 * HEADER descriptor will have STARTPKT=0 and ENDPKT=1. Also, the
+	 * TOGGLE bit of the first HEADER will be set to invalid state to
+	 * ensure that FlexRM does not start fetching descriptors till all
+	 * descriptors are enqueued. The user of this function will flip
+	 * the TOGGLE bit of first HEADER after all descriptors are
+	 * enqueued.
+	 */
+
+	if ((nhpos % HEADER_BDCOUNT_MAX == 0) && (nhcnt - nhpos)) {
+		/* Prepare the header descriptor */
+		nhavail = (nhcnt - nhpos);
+		_toggle = (nhpos == 0) ? !(*toggle) : (*toggle);
+		_startpkt = (nhpos == 0) ? 0x1 : 0x0;
+		_endpkt = (nhavail <= HEADER_BDCOUNT_MAX) ? 0x1 : 0x0;
+		_bdcount = (nhavail <= HEADER_BDCOUNT_MAX) ?
+				nhavail : HEADER_BDCOUNT_MAX;
+		if (nhavail <= HEADER_BDCOUNT_MAX)
+			_bdcount = nhavail;
+		else
+			_bdcount = HEADER_BDCOUNT_MAX;
+		d = flexrm_header_desc(_toggle, _startpkt, _endpkt,
+					_bdcount, 0x0, reqid);
+
+		/* Write header descriptor */
+		flexrm_write_desc(*desc_ptr, d);
+
+		/* Point to next descriptor */
+		*desc_ptr += sizeof(desc);
+		if (*desc_ptr == end_desc)
+			*desc_ptr = start_desc;
+
+		/* Skip next pointer descriptors */
+		while (flexrm_is_next_table_desc(*desc_ptr)) {
+			*toggle = (*toggle) ? 0 : 1;
+			*desc_ptr += sizeof(desc);
+			if (*desc_ptr == end_desc)
+				*desc_ptr = start_desc;
+		}
+	}
+
+	/* Write desired descriptor */
+	flexrm_write_desc(*desc_ptr, desc);
+
+	/* Point to next descriptor */
+	*desc_ptr += sizeof(desc);
+	if (*desc_ptr == end_desc)
+		*desc_ptr = start_desc;
+
+	/* Skip next pointer descriptors */
+	while (flexrm_is_next_table_desc(*desc_ptr)) {
+		*toggle = (*toggle) ? 0 : 1;
+		*desc_ptr += sizeof(desc);
+		if (*desc_ptr == end_desc)
+			*desc_ptr = start_desc;
+	}
+}
+
+static u64 flexrm_src_desc(dma_addr_t addr, unsigned int length)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, SRC_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, length, SRC_LENGTH_SHIFT, SRC_LENGTH_MASK);
+	DESC_ENC(desc, addr, SRC_ADDR_SHIFT, SRC_ADDR_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_msrc_desc(dma_addr_t addr, unsigned int length_div_16)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, MSRC_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, length_div_16, MSRC_LENGTH_SHIFT, MSRC_LENGTH_MASK);
+	DESC_ENC(desc, addr, MSRC_ADDR_SHIFT, MSRC_ADDR_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_dst_desc(dma_addr_t addr, unsigned int length)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, DST_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, length, DST_LENGTH_SHIFT, DST_LENGTH_MASK);
+	DESC_ENC(desc, addr, DST_ADDR_SHIFT, DST_ADDR_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_mdst_desc(dma_addr_t addr, unsigned int length_div_16)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, MDST_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, length_div_16, MDST_LENGTH_SHIFT, MDST_LENGTH_MASK);
+	DESC_ENC(desc, addr, MDST_ADDR_SHIFT, MDST_ADDR_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_imm_desc(u64 data)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, IMM_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, data, IMM_DATA_SHIFT, IMM_DATA_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_srct_desc(dma_addr_t addr, unsigned int length)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, SRCT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, length, SRCT_LENGTH_SHIFT, SRCT_LENGTH_MASK);
+	DESC_ENC(desc, addr, SRCT_ADDR_SHIFT, SRCT_ADDR_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_dstt_desc(dma_addr_t addr, unsigned int length)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, DSTT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, length, DSTT_LENGTH_SHIFT, DSTT_LENGTH_MASK);
+	DESC_ENC(desc, addr, DSTT_ADDR_SHIFT, DSTT_ADDR_MASK);
+
+	return desc;
+}
+
+static u64 flexrm_immt_desc(u64 data)
+{
+	u64 desc = 0;
+
+	DESC_ENC(desc, IMMT_TYPE, DESC_TYPE_SHIFT, DESC_TYPE_MASK);
+	DESC_ENC(desc, data, IMMT_DATA_SHIFT, IMMT_DATA_MASK);
+
+	return desc;
+}
+
+static bool flexrm_spu_sanity_check(struct brcm_message *msg)
+{
+	struct scatterlist *sg;
+
+	if (!msg->spu.src || !msg->spu.dst)
+		return false;
+	for (sg = msg->spu.src; sg; sg = sg_next(sg)) {
+		if (sg->length & 0xf) {
+			if (sg->length > SRC_LENGTH_MASK)
+				return false;
+		} else {
+			if (sg->length > (MSRC_LENGTH_MASK * 16))
+				return false;
+		}
+	}
+	for (sg = msg->spu.dst; sg; sg = sg_next(sg)) {
+		if (sg->length & 0xf) {
+			if (sg->length > DST_LENGTH_MASK)
+				return false;
+		} else {
+			if (sg->length > (MDST_LENGTH_MASK * 16))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static u32 flexrm_spu_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+	u32 cnt = 0;
+	unsigned int dst_target = 0;
+	struct scatterlist *src_sg = msg->spu.src, *dst_sg = msg->spu.dst;
+
+	while (src_sg || dst_sg) {
+		if (src_sg) {
+			cnt++;
+			dst_target = src_sg->length;
+			src_sg = sg_next(src_sg);
+		} else
+			dst_target = UINT_MAX;
+
+		while (dst_target && dst_sg) {
+			cnt++;
+			if (dst_sg->length < dst_target)
+				dst_target -= dst_sg->length;
+			else
+				dst_target = 0;
+			dst_sg = sg_next(dst_sg);
+		}
+	}
+
+	return cnt;
+}
+
+static int flexrm_spu_dma_map(struct device *dev, struct brcm_message *msg)
+{
+	int rc;
+
+	rc = dma_map_sg(dev, msg->spu.src, sg_nents(msg->spu.src),
+			DMA_TO_DEVICE);
+	if (rc < 0)
+		return rc;
+
+	rc = dma_map_sg(dev, msg->spu.dst, sg_nents(msg->spu.dst),
+			DMA_FROM_DEVICE);
+	if (rc < 0) {
+		dma_unmap_sg(dev, msg->spu.src, sg_nents(msg->spu.src),
+			     DMA_TO_DEVICE);
+		return rc;
+	}
+
+	return 0;
+}
+
+static void flexrm_spu_dma_unmap(struct device *dev, struct brcm_message *msg)
+{
+	dma_unmap_sg(dev, msg->spu.dst, sg_nents(msg->spu.dst),
+		     DMA_FROM_DEVICE);
+	dma_unmap_sg(dev, msg->spu.src, sg_nents(msg->spu.src),
+		     DMA_TO_DEVICE);
+}
+
+static void *flexrm_spu_write_descs(struct brcm_message *msg, u32 nhcnt,
+				     u32 reqid, void *desc_ptr, u32 toggle,
+				     void *start_desc, void *end_desc)
+{
+	u64 d;
+	u32 nhpos = 0;
+	void *orig_desc_ptr = desc_ptr;
+	unsigned int dst_target = 0;
+	struct scatterlist *src_sg = msg->spu.src, *dst_sg = msg->spu.dst;
+
+	while (src_sg || dst_sg) {
+		if (src_sg) {
+			if (sg_dma_len(src_sg) & 0xf)
+				d = flexrm_src_desc(sg_dma_address(src_sg),
+						     sg_dma_len(src_sg));
+			else
+				d = flexrm_msrc_desc(sg_dma_address(src_sg),
+						      sg_dma_len(src_sg)/16);
+			flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+					     d, &desc_ptr, &toggle,
+					     start_desc, end_desc);
+			nhpos++;
+			dst_target = sg_dma_len(src_sg);
+			src_sg = sg_next(src_sg);
+		} else
+			dst_target = UINT_MAX;
+
+		while (dst_target && dst_sg) {
+			if (sg_dma_len(dst_sg) & 0xf)
+				d = flexrm_dst_desc(sg_dma_address(dst_sg),
+						     sg_dma_len(dst_sg));
+			else
+				d = flexrm_mdst_desc(sg_dma_address(dst_sg),
+						      sg_dma_len(dst_sg)/16);
+			flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+					     d, &desc_ptr, &toggle,
+					     start_desc, end_desc);
+			nhpos++;
+			if (sg_dma_len(dst_sg) < dst_target)
+				dst_target -= sg_dma_len(dst_sg);
+			else
+				dst_target = 0;
+			dst_sg = sg_next(dst_sg);
+		}
+	}
+
+	/* Null descriptor with invalid toggle bit */
+	flexrm_write_desc(desc_ptr, flexrm_null_desc(!toggle));
+
+	/* Ensure that descriptors have been written to memory */
+	wmb();
+
+	/* Flip toggle bit in header */
+	flexrm_flip_header_toogle(orig_desc_ptr);
+
+	return desc_ptr;
+}
+
+static bool flexrm_sba_sanity_check(struct brcm_message *msg)
+{
+	u32 i;
+
+	if (!msg->sba.cmds || !msg->sba.cmds_count)
+		return false;
+
+	for (i = 0; i < msg->sba.cmds_count; i++) {
+		if (((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) ||
+		     (msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C)) &&
+		    (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT))
+			return false;
+		if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) &&
+		    (msg->sba.cmds[i].data_len > SRCT_LENGTH_MASK))
+			return false;
+		if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C) &&
+		    (msg->sba.cmds[i].data_len > SRCT_LENGTH_MASK))
+			return false;
+		if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_RESP) &&
+		    (msg->sba.cmds[i].resp_len > DSTT_LENGTH_MASK))
+			return false;
+		if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT) &&
+		    (msg->sba.cmds[i].data_len > DSTT_LENGTH_MASK))
+			return false;
+	}
+
+	return true;
+}
+
+static u32 flexrm_sba_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+	u32 i, cnt;
+
+	cnt = 0;
+	for (i = 0; i < msg->sba.cmds_count; i++) {
+		cnt++;
+
+		if ((msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_B) ||
+		    (msg->sba.cmds[i].flags & BRCM_SBA_CMD_TYPE_C))
+			cnt++;
+
+		if (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_RESP)
+			cnt++;
+
+		if (msg->sba.cmds[i].flags & BRCM_SBA_CMD_HAS_OUTPUT)
+			cnt++;
+	}
+
+	return cnt;
+}
+
+static void *flexrm_sba_write_descs(struct brcm_message *msg, u32 nhcnt,
+				     u32 reqid, void *desc_ptr, u32 toggle,
+				     void *start_desc, void *end_desc)
+{
+	u64 d;
+	u32 i, nhpos = 0;
+	struct brcm_sba_command *c;
+	void *orig_desc_ptr = desc_ptr;
+
+	/* Convert SBA commands into descriptors */
+	for (i = 0; i < msg->sba.cmds_count; i++) {
+		c = &msg->sba.cmds[i];
+
+		if ((c->flags & BRCM_SBA_CMD_HAS_RESP) &&
+		    (c->flags & BRCM_SBA_CMD_HAS_OUTPUT)) {
+			/* Destination response descriptor */
+			d = flexrm_dst_desc(c->resp, c->resp_len);
+			flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+					     d, &desc_ptr, &toggle,
+					     start_desc, end_desc);
+			nhpos++;
+		} else if (c->flags & BRCM_SBA_CMD_HAS_RESP) {
+			/* Destination response with tlast descriptor */
+			d = flexrm_dstt_desc(c->resp, c->resp_len);
+			flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+					     d, &desc_ptr, &toggle,
+					     start_desc, end_desc);
+			nhpos++;
+		}
+
+		if (c->flags & BRCM_SBA_CMD_HAS_OUTPUT) {
+			/* Destination with tlast descriptor */
+			d = flexrm_dstt_desc(c->data, c->data_len);
+			flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+					     d, &desc_ptr, &toggle,
+					     start_desc, end_desc);
+			nhpos++;
+		}
+
+		if (c->flags & BRCM_SBA_CMD_TYPE_B) {
+			/* Command as immediate descriptor */
+			d = flexrm_imm_desc(c->cmd);
+			flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+					     d, &desc_ptr, &toggle,
+					     start_desc, end_desc);
+			nhpos++;
+		} else {
+			/* Command as immediate descriptor with tlast */
+			d = flexrm_immt_desc(c->cmd);
+			flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+					     d, &desc_ptr, &toggle,
+					     start_desc, end_desc);
+			nhpos++;
+		}
+
+		if ((c->flags & BRCM_SBA_CMD_TYPE_B) ||
+		    (c->flags & BRCM_SBA_CMD_TYPE_C)) {
+			/* Source with tlast descriptor */
+			d = flexrm_srct_desc(c->data, c->data_len);
+			flexrm_enqueue_desc(nhpos, nhcnt, reqid,
+					     d, &desc_ptr, &toggle,
+					     start_desc, end_desc);
+			nhpos++;
+		}
+	}
+
+	/* Null descriptor with invalid toggle bit */
+	flexrm_write_desc(desc_ptr, flexrm_null_desc(!toggle));
+
+	/* Ensure that descriptors have been written to memory */
+	wmb();
+
+	/* Flip toggle bit in header */
+	flexrm_flip_header_toogle(orig_desc_ptr);
+
+	return desc_ptr;
+}
+
+static bool flexrm_sanity_check(struct brcm_message *msg)
+{
+	if (!msg)
+		return false;
+
+	switch (msg->type) {
+	case BRCM_MESSAGE_SPU:
+		return flexrm_spu_sanity_check(msg);
+	case BRCM_MESSAGE_SBA:
+		return flexrm_sba_sanity_check(msg);
+	default:
+		return false;
+	};
+}
+
+static u32 flexrm_estimate_nonheader_desc_count(struct brcm_message *msg)
+{
+	if (!msg)
+		return 0;
+
+	switch (msg->type) {
+	case BRCM_MESSAGE_SPU:
+		return flexrm_spu_estimate_nonheader_desc_count(msg);
+	case BRCM_MESSAGE_SBA:
+		return flexrm_sba_estimate_nonheader_desc_count(msg);
+	default:
+		return 0;
+	};
+}
+
+static int flexrm_dma_map(struct device *dev, struct brcm_message *msg)
+{
+	if (!dev || !msg)
+		return -EINVAL;
+
+	switch (msg->type) {
+	case BRCM_MESSAGE_SPU:
+		return flexrm_spu_dma_map(dev, msg);
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+static void flexrm_dma_unmap(struct device *dev, struct brcm_message *msg)
+{
+	if (!dev || !msg)
+		return;
+
+	switch (msg->type) {
+	case BRCM_MESSAGE_SPU:
+		flexrm_spu_dma_unmap(dev, msg);
+		break;
+	default:
+		break;
+	}
+}
+
+static void *flexrm_write_descs(struct brcm_message *msg, u32 nhcnt,
+				u32 reqid, void *desc_ptr, u32 toggle,
+				void *start_desc, void *end_desc)
+{
+	if (!msg || !desc_ptr || !start_desc || !end_desc)
+		return ERR_PTR(-ENOTSUPP);
+
+	if ((desc_ptr < start_desc) || (end_desc <= desc_ptr))
+		return ERR_PTR(-ERANGE);
+
+	switch (msg->type) {
+	case BRCM_MESSAGE_SPU:
+		return flexrm_spu_write_descs(msg, nhcnt, reqid,
+					       desc_ptr, toggle,
+					       start_desc, end_desc);
+	case BRCM_MESSAGE_SBA:
+		return flexrm_sba_write_descs(msg, nhcnt, reqid,
+					       desc_ptr, toggle,
+					       start_desc, end_desc);
+	default:
+		return ERR_PTR(-ENOTSUPP);
+	};
+}
+
+/* ====== FlexRM driver helper routines ===== */
+
+static int flexrm_new_request(struct flexrm_ring *ring,
+				struct brcm_message *batch_msg,
+				struct brcm_message *msg)
+{
+	void *next;
+	unsigned long flags;
+	u32 val, count, nhcnt;
+	u32 read_offset, write_offset;
+	bool exit_cleanup = false;
+	int ret = 0, reqid;
+
+	/* Do sanity check on message */
+	if (!flexrm_sanity_check(msg))
+		return -EIO;
+	msg->error = 0;
+
+	/* If no requests possible then save data pointer and goto done. */
+	reqid = ida_simple_get(&ring->requests_ida, 0,
+				RING_MAX_REQ_COUNT, GFP_KERNEL);
+	if (reqid < 0) {
+		spin_lock_irqsave(&ring->lock, flags);
+		if (batch_msg)
+			ring->last_pending_msg = batch_msg;
+		else
+			ring->last_pending_msg = msg;
+		spin_unlock_irqrestore(&ring->lock, flags);
+		return 0;
+	}
+	ring->requests[reqid] = msg;
+
+	/* Do DMA mappings for the message */
+	ret = flexrm_dma_map(ring->mbox->dev, msg);
+	if (ret < 0) {
+		ring->requests[reqid] = NULL;
+		ida_simple_remove(&ring->requests_ida, reqid);
+		return ret;
+	}
+
+	/* If last_pending_msg is already set then goto done with error */
+	spin_lock_irqsave(&ring->lock, flags);
+	if (ring->last_pending_msg)
+		ret = -ENOSPC;
+	spin_unlock_irqrestore(&ring->lock, flags);
+	if (ret < 0) {
+		dev_warn(ring->mbox->dev, "no space in ring %d\n", ring->num);
+		exit_cleanup = true;
+		goto exit;
+	}
+
+	/* Determine current HW BD read offset */
+	read_offset = readl_relaxed(ring->regs + RING_BD_READ_PTR);
+	val = readl_relaxed(ring->regs + RING_BD_START_ADDR);
+	read_offset *= RING_DESC_SIZE;
+	read_offset += (u32)(BD_START_ADDR_DECODE(val) - ring->bd_dma_base);
+
+	/*
+	 * Number required descriptors = number of non-header descriptors +
+	 *				 number of header descriptors +
+	 *				 1x null descriptor
+	 */
+	nhcnt = flexrm_estimate_nonheader_desc_count(msg);
+	count = flexrm_estimate_header_desc_count(nhcnt) + nhcnt + 1;
+
+	/* Check for available descriptor space. */
+	write_offset = ring->bd_write_offset;
+	while (count) {
+		if (!flexrm_is_next_table_desc(ring->bd_base + write_offset))
+			count--;
+		write_offset += RING_DESC_SIZE;
+		if (write_offset == RING_BD_SIZE)
+			write_offset = 0x0;
+		if (write_offset == read_offset)
+			break;
+	}
+	if (count) {
+		spin_lock_irqsave(&ring->lock, flags);
+		if (batch_msg)
+			ring->last_pending_msg = batch_msg;
+		else
+			ring->last_pending_msg = msg;
+		spin_unlock_irqrestore(&ring->lock, flags);
+		ret = 0;
+		exit_cleanup = true;
+		goto exit;
+	}
+
+	/* Write descriptors to ring */
+	next = flexrm_write_descs(msg, nhcnt, reqid,
+			ring->bd_base + ring->bd_write_offset,
+			RING_BD_TOGGLE_VALID(ring->bd_write_offset),
+			ring->bd_base, ring->bd_base + RING_BD_SIZE);
+	if (IS_ERR(next)) {
+		ret = PTR_ERR(next);
+		exit_cleanup = true;
+		goto exit;
+	}
+
+	/* Save ring BD write offset */
+	ring->bd_write_offset = (unsigned long)(next - ring->bd_base);
+
+exit:
+	/* Update error status in message */
+	msg->error = ret;
+
+	/* Cleanup if we failed */
+	if (exit_cleanup) {
+		flexrm_dma_unmap(ring->mbox->dev, msg);
+		ring->requests[reqid] = NULL;
+		ida_simple_remove(&ring->requests_ida, reqid);
+	}
+
+	return ret;
+}
+
+static int flexrm_process_completions(struct flexrm_ring *ring)
+{
+	u64 desc;
+	int err, count = 0;
+	unsigned long flags;
+	struct brcm_message *msg = NULL;
+	u32 reqid, cmpl_read_offset, cmpl_write_offset;
+	struct mbox_chan *chan = &ring->mbox->controller.chans[ring->num];
+
+	spin_lock_irqsave(&ring->lock, flags);
+
+	/* Check last_pending_msg */
+	if (ring->last_pending_msg) {
+		msg = ring->last_pending_msg;
+		ring->last_pending_msg = NULL;
+	}
+
+	/*
+	 * Get current completion read and write offset
+	 *
+	 * Note: We should read completion write pointer atleast once
+	 * after we get a MSI interrupt because HW maintains internal
+	 * MSI status which will allow next MSI interrupt only after
+	 * completion write pointer is read.
+	 */
+	cmpl_write_offset = readl_relaxed(ring->regs + RING_CMPL_WRITE_PTR);
+	cmpl_write_offset *= RING_DESC_SIZE;
+	cmpl_read_offset = ring->cmpl_read_offset;
+	ring->cmpl_read_offset = cmpl_write_offset;
+
+	spin_unlock_irqrestore(&ring->lock, flags);
+
+	/* If last_pending_msg was set then queue it back */
+	if (msg)
+		mbox_send_message(chan, msg);
+
+	/* For each completed request notify mailbox clients */
+	reqid = 0;
+	while (cmpl_read_offset != cmpl_write_offset) {
+		/* Dequeue next completion descriptor */
+		desc = *((u64 *)(ring->cmpl_base + cmpl_read_offset));
+
+		/* Next read offset */
+		cmpl_read_offset += RING_DESC_SIZE;
+		if (cmpl_read_offset == RING_CMPL_SIZE)
+			cmpl_read_offset = 0;
+
+		/* Decode error from completion descriptor */
+		err = flexrm_cmpl_desc_to_error(desc);
+		if (err < 0) {
+			dev_warn(ring->mbox->dev,
+				 "got completion desc=0x%lx with error %d",
+				 (unsigned long)desc, err);
+		}
+
+		/* Determine request id from completion descriptor */
+		reqid = flexrm_cmpl_desc_to_reqid(desc);
+
+		/* Determine message pointer based on reqid */
+		msg = ring->requests[reqid];
+		if (!msg) {
+			dev_warn(ring->mbox->dev,
+				 "null msg pointer for completion desc=0x%lx",
+				 (unsigned long)desc);
+			continue;
+		}
+
+		/* Release reqid for recycling */
+		ring->requests[reqid] = NULL;
+		ida_simple_remove(&ring->requests_ida, reqid);
+
+		/* Unmap DMA mappings */
+		flexrm_dma_unmap(ring->mbox->dev, msg);
+
+		/* Give-back message to mailbox client */
+		msg->error = err;
+		mbox_chan_received_data(chan, msg);
+
+		/* Increment number of completions processed */
+		count++;
+	}
+
+	return count;
+}
+
+/* ====== FlexRM interrupt handler ===== */
+
+static irqreturn_t flexrm_irq_event(int irq, void *dev_id)
+{
+	/* We only have MSI for completions so just wakeup IRQ thread */
+	/* Ring related errors will be informed via completion descriptors */
+
+	return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t flexrm_irq_thread(int irq, void *dev_id)
+{
+	flexrm_process_completions(dev_id);
+
+	return IRQ_HANDLED;
+}
+
+/* ====== FlexRM mailbox callbacks ===== */
+
+static int flexrm_send_data(struct mbox_chan *chan, void *data)
+{
+	int i, rc;
+	struct flexrm_ring *ring = chan->con_priv;
+	struct brcm_message *msg = data;
+
+	if (msg->type == BRCM_MESSAGE_BATCH) {
+		for (i = msg->batch.msgs_queued;
+		     i < msg->batch.msgs_count; i++) {
+			rc = flexrm_new_request(ring, msg,
+						 &msg->batch.msgs[i]);
+			if (rc) {
+				msg->error = rc;
+				return rc;
+			}
+			msg->batch.msgs_queued++;
+		}
+		return 0;
+	}
+
+	return flexrm_new_request(ring, NULL, data);
+}
+
+static bool flexrm_peek_data(struct mbox_chan *chan)
+{
+	int cnt = flexrm_process_completions(chan->con_priv);
+
+	return (cnt > 0) ? true : false;
+}
+
+static int flexrm_startup(struct mbox_chan *chan)
+{
+	u64 d;
+	u32 val, off;
+	int ret = 0;
+	dma_addr_t next_addr;
+	struct flexrm_ring *ring = chan->con_priv;
+
+	/* Allocate BD memory */
+	ring->bd_base = dma_pool_alloc(ring->mbox->bd_pool,
+				       GFP_KERNEL, &ring->bd_dma_base);
+	if (!ring->bd_base) {
+		dev_err(ring->mbox->dev, "can't allocate BD memory\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/* Configure next table pointer entries in BD memory */
+	for (off = 0; off < RING_BD_SIZE; off += RING_DESC_SIZE) {
+		next_addr = off + RING_DESC_SIZE;
+		if (next_addr == RING_BD_SIZE)
+			next_addr = 0;
+		next_addr += ring->bd_dma_base;
+		if (RING_BD_ALIGN_CHECK(next_addr))
+			d = flexrm_next_table_desc(RING_BD_TOGGLE_VALID(off),
+						    next_addr);
+		else
+			d = flexrm_null_desc(RING_BD_TOGGLE_INVALID(off));
+		flexrm_write_desc(ring->bd_base + off, d);
+	}
+
+	/* Allocate completion memory */
+	ring->cmpl_base = dma_pool_alloc(ring->mbox->cmpl_pool,
+					 GFP_KERNEL, &ring->cmpl_dma_base);
+	if (!ring->cmpl_base) {
+		dev_err(ring->mbox->dev, "can't allocate completion memory\n");
+		ret = -ENOMEM;
+		goto fail_free_bd_memory;
+	}
+	memset(ring->cmpl_base, 0, RING_CMPL_SIZE);
+
+	/* Request IRQ */
+	if (ring->irq == UINT_MAX) {
+		dev_err(ring->mbox->dev, "ring IRQ not available\n");
+		ret = -ENODEV;
+		goto fail_free_cmpl_memory;
+	}
+	ret = request_threaded_irq(ring->irq,
+				   flexrm_irq_event,
+				   flexrm_irq_thread,
+				   0, dev_name(ring->mbox->dev), ring);
+	if (ret) {
+		dev_err(ring->mbox->dev, "failed to request ring IRQ\n");
+		goto fail_free_cmpl_memory;
+	}
+	ring->irq_requested = true;
+
+	/* Disable/inactivate ring */
+	writel_relaxed(0x0, ring->regs + RING_CONTROL);
+
+	/* Program BD start address */
+	val = BD_START_ADDR_VALUE(ring->bd_dma_base);
+	writel_relaxed(val, ring->regs + RING_BD_START_ADDR);
+
+	/* BD write pointer will be same as HW write pointer */
+	ring->bd_write_offset =
+			readl_relaxed(ring->regs + RING_BD_WRITE_PTR);
+	ring->bd_write_offset *= RING_DESC_SIZE;
+
+	/* Program completion start address */
+	val = CMPL_START_ADDR_VALUE(ring->cmpl_dma_base);
+	writel_relaxed(val, ring->regs + RING_CMPL_START_ADDR);
+
+	/* Ensure last pending message is cleared */
+	ring->last_pending_msg = NULL;
+
+	/* Completion read pointer will be same as HW write pointer */
+	ring->cmpl_read_offset =
+			readl_relaxed(ring->regs + RING_CMPL_WRITE_PTR);
+	ring->cmpl_read_offset *= RING_DESC_SIZE;
+
+	/* Read ring Tx, Rx, and Outstanding counts to clear */
+	readl_relaxed(ring->regs + RING_NUM_REQ_RECV_LS);
+	readl_relaxed(ring->regs + RING_NUM_REQ_RECV_MS);
+	readl_relaxed(ring->regs + RING_NUM_REQ_TRANS_LS);
+	readl_relaxed(ring->regs + RING_NUM_REQ_TRANS_MS);
+	readl_relaxed(ring->regs + RING_NUM_REQ_OUTSTAND);
+
+	/* Configure RING_MSI_CONTROL */
+	val = 0;
+	val |= (ring->msi_timer_val << MSI_TIMER_VAL_SHIFT);
+	val |= BIT(MSI_ENABLE_SHIFT);
+	val |= (ring->msi_count_threshold & MSI_COUNT_MASK) << MSI_COUNT_SHIFT;
+	writel_relaxed(val, ring->regs + RING_MSI_CONTROL);
+
+	/* Enable/activate ring */
+	val = BIT(CONTROL_ACTIVE_SHIFT);
+	writel_relaxed(val, ring->regs + RING_CONTROL);
+
+	return 0;
+
+fail_free_cmpl_memory:
+	dma_pool_free(ring->mbox->cmpl_pool,
+		      ring->cmpl_base, ring->cmpl_dma_base);
+	ring->cmpl_base = NULL;
+fail_free_bd_memory:
+	dma_pool_free(ring->mbox->bd_pool,
+		      ring->bd_base, ring->bd_dma_base);
+	ring->bd_base = NULL;
+fail:
+	return ret;
+}
+
+static void flexrm_shutdown(struct mbox_chan *chan)
+{
+	u32 reqid;
+	unsigned int timeout;
+	struct brcm_message *msg;
+	struct flexrm_ring *ring = chan->con_priv;
+
+	/* Disable/inactivate ring */
+	writel_relaxed(0x0, ring->regs + RING_CONTROL);
+
+	/* Flush ring with timeout of 1s */
+	timeout = 1000;
+	writel_relaxed(BIT(CONTROL_FLUSH_SHIFT),
+			ring->regs + RING_CONTROL);
+	do {
+		if (readl_relaxed(ring->regs + RING_FLUSH_DONE) &
+		    FLUSH_DONE_MASK)
+			break;
+		mdelay(1);
+	} while (timeout--);
+
+	/* Abort all in-flight requests */
+	for (reqid = 0; reqid < RING_MAX_REQ_COUNT; reqid++) {
+		msg = ring->requests[reqid];
+		if (!msg)
+			continue;
+
+		/* Release reqid for recycling */
+		ring->requests[reqid] = NULL;
+		ida_simple_remove(&ring->requests_ida, reqid);
+
+		/* Unmap DMA mappings */
+		flexrm_dma_unmap(ring->mbox->dev, msg);
+
+		/* Give-back message to mailbox client */
+		msg->error = -EIO;
+		mbox_chan_received_data(chan, msg);
+	}
+
+	/* Release IRQ */
+	if (ring->irq_requested) {
+		free_irq(ring->irq, ring);
+		ring->irq_requested = false;
+	}
+
+	/* Free-up completion descriptor ring */
+	if (ring->cmpl_base) {
+		dma_pool_free(ring->mbox->cmpl_pool,
+			      ring->cmpl_base, ring->cmpl_dma_base);
+		ring->cmpl_base = NULL;
+	}
+
+	/* Free-up BD descriptor ring */
+	if (ring->bd_base) {
+		dma_pool_free(ring->mbox->bd_pool,
+			      ring->bd_base, ring->bd_dma_base);
+		ring->bd_base = NULL;
+	}
+}
+
+static bool flexrm_last_tx_done(struct mbox_chan *chan)
+{
+	bool ret;
+	unsigned long flags;
+	struct flexrm_ring *ring = chan->con_priv;
+
+	spin_lock_irqsave(&ring->lock, flags);
+	ret = (ring->last_pending_msg) ? false : true;
+	spin_unlock_irqrestore(&ring->lock, flags);
+
+	return ret;
+}
+
+static const struct mbox_chan_ops flexrm_mbox_chan_ops = {
+	.send_data	= flexrm_send_data,
+	.startup	= flexrm_startup,
+	.shutdown	= flexrm_shutdown,
+	.last_tx_done	= flexrm_last_tx_done,
+	.peek_data	= flexrm_peek_data,
+};
+
+static struct mbox_chan *flexrm_mbox_of_xlate(struct mbox_controller *cntlr,
+					const struct of_phandle_args *pa)
+{
+	struct mbox_chan *chan;
+	struct flexrm_ring *ring;
+
+	if (pa->args_count < 3)
+		return ERR_PTR(-EINVAL);
+
+	if (pa->args[0] >= cntlr->num_chans)
+		return ERR_PTR(-ENOENT);
+
+	if (pa->args[1] > MSI_COUNT_MASK)
+		return ERR_PTR(-EINVAL);
+
+	if (pa->args[2] > MSI_TIMER_VAL_MASK)
+		return ERR_PTR(-EINVAL);
+
+	chan = &cntlr->chans[pa->args[0]];
+	ring = chan->con_priv;
+	ring->msi_count_threshold = pa->args[1];
+	ring->msi_timer_val = pa->args[2];
+
+	return chan;
+}
+
+/* ====== FlexRM platform driver ===== */
+
+static void flexrm_mbox_msi_write(struct msi_desc *desc, struct msi_msg *msg)
+{
+	struct device *dev = msi_desc_to_dev(desc);
+	struct flexrm_mbox *mbox = dev_get_drvdata(dev);
+	struct flexrm_ring *ring = &mbox->rings[desc->platform.msi_index];
+
+	/* Configure per-Ring MSI registers */
+	writel_relaxed(msg->address_lo, ring->regs + RING_MSI_ADDR_LS);
+	writel_relaxed(msg->address_hi, ring->regs + RING_MSI_ADDR_MS);
+	writel_relaxed(msg->data, ring->regs + RING_MSI_DATA_VALUE);
+}
+
+static int flexrm_mbox_probe(struct platform_device *pdev)
+{
+	int index, ret = 0;
+	void __iomem *regs;
+	void __iomem *regs_end;
+	struct msi_desc *desc;
+	struct resource *iomem;
+	struct flexrm_ring *ring;
+	struct flexrm_mbox *mbox;
+	struct device *dev = &pdev->dev;
+
+	/* Allocate driver mailbox struct */
+	mbox = devm_kzalloc(dev, sizeof(*mbox), GFP_KERNEL);
+	if (!mbox) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	mbox->dev = dev;
+	platform_set_drvdata(pdev, mbox);
+
+	/* Get resource for registers */
+	iomem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!iomem || (resource_size(iomem) < RING_REGS_SIZE)) {
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	/* Map registers of all rings */
+	mbox->regs = devm_ioremap_resource(&pdev->dev, iomem);
+	if (IS_ERR(mbox->regs)) {
+		ret = PTR_ERR(mbox->regs);
+		dev_err(&pdev->dev, "Failed to remap mailbox regs: %d\n", ret);
+		goto fail;
+	}
+	regs_end = mbox->regs + resource_size(iomem);
+
+	/* Scan and count available rings */
+	mbox->num_rings = 0;
+	for (regs = mbox->regs; regs < regs_end; regs += RING_REGS_SIZE) {
+		if (readl_relaxed(regs + RING_VER) == RING_VER_MAGIC)
+			mbox->num_rings++;
+	}
+	if (!mbox->num_rings) {
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	/* Allocate driver ring structs */
+	ring = devm_kcalloc(dev, mbox->num_rings, sizeof(*ring), GFP_KERNEL);
+	if (!ring) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	mbox->rings = ring;
+
+	/* Initialize members of driver ring structs */
+	regs = mbox->regs;
+	for (index = 0; index < mbox->num_rings; index++) {
+		ring = &mbox->rings[index];
+		ring->num = index;
+		ring->mbox = mbox;
+		while ((regs < regs_end) &&
+		       (readl_relaxed(regs + RING_VER) != RING_VER_MAGIC))
+			regs += RING_REGS_SIZE;
+		if (regs_end <= regs) {
+			ret = -ENODEV;
+			goto fail;
+		}
+		ring->regs = regs;
+		regs += RING_REGS_SIZE;
+		ring->irq = UINT_MAX;
+		ring->irq_requested = false;
+		ring->msi_timer_val = MSI_TIMER_VAL_MASK;
+		ring->msi_count_threshold = 0x1;
+		ida_init(&ring->requests_ida);
+		memset(ring->requests, 0, sizeof(ring->requests));
+		ring->bd_base = NULL;
+		ring->bd_dma_base = 0;
+		ring->cmpl_base = NULL;
+		ring->cmpl_dma_base = 0;
+		spin_lock_init(&ring->lock);
+		ring->last_pending_msg = NULL;
+		ring->cmpl_read_offset = 0;
+	}
+
+	/* FlexRM is capable of 40-bit physical addresses only */
+	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(40));
+	if (ret) {
+		ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+		if (ret)
+			goto fail;
+	}
+
+	/* Create DMA pool for ring BD memory */
+	mbox->bd_pool = dma_pool_create("bd", dev, RING_BD_SIZE,
+					1 << RING_BD_ALIGN_ORDER, 0);
+	if (!mbox->bd_pool) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/* Create DMA pool for ring completion memory */
+	mbox->cmpl_pool = dma_pool_create("cmpl", dev, RING_CMPL_SIZE,
+					  1 << RING_CMPL_ALIGN_ORDER, 0);
+	if (!mbox->cmpl_pool) {
+		ret = -ENOMEM;
+		goto fail_destroy_bd_pool;
+	}
+
+	/* Allocate platform MSIs for each ring */
+	ret = platform_msi_domain_alloc_irqs(dev, mbox->num_rings,
+						flexrm_mbox_msi_write);
+	if (ret)
+		goto fail_destroy_cmpl_pool;
+
+	/* Save alloced IRQ numbers for each ring */
+	for_each_msi_entry(desc, dev) {
+		ring = &mbox->rings[desc->platform.msi_index];
+		ring->irq = desc->irq;
+	}
+
+	/* Initialize mailbox controller */
+	mbox->controller.txdone_irq = false;
+	mbox->controller.txdone_poll = true;
+	mbox->controller.txpoll_period = 1;
+	mbox->controller.ops = &flexrm_mbox_chan_ops;
+	mbox->controller.dev = dev;
+	mbox->controller.num_chans = mbox->num_rings;
+	mbox->controller.of_xlate = flexrm_mbox_of_xlate;
+	mbox->controller.chans = devm_kcalloc(dev, mbox->num_rings,
+				sizeof(*mbox->controller.chans), GFP_KERNEL);
+	if (!mbox->controller.chans) {
+		ret = -ENOMEM;
+		goto fail_free_msis;
+	}
+	for (index = 0; index < mbox->num_rings; index++)
+		mbox->controller.chans[index].con_priv = &mbox->rings[index];
+
+	/* Register mailbox controller */
+	ret = mbox_controller_register(&mbox->controller);
+	if (ret)
+		goto fail_free_msis;
+
+	dev_info(dev, "registered flexrm mailbox with %d channels\n",
+			mbox->controller.num_chans);
+
+	return 0;
+
+fail_free_msis:
+	platform_msi_domain_free_irqs(dev);
+fail_destroy_cmpl_pool:
+	dma_pool_destroy(mbox->cmpl_pool);
+fail_destroy_bd_pool:
+	dma_pool_destroy(mbox->bd_pool);
+fail:
+	return ret;
+}
+
+static int flexrm_mbox_remove(struct platform_device *pdev)
+{
+	int index;
+	struct device *dev = &pdev->dev;
+	struct flexrm_ring *ring;
+	struct flexrm_mbox *mbox = platform_get_drvdata(pdev);
+
+	mbox_controller_unregister(&mbox->controller);
+
+	platform_msi_domain_free_irqs(dev);
+
+	dma_pool_destroy(mbox->cmpl_pool);
+	dma_pool_destroy(mbox->bd_pool);
+
+	for (index = 0; index < mbox->num_rings; index++) {
+		ring = &mbox->rings[index];
+		ida_destroy(&ring->requests_ida);
+	}
+
+	return 0;
+}
+
+static const struct of_device_id flexrm_mbox_of_match[] = {
+	{ .compatible = "brcm,iproc-flexrm-mbox", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, flexrm_mbox_of_match);
+
+static struct platform_driver flexrm_mbox_driver = {
+	.driver = {
+		.name = "brcm-flexrm-mbox",
+		.of_match_table = flexrm_mbox_of_match,
+	},
+	.probe		= flexrm_mbox_probe,
+	.remove		= flexrm_mbox_remove,
+};
+module_platform_driver(flexrm_mbox_driver);
+
+MODULE_AUTHOR("Anup Patel <anup.patel@broadcom.com>");
+MODULE_DESCRIPTION("Broadcom FlexRM mailbox driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/mailbox/brcm-message.h b/include/linux/mailbox/brcm-message.h
index 6b55c938b401..c20b4843fc2d 100644
--- a/include/linux/mailbox/brcm-message.h
+++ b/include/linux/mailbox/brcm-message.h
@@ -16,6 +16,7 @@
 
 enum brcm_message_type {
 	BRCM_MESSAGE_UNKNOWN = 0,
+	BRCM_MESSAGE_BATCH,
 	BRCM_MESSAGE_SPU,
 	BRCM_MESSAGE_SBA,
 	BRCM_MESSAGE_MAX,
@@ -23,23 +24,28 @@ enum brcm_message_type {
 
 struct brcm_sba_command {
 	u64 cmd;
+	u64 *cmd_dma;
+	dma_addr_t cmd_dma_addr;
 #define BRCM_SBA_CMD_TYPE_A		BIT(0)
 #define BRCM_SBA_CMD_TYPE_B		BIT(1)
 #define BRCM_SBA_CMD_TYPE_C		BIT(2)
 #define BRCM_SBA_CMD_HAS_RESP		BIT(3)
 #define BRCM_SBA_CMD_HAS_OUTPUT		BIT(4)
 	u64 flags;
-	dma_addr_t input;
-	size_t input_len;
 	dma_addr_t resp;
 	size_t resp_len;
-	dma_addr_t output;
-	size_t output_len;
+	dma_addr_t data;
+	size_t data_len;
 };
 
 struct brcm_message {
 	enum brcm_message_type type;
 	union {
+		struct {
+			struct brcm_message *msgs;
+			unsigned int msgs_queued;
+			unsigned int msgs_count;
+		} batch;
 		struct {
 			struct scatterlist *src;
 			struct scatterlist *dst;
-- 
cgit v1.3-6-gb490


From db68ce10c4f0a27c1ff9fa0e789e5c41f8c4ea63 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Mar 2017 21:08:07 -0400
Subject: new helper: uaccess_kernel()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/include/asm/uaccess.h     |  2 +-
 arch/arc/include/asm/uaccess.h       |  2 +-
 arch/arm/include/asm/uaccess.h       |  2 +-
 arch/arm/lib/uaccess_with_memcpy.c   |  4 ++--
 arch/blackfin/kernel/process.c       |  2 +-
 arch/c6x/kernel/sys_c6x.c            |  2 +-
 arch/cris/include/asm/uaccess.h      |  2 +-
 arch/m68k/include/asm/uaccess_mm.h   |  2 +-
 arch/metag/include/asm/uaccess.h     |  2 +-
 arch/mips/include/asm/checksum.h     |  4 ++--
 arch/mips/include/asm/r4kcache.h     |  4 ++--
 arch/mips/include/asm/uaccess.h      |  2 +-
 arch/mips/kernel/unaligned.c         | 10 +++++-----
 arch/openrisc/include/asm/uaccess.h  |  2 +-
 arch/parisc/include/asm/futex.h      |  2 +-
 arch/parisc/lib/memcpy.c             |  2 +-
 arch/s390/include/asm/uaccess.h      |  2 +-
 arch/sparc/include/asm/uaccess.h     |  2 +-
 arch/sparc/include/asm/uaccess_32.h  |  2 +-
 arch/um/include/asm/uaccess.h        |  2 +-
 arch/um/kernel/skas/uaccess.c        | 10 +++++-----
 arch/unicore32/include/asm/uaccess.h |  2 +-
 arch/unicore32/kernel/process.c      |  2 +-
 arch/xtensa/include/asm/uaccess.h    |  2 +-
 block/bsg.c                          |  2 +-
 drivers/scsi/sg.c                    |  2 +-
 include/linux/uaccess.h              |  2 ++
 include/rdma/ib.h                    |  2 +-
 kernel/trace/bpf_trace.c             |  2 +-
 lib/iov_iter.c                       |  2 +-
 mm/memory.c                          |  2 +-
 security/tomoyo/network.c            |  2 +-
 32 files changed, 44 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h
index 77c55ce89936..b2f62b98e290 100644
--- a/arch/alpha/include/asm/uaccess.h
+++ b/arch/alpha/include/asm/uaccess.h
@@ -425,7 +425,7 @@ clear_user(void __user *to, long len)
 #undef __module_call
 
 #define user_addr_max() \
-        (segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+        (uaccess_kernel() ? ~0UL : TASK_SIZE)
 
 extern long strncpy_from_user(char *dest, const char __user *src, long count);
 extern __must_check long strlen_user(const char __user *str);
diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h
index d837a53c6e59..ffd14e630aca 100644
--- a/arch/arc/include/asm/uaccess.h
+++ b/arch/arc/include/asm/uaccess.h
@@ -27,7 +27,7 @@
 #include <linux/string.h>	/* for generic string functions */
 
 
-#define __kernel_ok		(segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok		(uaccess_kernel())
 
 /*
  * Algorithmically, for __user_ok() we want do:
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 9677a7cf7987..b63527359d52 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -266,7 +266,7 @@ static inline void set_fs(mm_segment_t fs)
 #define access_ok(type, addr, size)	(__range_ok(addr, size) == 0)
 
 #define user_addr_max() \
-	(segment_eq(get_fs(), KERNEL_DS) ? ~0UL : get_fs())
+	(uaccess_kernel() ? ~0UL : get_fs())
 
 /*
  * The "__xxx" versions of the user access functions do not verify the
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 6bd1089b07e0..9b4ed1728616 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -90,7 +90,7 @@ __copy_to_user_memcpy(void __user *to, const void *from, unsigned long n)
 	unsigned long ua_flags;
 	int atomic;
 
-	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+	if (uaccess_kernel()) {
 		memcpy((void *)to, from, n);
 		return 0;
 	}
@@ -162,7 +162,7 @@ __clear_user_memset(void __user *addr, unsigned long n)
 {
 	unsigned long ua_flags;
 
-	if (unlikely(segment_eq(get_fs(), KERNEL_DS))) {
+	if (uaccess_kernel()) {
 		memset((void *)addr, 0, n);
 		return 0;
 	}
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 89d5162d4ca6..89814850b08b 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -370,7 +370,7 @@ int _access_ok(unsigned long addr, unsigned long size)
 	/* Check that things do not wrap around */
 	if (addr > ULONG_MAX - size)
 		return 0;
-	if (segment_eq(get_fs(), KERNEL_DS))
+	if (uaccess_kernel())
 		return 1;
 #ifdef CONFIG_MTD_UCLINUX
 	if (1)
diff --git a/arch/c6x/kernel/sys_c6x.c b/arch/c6x/kernel/sys_c6x.c
index 3e9bdfbee8ad..a742ae259239 100644
--- a/arch/c6x/kernel/sys_c6x.c
+++ b/arch/c6x/kernel/sys_c6x.c
@@ -23,7 +23,7 @@ int _access_ok(unsigned long addr, unsigned long size)
 	if (!addr || addr > (0xffffffffUL - (size - 1)))
 		goto _bad_access;
 
-	if (segment_eq(get_fs(), KERNEL_DS))
+	if (uaccess_kernel())
 		return 1;
 
 	if (memory_start <= addr && (addr + size - 1) < memory_end)
diff --git a/arch/cris/include/asm/uaccess.h b/arch/cris/include/asm/uaccess.h
index c2462ef04eaf..5f5b8f53d2d7 100644
--- a/arch/cris/include/asm/uaccess.h
+++ b/arch/cris/include/asm/uaccess.h
@@ -43,7 +43,7 @@
 
 #define segment_eq(a, b)	((a).seg == (b).seg)
 
-#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok (uaccess_kernel())
 #define __user_ok(addr, size) \
 	(((size) <= TASK_SIZE) && ((addr) <= TASK_SIZE-(size)))
 #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size)))
diff --git a/arch/m68k/include/asm/uaccess_mm.h b/arch/m68k/include/asm/uaccess_mm.h
index fb72b710759e..14054a4e4216 100644
--- a/arch/m68k/include/asm/uaccess_mm.h
+++ b/arch/m68k/include/asm/uaccess_mm.h
@@ -375,7 +375,7 @@ __constant_copy_to_user(void __user *to, const void *from, unsigned long n)
 #define copy_to_user(to, from, n)	__copy_to_user(to, from, n)
 
 #define user_addr_max() \
-	(segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+	(uaccess_kernel() ? ~0UL : TASK_SIZE)
 
 extern long strncpy_from_user(char *dst, const char __user *src, long count);
 extern __must_check long strlen_user(const char __user *str);
diff --git a/arch/metag/include/asm/uaccess.h b/arch/metag/include/asm/uaccess.h
index 7fc5277ae71f..83de7554d2b3 100644
--- a/arch/metag/include/asm/uaccess.h
+++ b/arch/metag/include/asm/uaccess.h
@@ -24,7 +24,7 @@
 
 #define segment_eq(a, b)	((a).seg == (b).seg)
 
-#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok (uaccess_kernel())
 /*
  * Explicitly allow NULL pointers here. Parts of the kernel such
  * as readv/writev use access_ok to validate pointers, but want
diff --git a/arch/mips/include/asm/checksum.h b/arch/mips/include/asm/checksum.h
index c8b574f7e0cc..77cad232a1c6 100644
--- a/arch/mips/include/asm/checksum.h
+++ b/arch/mips/include/asm/checksum.h
@@ -50,7 +50,7 @@ __wsum csum_partial_copy_from_user(const void __user *src, void *dst, int len,
 				   __wsum sum, int *err_ptr)
 {
 	might_fault();
-	if (segment_eq(get_fs(), get_ds()))
+	if (uaccess_kernel())
 		return __csum_partial_copy_kernel((__force void *)src, dst,
 						  len, sum, err_ptr);
 	else
@@ -82,7 +82,7 @@ __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len,
 {
 	might_fault();
 	if (access_ok(VERIFY_WRITE, dst, len)) {
-		if (segment_eq(get_fs(), get_ds()))
+		if (uaccess_kernel())
 			return __csum_partial_copy_kernel(src,
 							  (__force void *)dst,
 							  len, sum, err_ptr);
diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h
index 55fd94e6cd0b..7f12d7e27c94 100644
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -20,7 +20,7 @@
 #include <asm/cpu-features.h>
 #include <asm/cpu-type.h>
 #include <asm/mipsmtregs.h>
-#include <linux/uaccess.h> /* for segment_eq() */
+#include <linux/uaccess.h> /* for uaccess_kernel() */
 
 extern void (*r4k_blast_dcache)(void);
 extern void (*r4k_blast_icache)(void);
@@ -714,7 +714,7 @@ static inline void protected_blast_##pfx##cache##_range(unsigned long start,\
 									\
 	__##pfx##flush_prologue						\
 									\
-	if (segment_eq(get_fs(), USER_DS)) {				\
+	if (!uaccess_kernel()) {					\
 		while (1) {						\
 			protected_cachee_op(hitop, addr);		\
 			if (addr == aend)				\
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index dd25b312c973..70ca8eee166a 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -88,7 +88,7 @@ static inline bool eva_kernel_access(void)
 	if (!IS_ENABLED(CONFIG_EVA))
 		return false;
 
-	return segment_eq(get_fs(), get_ds());
+	return uaccess_kernel();
 }
 
 /*
diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
index 7ed98354fe9d..f806ee56e639 100644
--- a/arch/mips/kernel/unaligned.c
+++ b/arch/mips/kernel/unaligned.c
@@ -1026,7 +1026,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 			goto sigbus;
 
 		if (IS_ENABLED(CONFIG_EVA)) {
-			if (segment_eq(get_fs(), get_ds()))
+			if (uaccess_kernel())
 				LoadHW(addr, value, res);
 			else
 				LoadHWE(addr, value, res);
@@ -1045,7 +1045,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 			goto sigbus;
 
 		if (IS_ENABLED(CONFIG_EVA)) {
-			if (segment_eq(get_fs(), get_ds()))
+			if (uaccess_kernel())
 				LoadW(addr, value, res);
 			else
 				LoadWE(addr, value, res);
@@ -1064,7 +1064,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 			goto sigbus;
 
 		if (IS_ENABLED(CONFIG_EVA)) {
-			if (segment_eq(get_fs(), get_ds()))
+			if (uaccess_kernel())
 				LoadHWU(addr, value, res);
 			else
 				LoadHWUE(addr, value, res);
@@ -1132,7 +1132,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		value = regs->regs[insn.i_format.rt];
 
 		if (IS_ENABLED(CONFIG_EVA)) {
-			if (segment_eq(get_fs(), get_ds()))
+			if (uaccess_kernel())
 				StoreHW(addr, value, res);
 			else
 				StoreHWE(addr, value, res);
@@ -1152,7 +1152,7 @@ static void emulate_load_store_insn(struct pt_regs *regs,
 		value = regs->regs[insn.i_format.rt];
 
 		if (IS_ENABLED(CONFIG_EVA)) {
-			if (segment_eq(get_fs(), get_ds()))
+			if (uaccess_kernel())
 				StoreW(addr, value, res);
 			else
 				StoreWE(addr, value, res);
diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h
index 0b0f60444b76..227af1acb8bd 100644
--- a/arch/openrisc/include/asm/uaccess.h
+++ b/arch/openrisc/include/asm/uaccess.h
@@ -292,7 +292,7 @@ clear_user(void *addr, unsigned long size)
 }
 
 #define user_addr_max() \
-	(segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+	(uaccess_kernel() ? ~0UL : TASK_SIZE)
 
 extern long strncpy_from_user(char *dest, const char __user *src, long count);
 
diff --git a/arch/parisc/include/asm/futex.h b/arch/parisc/include/asm/futex.h
index ac8bd586ace8..0ba14300cd8e 100644
--- a/arch/parisc/include/asm/futex.h
+++ b/arch/parisc/include/asm/futex.h
@@ -109,7 +109,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	/* futex.c wants to do a cmpxchg_inatomic on kernel NULL, which is
 	 * our gateway page, and causes no end of trouble...
 	 */
-	if (segment_eq(KERNEL_DS, get_fs()) && !uaddr)
+	if (uaccess_kernel() && !uaddr)
 		return -EFAULT;
 
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c
index f82ff10ed974..66f5160136c2 100644
--- a/arch/parisc/lib/memcpy.c
+++ b/arch/parisc/lib/memcpy.c
@@ -76,7 +76,7 @@ DECLARE_PER_CPU(struct exception_data, exception_data);
 		goto label;						\
 } while (0)
 
-#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
+#define get_user_space() (uaccess_kernel() ? 0 : mfsp(3))
 #define get_kernel_space() (0)
 
 #define MERGE(w0, sh_1, w1, sh_2)  ({					\
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 9e9a5e8d6cf6..7228ed8da67d 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -37,7 +37,7 @@
 static inline void set_fs(mm_segment_t fs)
 {
 	current->thread.mm_segment = fs;
-	if (segment_eq(fs, KERNEL_DS)) {
+	if (uaccess_kernel()) {
 		set_cpu_flag(CIF_ASCE_SECONDARY);
 		__ctl_load(S390_lowcore.kernel_asce, 7, 7);
 	} else {
diff --git a/arch/sparc/include/asm/uaccess.h b/arch/sparc/include/asm/uaccess.h
index bd56c28fff9f..9e068bf9060a 100644
--- a/arch/sparc/include/asm/uaccess.h
+++ b/arch/sparc/include/asm/uaccess.h
@@ -7,7 +7,7 @@
 #endif
 
 #define user_addr_max() \
-	(segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+	(uaccess_kernel() ? ~0UL : TASK_SIZE)
 
 long strncpy_from_user(char *dest, const char __user *src, long count);
 
diff --git a/arch/sparc/include/asm/uaccess_32.h b/arch/sparc/include/asm/uaccess_32.h
index 952d512a64f2..a59a1e81986d 100644
--- a/arch/sparc/include/asm/uaccess_32.h
+++ b/arch/sparc/include/asm/uaccess_32.h
@@ -36,7 +36,7 @@
  * large size and address near to PAGE_OFFSET - a fault will break his intentions.
  */
 #define __user_ok(addr, size) ({ (void)(size); (addr) < STACK_TOP; })
-#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok (uaccess_kernel())
 #define __access_ok(addr, size) (__user_ok((addr) & get_fs().seg, (size)))
 #define access_ok(type, addr, size) \
 	({ (void)(type); __access_ok((unsigned long)(addr), size); })
diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h
index 076bdcb0c2ad..e992bb57da5a 100644
--- a/arch/um/include/asm/uaccess.h
+++ b/arch/um/include/asm/uaccess.h
@@ -45,7 +45,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size)
 	return __addr_range_nowrap(addr, size) &&
 		(__under_task_size(addr, size) ||
 		__access_ok_vsyscall(addr, size) ||
-		segment_eq(get_fs(), KERNEL_DS));
+		uaccess_kernel());
 }
 
 #endif
diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
index 85ac8adb069b..22c9f79db8e6 100644
--- a/arch/um/kernel/skas/uaccess.c
+++ b/arch/um/kernel/skas/uaccess.c
@@ -141,7 +141,7 @@ static int copy_chunk_from_user(unsigned long from, int len, void *arg)
 
 long __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-	if (segment_eq(get_fs(), KERNEL_DS)) {
+	if (uaccess_kernel()) {
 		memcpy(to, (__force void*)from, n);
 		return 0;
 	}
@@ -161,7 +161,7 @@ static int copy_chunk_to_user(unsigned long to, int len, void *arg)
 
 long __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-	if (segment_eq(get_fs(), KERNEL_DS)) {
+	if (uaccess_kernel()) {
 		memcpy((__force void *) to, from, n);
 		return 0;
 	}
@@ -189,7 +189,7 @@ long __strncpy_from_user(char *dst, const char __user *src, long count)
 	long n;
 	char *ptr = dst;
 
-	if (segment_eq(get_fs(), KERNEL_DS)) {
+	if (uaccess_kernel()) {
 		strncpy(dst, (__force void *) src, count);
 		return strnlen(dst, count);
 	}
@@ -210,7 +210,7 @@ static int clear_chunk(unsigned long addr, int len, void *unused)
 
 unsigned long __clear_user(void __user *mem, unsigned long len)
 {
-	if (segment_eq(get_fs(), KERNEL_DS)) {
+	if (uaccess_kernel()) {
 		memset((__force void*)mem, 0, len);
 		return 0;
 	}
@@ -235,7 +235,7 @@ long __strnlen_user(const void __user *str, long len)
 {
 	int count = 0, n;
 
-	if (segment_eq(get_fs(), KERNEL_DS))
+	if (uaccess_kernel())
 		return strnlen((__force char*)str, len) + 1;
 
 	n = buffer_op((unsigned long) str, len, 0, strnlen_chunk, &count);
diff --git a/arch/unicore32/include/asm/uaccess.h b/arch/unicore32/include/asm/uaccess.h
index f60fab718b59..1196c88c2f9b 100644
--- a/arch/unicore32/include/asm/uaccess.h
+++ b/arch/unicore32/include/asm/uaccess.h
@@ -20,7 +20,7 @@
 #define __strnlen_user		__strnlen_user
 #define __clear_user		__clear_user
 
-#define __kernel_ok		(segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok		(uaccess_kernel())
 #define __user_ok(addr, size)	(((size) <= TASK_SIZE)			\
 				&& ((addr) <= TASK_SIZE - (size)))
 #define __access_ok(addr, size)	(__kernel_ok || __user_ok((addr), (size)))
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
index d22c1dc7e39e..ddaf78ae6854 100644
--- a/arch/unicore32/kernel/process.c
+++ b/arch/unicore32/kernel/process.c
@@ -178,7 +178,7 @@ void __show_regs(struct pt_regs *regs)
 		buf, interrupts_enabled(regs) ? "n" : "ff",
 		fast_interrupts_enabled(regs) ? "n" : "ff",
 		processor_modes[processor_mode(regs)],
-		segment_eq(get_fs(), get_ds()) ? "kernel" : "user");
+		uaccess_kernel() ? "kernel" : "user");
 	{
 		unsigned int ctrl;
 
diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h
index bd8861c811ef..26512692e28f 100644
--- a/arch/xtensa/include/asm/uaccess.h
+++ b/arch/xtensa/include/asm/uaccess.h
@@ -37,7 +37,7 @@
 
 #define segment_eq(a, b)	((a).seg == (b).seg)
 
-#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS))
+#define __kernel_ok (uaccess_kernel())
 #define __user_ok(addr, size) \
 		(((size) <= TASK_SIZE)&&((addr) <= TASK_SIZE-(size)))
 #define __access_ok(addr, size) (__kernel_ok || __user_ok((addr), (size)))
diff --git a/block/bsg.c b/block/bsg.c
index 74835dbf0c47..69ccb7801a75 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -650,7 +650,7 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 
 	dprintk("%s: write %zd bytes\n", bd->name, count);
 
-	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+	if (unlikely(uaccess_kernel()))
 		return -EINVAL;
 
 	bsg_set_block(bd, file);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 29b86505f796..5d9136a345ec 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -581,7 +581,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos)
 	sg_io_hdr_t *hp;
 	unsigned char cmnd[SG_MAX_CDB_SIZE];
 
-	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+	if (unlikely(uaccess_kernel()))
 		return -EINVAL;
 
 	if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp)))
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index b786ca2419b4..9c3ae8706e9d 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -7,6 +7,8 @@
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
 
+#define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS)
+
 #include <asm/uaccess.h>
 
 static __always_inline void pagefault_disabled_inc(void)
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
index 9b4c22a36931..66dbed0c146d 100644
--- a/include/rdma/ib.h
+++ b/include/rdma/ib.h
@@ -100,7 +100,7 @@ struct sockaddr_ib {
  */
 static inline bool ib_safe_file_access(struct file *filp)
 {
-	return filp->f_cred == current_cred() && segment_eq(get_fs(), USER_DS);
+	return filp->f_cred == current_cred() && !uaccess_kernel();
 }
 
 #endif /* _RDMA_IB_H */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cee9802cf3e0..f806dbd66de9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
 	if (unlikely(in_interrupt() ||
 		     current->flags & (PF_KTHREAD | PF_EXITING)))
 		return -EPERM;
-	if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+	if (unlikely(uaccess_kernel()))
 		return -EPERM;
 	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
 		return -EPERM;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index e68604ae3ced..97db876c6862 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -413,7 +413,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
 			size_t count)
 {
 	/* It will get better.  Eventually... */
-	if (segment_eq(get_fs(), KERNEL_DS)) {
+	if (uaccess_kernel()) {
 		direction |= ITER_KVEC;
 		i->type = direction;
 		i->kvec = (struct kvec *)iov;
diff --git a/mm/memory.c b/mm/memory.c
index a97a4cec2e1f..e8f4e10e770a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4136,7 +4136,7 @@ void __might_fault(const char *file, int line)
 	 * get paged out, therefore we'll never actually fault, and the
 	 * below annotations will generate false positives.
 	 */
-	if (segment_eq(get_fs(), KERNEL_DS))
+	if (uaccess_kernel())
 		return;
 	if (pagefault_disabled())
 		return;
diff --git a/security/tomoyo/network.c b/security/tomoyo/network.c
index 97527710a72a..6c02ac478247 100644
--- a/security/tomoyo/network.c
+++ b/security/tomoyo/network.c
@@ -608,7 +608,7 @@ static int tomoyo_check_unix_address(struct sockaddr *addr,
 static bool tomoyo_kernel_service(void)
 {
 	/* Nothing to do if I am a kernel service. */
-	return segment_eq(get_fs(), KERNEL_DS);
+	return uaccess_kernel();
 }
 
 /**
-- 
cgit v1.3-6-gb490


From 7fc6b87a9ff537e7df32b1278118ce9c5bcd6788 Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Thu, 9 Mar 2017 00:05:31 -0800
Subject: blkcg: allocate struct blkcg_gq outside request queue spinlock

blkg_conf_prep() currently calls blkg_lookup_create() while holding
request queue spinlock. This means allocating memory for struct
blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
failures in call paths like below:

  pcpu_alloc+0x68f/0x710
  __alloc_percpu_gfp+0xd/0x10
  __percpu_counter_init+0x55/0xc0
  cfq_pd_alloc+0x3b2/0x4e0
  blkg_alloc+0x187/0x230
  blkg_create+0x489/0x670
  blkg_lookup_create+0x9a/0x230
  blkg_conf_prep+0x1fb/0x240
  __cfqg_set_weight_device.isra.105+0x5c/0x180
  cfq_set_weight_on_dfl+0x69/0xc0
  cgroup_file_write+0x39/0x1c0
  kernfs_fop_write+0x13f/0x1d0
  __vfs_write+0x23/0x120
  vfs_write+0xc2/0x1f0
  SyS_write+0x44/0xb0
  entry_SYSCALL_64_fastpath+0x18/0xad

In the code path above, percpu allocator cannot call vmalloc() due to
queue spinlock.

A failure in this call path gives grief to tools which are trying to
configure io weights. We see occasional failures happen shortly after
reboots even when system is not under any memory pressure. Machines
with a lot of cpus are more vulnerable to this condition.

Update blkg_create() function to temporarily drop the rcu and queue
locks when it is allowed by gfp mask.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 138 ++++++++++++++++++++++++++++-----------------
 include/linux/blk-cgroup.h |   6 +-
 2 files changed, 91 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d..bdf87f0c1b1b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -165,16 +165,18 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 
 /*
- * If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
+ * If gfp mask allows blocking, this function temporarily drops rcu and queue
+ * locks to allocate memory.
  */
 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
-				    struct request_queue *q,
-				    struct blkcg_gq *new_blkg)
+				    struct request_queue *q, gfp_t gfp,
+				    const struct blkcg_policy *pol)
 {
-	struct blkcg_gq *blkg;
+	struct blkcg_gq *blkg = NULL;
 	struct bdi_writeback_congested *wb_congested;
 	int i, ret;
+	const bool drop_locks = gfpflags_allow_blocking(gfp);
+	bool preloaded = false;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
@@ -185,31 +187,53 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		goto err_free_blkg;
 	}
 
+	if (drop_locks) {
+		spin_unlock_irq(q->queue_lock);
+		rcu_read_unlock();
+	}
+
 	wb_congested = wb_congested_get_create(q->backing_dev_info,
-					       blkcg->css.id,
-					       GFP_NOWAIT | __GFP_NOWARN);
-	if (!wb_congested) {
+					       blkcg->css.id, gfp);
+	blkg = blkg_alloc(blkcg, q, gfp);
+
+	if (drop_locks) {
+		preloaded = !radix_tree_preload(gfp);
+		rcu_read_lock();
+		spin_lock_irq(q->queue_lock);
+	}
+
+	if (unlikely(!wb_congested || !blkg)) {
 		ret = -ENOMEM;
-		goto err_put_css;
+		goto err_put;
 	}
 
-	/* allocate */
-	if (!new_blkg) {
-		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
-		if (unlikely(!new_blkg)) {
-			ret = -ENOMEM;
-			goto err_put_congested;
+	blkg->wb_congested = wb_congested;
+
+	if (pol) {
+		WARN_ON(!drop_locks);
+
+		if (!blkcg_policy_enabled(q, pol)) {
+			ret = -EOPNOTSUPP;
+			goto err_put;
+		}
+
+		/*
+		 * This could be the first entry point of blkcg implementation
+		 * and we shouldn't allow anything to go through for a bypassing
+		 * queue.
+		 */
+		if (unlikely(blk_queue_bypass(q))) {
+			ret = blk_queue_dying(q) ? -ENODEV : -EBUSY;
+			goto err_put;
 		}
 	}
-	blkg = new_blkg;
-	blkg->wb_congested = wb_congested;
 
 	/* link parent */
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 		if (WARN_ON_ONCE(!blkg->parent)) {
 			ret = -ENODEV;
-			goto err_put_congested;
+			goto err_put;
 		}
 		blkg_get(blkg->parent);
 	}
@@ -236,6 +260,9 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 				pol->pd_online_fn(blkg->pd[i]);
 		}
 	}
+
+	if (preloaded)
+		radix_tree_preload_end();
 	blkg->online = true;
 	spin_unlock(&blkcg->lock);
 
@@ -246,44 +273,45 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	blkg_put(blkg);
 	return ERR_PTR(ret);
 
-err_put_congested:
-	wb_congested_put(wb_congested);
-err_put_css:
+err_put:
+	if (preloaded)
+		radix_tree_preload_end();
+	if (wb_congested)
+		wb_congested_put(wb_congested);
 	css_put(&blkcg->css);
 err_free_blkg:
-	blkg_free(new_blkg);
+	blkg_free(blkg);
 	return ERR_PTR(ret);
 }
 
 /**
- * blkg_lookup_create - lookup blkg, try to create one if not there
+ * __blkg_lookup_create - lookup blkg, try to create one if not there
  * @blkcg: blkcg of interest
  * @q: request_queue of interest
+ * @gfp: gfp mask
+ * @pol: blkcg policy (optional)
  *
  * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
  * create one.  blkg creation is performed recursively from blkcg_root such
  * that all non-root blkg's have access to the parent blkg.  This function
  * should be called under RCU read lock and @q->queue_lock.
  *
+ * When gfp mask allows blocking, rcu and queue locks may be dropped for
+ * allocating memory. In this case, the locks will be reacquired on return.
+ *
  * Returns pointer to the looked up or created blkg on success, ERR_PTR()
  * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
  * dead and bypassing, returns ERR_PTR(-EBUSY).
  */
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-				    struct request_queue *q)
+struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
+				      struct request_queue *q, gfp_t gfp,
+				      const struct blkcg_policy *pol)
 {
 	struct blkcg_gq *blkg;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
 
-	/*
-	 * This could be the first entry point of blkcg implementation and
-	 * we shouldn't allow anything to go through for a bypassing queue.
-	 */
-	if (unlikely(blk_queue_bypass(q)))
-		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
-
 	blkg = __blkg_lookup(blkcg, q, true);
 	if (blkg)
 		return blkg;
@@ -301,12 +329,35 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 			parent = blkcg_parent(parent);
 		}
 
-		blkg = blkg_create(pos, q, NULL);
+		blkg = blkg_create(pos, q, gfp, pol);
 		if (pos == blkcg || IS_ERR(blkg))
 			return blkg;
 	}
 }
 
+/**
+ * blkg_lookup_create - lookup blkg, try to create one if not there
+ *
+ * Performs an initial queue bypass check and then passes control to
+ * __blkg_lookup_create().
+ */
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q, gfp_t gfp,
+				    const struct blkcg_policy *pol)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert_held(q->queue_lock);
+
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q)))
+		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
+	return __blkg_lookup_create(blkcg, q, gfp, pol);
+}
+
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
 	struct blkcg *blkcg = blkg->blkcg;
@@ -817,7 +868,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	spin_lock_irq(disk->queue->queue_lock);
 
 	if (blkcg_policy_enabled(disk->queue, pol))
-		blkg = blkg_lookup_create(blkcg, disk->queue);
+		blkg = blkg_lookup_create(blkcg, disk->queue, GFP_KERNEL, pol);
 	else
 		blkg = ERR_PTR(-EOPNOTSUPP);
 
@@ -1056,30 +1107,15 @@ free_blkcg:
  */
 int blkcg_init_queue(struct request_queue *q)
 {
-	struct blkcg_gq *new_blkg, *blkg;
-	bool preloaded;
+	struct blkcg_gq *blkg;
 	int ret;
 
-	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-	if (!new_blkg)
-		return -ENOMEM;
-
-	preloaded = !radix_tree_preload(GFP_KERNEL);
-
-	/*
-	 * Make sure the root blkg exists and count the existing blkgs.  As
-	 * @q is bypassing at this point, blkg_lookup_create() can't be
-	 * used.  Open code insertion.
-	 */
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
-	blkg = blkg_create(&blkcg_root, q, new_blkg);
+	blkg = __blkg_lookup_create(&blkcg_root, q, GFP_KERNEL, NULL);
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 
-	if (preloaded)
-		radix_tree_preload_end();
-
 	if (IS_ERR(blkg))
 		return PTR_ERR(blkg);
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 01b62e7bac74..955903a8f6cb 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -172,7 +172,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css;
 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 				      struct request_queue *q, bool update_hint);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-				    struct request_queue *q);
+				    struct request_queue *q, gfp_t gfp,
+				    const struct blkcg_policy *pol);
 int blkcg_init_queue(struct request_queue *q);
 void blkcg_drain_queue(struct request_queue *q);
 void blkcg_exit_queue(struct request_queue *q);
@@ -694,7 +695,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
 		spin_lock_irq(q->queue_lock);
-		blkg = blkg_lookup_create(blkcg, q);
+		blkg = blkg_lookup_create(blkcg, q, GFP_NOWAIT | __GFP_NOWARN,
+					  NULL);
 		if (IS_ERR(blkg))
 			blkg = NULL;
 		spin_unlock_irq(q->queue_lock);
-- 
cgit v1.3-6-gb490


From dfa672fbc0d9e83ff0dc1a75f1f5d0e59a30706b Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:16 +0300
Subject: ACPI / property: Add possiblity to retrieve parent firmware node

Sometimes it is useful to be able to navigate firmware node hierarchy
upwards toward parent nodes. ACPI device nodes are pretty much already
supported because ACPICA provides acpi_get_parent(). ACPI data nodes,
however, are all below the same parent ACPI device. Their hierarchy is
created by "linking" each other using references in the value field.

Add parent pointer to the parent data node while we create them so it is
easy to navigate the hierarchy backwards. We use this parent pointer in a
new function acpi_node_get_parent() that is able to extract parent of both
ACPI firmware node types.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c | 72 ++++++++++++++++++++++++++++++++++++++-----------
 include/acpi/acpi_bus.h |  1 +
 include/linux/acpi.h    |  7 +++++
 3 files changed, 65 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 3afddcd834ef..587c9d000f0e 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -37,14 +37,16 @@ static const u8 ads_uuid[16] = {
 
 static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
 					   const union acpi_object *desc,
-					   struct acpi_device_data *data);
+					   struct acpi_device_data *data,
+					   struct fwnode_handle *parent);
 static bool acpi_extract_properties(const union acpi_object *desc,
 				    struct acpi_device_data *data);
 
 static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
 					acpi_handle handle,
 					const union acpi_object *link,
-					struct list_head *list)
+					struct list_head *list,
+					struct fwnode_handle *parent)
 {
 	struct acpi_data_node *dn;
 	bool result;
@@ -55,6 +57,7 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
 
 	dn->name = link->package.elements[0].string.pointer;
 	dn->fwnode.type = FWNODE_ACPI_DATA;
+	dn->parent = parent;
 	INIT_LIST_HEAD(&dn->data.subnodes);
 
 	result = acpi_extract_properties(desc, &dn->data);
@@ -71,9 +74,11 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
 		 */
 		status = acpi_get_parent(handle, &scope);
 		if (ACPI_SUCCESS(status)
-		    && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data))
+		    && acpi_enumerate_nondev_subnodes(scope, desc, &dn->data,
+						      &dn->fwnode))
 			result = true;
-	} else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data)) {
+	} else if (acpi_enumerate_nondev_subnodes(NULL, desc, &dn->data,
+						  &dn->fwnode)) {
 		result = true;
 	}
 
@@ -91,7 +96,8 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc,
 
 static bool acpi_nondev_subnode_data_ok(acpi_handle handle,
 					const union acpi_object *link,
-					struct list_head *list)
+					struct list_head *list,
+					struct fwnode_handle *parent)
 {
 	struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER };
 	acpi_status status;
@@ -101,7 +107,8 @@ static bool acpi_nondev_subnode_data_ok(acpi_handle handle,
 	if (ACPI_FAILURE(status))
 		return false;
 
-	if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list))
+	if (acpi_nondev_subnode_extract(buf.pointer, handle, link, list,
+					parent))
 		return true;
 
 	ACPI_FREE(buf.pointer);
@@ -110,7 +117,8 @@ static bool acpi_nondev_subnode_data_ok(acpi_handle handle,
 
 static bool acpi_nondev_subnode_ok(acpi_handle scope,
 				   const union acpi_object *link,
-				   struct list_head *list)
+				   struct list_head *list,
+				   struct fwnode_handle *parent)
 {
 	acpi_handle handle;
 	acpi_status status;
@@ -123,12 +131,13 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope,
 	if (ACPI_FAILURE(status))
 		return false;
 
-	return acpi_nondev_subnode_data_ok(handle, link, list);
+	return acpi_nondev_subnode_data_ok(handle, link, list, parent);
 }
 
 static int acpi_add_nondev_subnodes(acpi_handle scope,
 				    const union acpi_object *links,
-				    struct list_head *list)
+				    struct list_head *list,
+				    struct fwnode_handle *parent)
 {
 	bool ret = false;
 	int i;
@@ -150,15 +159,18 @@ static int acpi_add_nondev_subnodes(acpi_handle scope,
 		/* The second one may be a string, a reference or a package. */
 		switch (link->package.elements[1].type) {
 		case ACPI_TYPE_STRING:
-			result = acpi_nondev_subnode_ok(scope, link, list);
+			result = acpi_nondev_subnode_ok(scope, link, list,
+							 parent);
 			break;
 		case ACPI_TYPE_LOCAL_REFERENCE:
 			handle = link->package.elements[1].reference.handle;
-			result = acpi_nondev_subnode_data_ok(handle, link, list);
+			result = acpi_nondev_subnode_data_ok(handle, link, list,
+							     parent);
 			break;
 		case ACPI_TYPE_PACKAGE:
 			desc = &link->package.elements[1];
-			result = acpi_nondev_subnode_extract(desc, NULL, link, list);
+			result = acpi_nondev_subnode_extract(desc, NULL, link,
+							     list, parent);
 			break;
 		default:
 			result = false;
@@ -172,7 +184,8 @@ static int acpi_add_nondev_subnodes(acpi_handle scope,
 
 static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
 					   const union acpi_object *desc,
-					   struct acpi_device_data *data)
+					   struct acpi_device_data *data,
+					   struct fwnode_handle *parent)
 {
 	int i;
 
@@ -194,7 +207,8 @@ static bool acpi_enumerate_nondev_subnodes(acpi_handle scope,
 		if (memcmp(uuid->buffer.pointer, ads_uuid, sizeof(ads_uuid)))
 			continue;
 
-		return acpi_add_nondev_subnodes(scope, links, &data->subnodes);
+		return acpi_add_nondev_subnodes(scope, links, &data->subnodes,
+						parent);
 	}
 
 	return false;
@@ -345,7 +359,8 @@ void acpi_init_properties(struct acpi_device *adev)
 		if (acpi_of)
 			acpi_init_of_compatible(adev);
 	}
-	if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer, &adev->data))
+	if (acpi_enumerate_nondev_subnodes(adev->handle, buf.pointer,
+					&adev->data, acpi_fwnode_handle(adev)))
 		adev->data.pointer = buf.pointer;
 
 	if (!adev->data.pointer) {
@@ -920,3 +935,30 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
 	}
 	return NULL;
 }
+
+/**
+ * acpi_node_get_parent - Return parent fwnode of this fwnode
+ * @fwnode: Firmware node whose parent to get
+ *
+ * Returns parent node of an ACPI device or data firmware node or %NULL if
+ * not available.
+ */
+struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode)
+{
+	if (is_acpi_data_node(fwnode)) {
+		/* All data nodes have parent pointer so just return that */
+		return to_acpi_data_node(fwnode)->parent;
+	} else if (is_acpi_device_node(fwnode)) {
+		acpi_handle handle, parent_handle;
+
+		handle = to_acpi_device_node(fwnode)->handle;
+		if (ACPI_SUCCESS(acpi_get_parent(handle, &parent_handle))) {
+			struct acpi_device *adev;
+
+			if (!acpi_bus_get_device(parent_handle, &adev))
+				return acpi_fwnode_handle(adev);
+		}
+	}
+
+	return NULL;
+}
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index ef0ae8aaa567..49cca52b214b 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -386,6 +386,7 @@ struct acpi_data_node {
 	const char *name;
 	acpi_handle handle;
 	struct fwnode_handle fwnode;
+	struct fwnode_handle *parent;
 	struct acpi_device_data data;
 	struct list_head sibling;
 	struct kobject kobj;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9b05886f9773..e74e8bdbb6af 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -999,6 +999,7 @@ int acpi_dev_prop_read(struct acpi_device *adev, const char *propname,
 
 struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
 					    struct fwnode_handle *subnode);
+struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode);
 
 struct acpi_probe_entry;
 typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
@@ -1121,6 +1122,12 @@ static inline struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
 	return NULL;
 }
 
+static inline struct fwnode_handle *
+acpi_node_get_parent(struct fwnode_handle *fwnode)
+{
+	return NULL;
+}
+
 #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \
 	static const void * __acpi_table_##name[]			\
 		__attribute__((unused))					\
-- 
cgit v1.3-6-gb490


From afaf26fd8458be29949ae5a52c65a464a1b0cbb6 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:17 +0300
Subject: device property: Add fwnode_get_parent()

Now that ACPI has support for returning parent firmware node for both types
of nodes we can expose this to others as well. This adds a new function
fwnode_get_parent() that can be used for DT and ACPI nodes to retrieve the
parent firmware node.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 25 +++++++++++++++++++++++++
 include/linux/property.h |  2 ++
 2 files changed, 27 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index c458c63e353f..a25233430d18 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -931,6 +931,31 @@ int device_add_properties(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(device_add_properties);
 
+/**
+ * fwnode_get_parent - Return parent firwmare node
+ * @fwnode: Firmware whose parent is retrieved
+ *
+ * Return parent firmware node of the given node if possible or %NULL if no
+ * parent was available.
+ */
+struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *parent = NULL;
+
+	if (is_of_node(fwnode)) {
+		struct device_node *node;
+
+		node = of_get_parent(to_of_node(fwnode));
+		if (node)
+			parent = &node->fwnode;
+	} else if (is_acpi_node(fwnode)) {
+		parent = acpi_node_get_parent(fwnode);
+	}
+
+	return parent;
+}
+EXPORT_SYMBOL_GPL(fwnode_get_parent);
+
 /**
  * device_get_next_child_node - Return the next child node handle for a device
  * @dev: Device to find the next child node for.
diff --git a/include/linux/property.h b/include/linux/property.h
index 64e3a9c6d95f..ab0a8160cef6 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -70,6 +70,8 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode,
 int fwnode_property_match_string(struct fwnode_handle *fwnode,
 				 const char *propname, const char *string);
 
+struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode);
+
 struct fwnode_handle *device_get_next_child_node(struct device *dev,
 						 struct fwnode_handle *child);
 
-- 
cgit v1.3-6-gb490


From 34055190b19d7c634caf738c8ca195cad06550cd Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:18 +0300
Subject: ACPI / property: Add fwnode_get_next_child_node()

The ACPI _DSD hierarchical data extension makes it possible to have
hierarchies deeper than one level in similar way than DT allows. These
"subsubnodes" have not been accessible because device property
implementation only provides device_get_next_child_node() that is limited
to direct descendants of a device.

We need this ability in order support things like remote endpoints
currently supported in DT with of_graph_* APIs.

Modify acpi_get_next_subnode() to accept fwnode handle instead and update
callers accordingly. Also add a new function fwnode_get_next_child_node()
that works directly with fwnodes and modify device_get_next_child_node() to
call it directly. While there add a macro fwnode_for_each_child_node()
analogous to the current device_for_each_child_node() but it works with
fwnodes instead of devices.

Link: http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.pdf
Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c  | 27 +++++++++++++++++----------
 drivers/base/property.c  | 38 ++++++++++++++++++++++++++++++--------
 include/linux/acpi.h     |  8 ++++----
 include/linux/property.h |  6 ++++++
 4 files changed, 57 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 587c9d000f0e..8730ce745b43 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -880,21 +880,22 @@ int acpi_node_prop_read(struct fwnode_handle *fwnode,  const char *propname,
 }
 
 /**
- * acpi_get_next_subnode - Return the next child node handle for a device.
- * @dev: Device to find the next child node for.
+ * acpi_get_next_subnode - Return the next child node handle for a fwnode
+ * @fwnode: Firmware node to find the next child node for.
  * @child: Handle to one of the device's child nodes or a null handle.
  */
-struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
+struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode,
 					    struct fwnode_handle *child)
 {
-	struct acpi_device *adev = ACPI_COMPANION(dev);
+	struct acpi_device *adev = to_acpi_device_node(fwnode);
 	struct list_head *head, *next;
 
-	if (!adev)
-		return NULL;
-
 	if (!child || child->type == FWNODE_ACPI) {
-		head = &adev->children;
+		if (adev)
+			head = &adev->children;
+		else
+			goto nondev;
+
 		if (list_empty(head))
 			goto nondev;
 
@@ -903,7 +904,6 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
 			next = adev->node.next;
 			if (next == head) {
 				child = NULL;
-				adev = ACPI_COMPANION(dev);
 				goto nondev;
 			}
 			adev = list_entry(next, struct acpi_device, node);
@@ -915,9 +915,16 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
 
  nondev:
 	if (!child || child->type == FWNODE_ACPI_DATA) {
+		struct acpi_data_node *data = to_acpi_data_node(fwnode);
 		struct acpi_data_node *dn;
 
-		head = &adev->data.subnodes;
+		if (adev)
+			head = &adev->data.subnodes;
+		else if (data)
+			head = &data->data.subnodes;
+		else
+			return NULL;
+
 		if (list_empty(head))
 			return NULL;
 
diff --git a/drivers/base/property.c b/drivers/base/property.c
index a25233430d18..bc0f07ac48f6 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -957,24 +957,46 @@ struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode)
 EXPORT_SYMBOL_GPL(fwnode_get_parent);
 
 /**
- * device_get_next_child_node - Return the next child node handle for a device
- * @dev: Device to find the next child node for.
- * @child: Handle to one of the device's child nodes or a null handle.
+ * fwnode_get_next_child_node - Return the next child node handle for a node
+ * @fwnode: Firmware node to find the next child node for.
+ * @child: Handle to one of the node's child nodes or a %NULL handle.
  */
-struct fwnode_handle *device_get_next_child_node(struct device *dev,
+struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode,
 						 struct fwnode_handle *child)
 {
-	if (IS_ENABLED(CONFIG_OF) && dev->of_node) {
+	if (is_of_node(fwnode)) {
 		struct device_node *node;
 
-		node = of_get_next_available_child(dev->of_node, to_of_node(child));
+		node = of_get_next_available_child(to_of_node(fwnode),
+						   to_of_node(child));
 		if (node)
 			return &node->fwnode;
-	} else if (IS_ENABLED(CONFIG_ACPI)) {
-		return acpi_get_next_subnode(dev, child);
+	} else if (is_acpi_node(fwnode)) {
+		return acpi_get_next_subnode(fwnode, child);
 	}
+
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(fwnode_get_next_child_node);
+
+/**
+ * device_get_next_child_node - Return the next child node handle for a device
+ * @dev: Device to find the next child node for.
+ * @child: Handle to one of the device's child nodes or a null handle.
+ */
+struct fwnode_handle *device_get_next_child_node(struct device *dev,
+						 struct fwnode_handle *child)
+{
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	struct fwnode_handle *fwnode = NULL;
+
+	if (dev->of_node)
+		fwnode = &dev->of_node->fwnode;
+	else if (adev)
+		fwnode = acpi_fwnode_handle(adev);
+
+	return fwnode_get_next_child_node(fwnode, child);
+}
 EXPORT_SYMBOL_GPL(device_get_next_child_node);
 
 /**
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e74e8bdbb6af..4eb1f5941ede 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -997,8 +997,8 @@ int acpi_node_prop_read(struct fwnode_handle *fwnode, const char *propname,
 int acpi_dev_prop_read(struct acpi_device *adev, const char *propname,
 		       enum dev_prop_type proptype, void *val, size_t nval);
 
-struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
-					    struct fwnode_handle *subnode);
+struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode,
+					    struct fwnode_handle *child);
 struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode);
 
 struct acpi_probe_entry;
@@ -1116,8 +1116,8 @@ static inline int acpi_dev_prop_read(struct acpi_device *adev,
 	return -ENXIO;
 }
 
-static inline struct fwnode_handle *acpi_get_next_subnode(struct device *dev,
-						struct fwnode_handle *subnode)
+static inline struct fwnode_handle *
+acpi_get_next_subnode(struct fwnode_handle *fwnode, struct fwnode_handle *child)
 {
 	return NULL;
 }
diff --git a/include/linux/property.h b/include/linux/property.h
index ab0a8160cef6..f4786a8655f1 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -71,6 +71,12 @@ int fwnode_property_match_string(struct fwnode_handle *fwnode,
 				 const char *propname, const char *string);
 
 struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode,
+						 struct fwnode_handle *child);
+
+#define fwnode_for_each_child_node(fwnode, child)			\
+	for (child = fwnode_get_next_child_node(fwnode, NULL); child;	\
+	     child = fwnode_get_next_child_node(fwnode, child))
 
 struct fwnode_handle *device_get_next_child_node(struct device *dev,
 						 struct fwnode_handle *child);
-- 
cgit v1.3-6-gb490


From 21ea73f54c6d77f35381c79870160496c9e78b60 Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:19 +0300
Subject: device property: Add fwnode_get_named_child_node()

Since now we have means to enumerate all children of any fwnode even in
ACPI we can implement fwnode_get_named_child_node(). This is similar than
device_get_named_child_node() with the exception that it can be called to
any fwnode handle. Make device_get_named_child_node() call directly this
new function.

This is useful in cases where we need to be able to find child nodes which
are not direct descendants of the parent device.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 22 +++++++++++++++++-----
 include/linux/property.h |  2 ++
 2 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index bc0f07ac48f6..538b248d5dcf 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -1000,20 +1000,20 @@ struct fwnode_handle *device_get_next_child_node(struct device *dev,
 EXPORT_SYMBOL_GPL(device_get_next_child_node);
 
 /**
- * device_get_named_child_node - Return first matching named child node handle
- * @dev: Device to find the named child node for.
+ * fwnode_get_named_child_node - Return first matching named child node handle
+ * @fwnode: Firmware node to find the named child node for.
  * @childname: String to match child node name against.
  */
-struct fwnode_handle *device_get_named_child_node(struct device *dev,
+struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode,
 						  const char *childname)
 {
 	struct fwnode_handle *child;
 
 	/*
-	 * Find first matching named child node of this device.
+	 * Find first matching named child node of this fwnode.
 	 * For ACPI this will be a data only sub-node.
 	 */
-	device_for_each_child_node(dev, child) {
+	fwnode_for_each_child_node(fwnode, child) {
 		if (is_of_node(child)) {
 			if (!of_node_cmp(to_of_node(child)->name, childname))
 				return child;
@@ -1025,6 +1025,18 @@ struct fwnode_handle *device_get_named_child_node(struct device *dev,
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(fwnode_get_named_child_node);
+
+/**
+ * device_get_named_child_node - Return first matching named child node handle
+ * @dev: Device to find the named child node for.
+ * @childname: String to match child node name against.
+ */
+struct fwnode_handle *device_get_named_child_node(struct device *dev,
+						  const char *childname)
+{
+	return fwnode_get_named_child_node(dev_fwnode(dev), childname);
+}
 EXPORT_SYMBOL_GPL(device_get_named_child_node);
 
 /**
diff --git a/include/linux/property.h b/include/linux/property.h
index f4786a8655f1..514b19559fbe 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -85,6 +85,8 @@ struct fwnode_handle *device_get_next_child_node(struct device *dev,
 	for (child = device_get_next_child_node(dev, NULL); child;	\
 	     child = device_get_next_child_node(dev, child))
 
+struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode,
+						  const char *childname);
 struct fwnode_handle *device_get_named_child_node(struct device *dev,
 						  const char *childname);
 
-- 
cgit v1.3-6-gb490


From 79389a83bc3888a900191e3990cda5c76f2ca1ec Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:20 +0300
Subject: ACPI / property: Add support for remote endpoints

DT has had concept of remote endpoints for some time already. It makes
possible to reference another firmware node through a property called
remote-endpoint. This is already used by some subsystems like v4l2 for
parsing hardware properties related to camera.

This patch adds ACPI support for remote endpoints utilizing _DSD
hierarchical data extensions.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/property.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h    |  23 ++++++++
 2 files changed, 161 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index 8730ce745b43..9eb7c13c5bb7 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -969,3 +969,141 @@ struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode)
 
 	return NULL;
 }
+
+/**
+ * acpi_graph_get_next_endpoint - Get next endpoint ACPI firmware node
+ * @fwnode: Pointer to the parent firmware node
+ * @prev: Previous endpoint node or %NULL to get the first
+ *
+ * Looks up next endpoint ACPI firmware node below a given @fwnode. Returns
+ * %NULL if there is no next endpoint, ERR_PTR() in case of error. In case
+ * of success the next endpoint is returned.
+ */
+struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode,
+						   struct fwnode_handle *prev)
+{
+	struct fwnode_handle *port = NULL;
+	struct fwnode_handle *endpoint;
+
+	if (!prev) {
+		do {
+			port = fwnode_get_next_child_node(fwnode, port);
+			/* Ports must have port property */
+			if (fwnode_property_present(port, "port"))
+				break;
+		} while (port);
+	} else {
+		port = fwnode_get_parent(prev);
+	}
+
+	if (!port)
+		return NULL;
+
+	endpoint = fwnode_get_next_child_node(port, prev);
+	while (!endpoint) {
+		port = fwnode_get_next_child_node(fwnode, port);
+		if (!port)
+			break;
+		if (fwnode_property_present(port, "port"))
+			endpoint = fwnode_get_next_child_node(port, NULL);
+	}
+
+	if (endpoint) {
+		/* Endpoints must have "endpoint" property */
+		if (!fwnode_property_present(endpoint, "endpoint"))
+			return ERR_PTR(-EPROTO);
+	}
+
+	return endpoint;
+}
+
+/**
+ * acpi_graph_get_child_prop_value - Return a child with a given property value
+ * @fwnode: device fwnode
+ * @prop_name: The name of the property to look for
+ * @val: the desired property value
+ *
+ * Return the port node corresponding to a given port number. Returns
+ * the child node on success, NULL otherwise.
+ */
+static struct fwnode_handle *acpi_graph_get_child_prop_value(
+	struct fwnode_handle *fwnode, const char *prop_name, unsigned int val)
+{
+	struct fwnode_handle *child;
+
+	fwnode_for_each_child_node(fwnode, child) {
+		u32 nr;
+
+		if (!fwnode_property_read_u32(fwnode, prop_name, &nr))
+			continue;
+
+		if (val == nr)
+			return child;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * acpi_graph_get_remote_enpoint - Parses and returns remote end of an endpoint
+ * @fwnode: Endpoint firmware node pointing to a remote device
+ * @parent: Firmware node of remote port parent is filled here if not %NULL
+ * @port: Firmware node of remote port is filled here if not %NULL
+ * @endpoint: Firmware node of remote endpoint is filled here if not %NULL
+ *
+ * Function parses remote end of ACPI firmware remote endpoint and fills in
+ * fields requested by the caller. Returns %0 in case of success and
+ * negative errno otherwise.
+ */
+int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode,
+				   struct fwnode_handle **parent,
+				   struct fwnode_handle **port,
+				   struct fwnode_handle **endpoint)
+{
+	unsigned int port_nr, endpoint_nr;
+	struct acpi_reference_args args;
+	int ret;
+
+	memset(&args, 0, sizeof(args));
+	ret = acpi_node_get_property_reference(fwnode, "remote-endpoint", 0,
+					       &args);
+	if (ret)
+		return ret;
+
+	/*
+	 * Always require two arguments with the reference: port and
+	 * endpoint indices.
+	 */
+	if (args.nargs != 2)
+		return -EPROTO;
+
+	fwnode = acpi_fwnode_handle(args.adev);
+	port_nr = args.args[0];
+	endpoint_nr = args.args[1];
+
+	if (parent)
+		*parent = fwnode;
+
+	if (!port && !endpoint)
+		return 0;
+
+	fwnode = acpi_graph_get_child_prop_value(fwnode, "port", port_nr);
+	if (!fwnode)
+		return -EPROTO;
+
+	if (port)
+		*port = fwnode;
+
+	if (!endpoint)
+		return 0;
+
+	fwnode = acpi_graph_get_child_prop_value(fwnode, "endpoint",
+						 endpoint_nr);
+	if (!fwnode)
+		return -EPROTO;
+
+	*endpoint = fwnode;
+
+	return 0;
+}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4eb1f5941ede..add8a96e1977 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1001,6 +1001,13 @@ struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode,
 					    struct fwnode_handle *child);
 struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode);
 
+struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode,
+						   struct fwnode_handle *prev);
+int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode,
+				   struct fwnode_handle **remote,
+				   struct fwnode_handle **port,
+				   struct fwnode_handle **endpoint);
+
 struct acpi_probe_entry;
 typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *,
 						 struct acpi_probe_entry *);
@@ -1128,6 +1135,22 @@ acpi_node_get_parent(struct fwnode_handle *fwnode)
 	return NULL;
 }
 
+static inline struct fwnode_handle *
+acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode,
+			     struct fwnode_handle *prev)
+{
+	return ERR_PTR(-ENXIO);
+}
+
+static inline int
+acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode,
+			       struct fwnode_handle **remote,
+			       struct fwnode_handle **port,
+			       struct fwnode_handle **endpoint)
+{
+	return -ENXIO;
+}
+
 #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \
 	static const void * __acpi_table_##name[]			\
 		__attribute__((unused))					\
-- 
cgit v1.3-6-gb490


From 07bb80d40b0e6a43aafb422296d33baed255569a Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:21 +0300
Subject: device property: Add support for remote endpoints

This follows DT implementation of of_graph_* APIs but we call them
fwnode_graph_* instead. For DT nodes the existing of_graph_* implementation
will be used. For ACPI we use the new ACPI graph implementation instead.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 123 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/property.h |   9 ++++
 2 files changed, 132 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 538b248d5dcf..4e98a6fad33f 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_graph.h>
 #include <linux/property.h>
 #include <linux/etherdevice.h>
 #include <linux/phy.h>
@@ -1176,3 +1177,125 @@ void *device_get_mac_address(struct device *dev, char *addr, int alen)
 	return device_get_mac_addr(dev, "address", addr, alen);
 }
 EXPORT_SYMBOL(device_get_mac_address);
+
+/**
+ * device_graph_get_next_endpoint - Get next endpoint firmware node
+ * @fwnode: Pointer to the parent firmware node
+ * @prev: Previous endpoint node or %NULL to get the first
+ *
+ * Returns an endpoint firmware node pointer or %NULL if no more endpoints
+ * are available.
+ */
+struct fwnode_handle *
+fwnode_graph_get_next_endpoint(struct fwnode_handle *fwnode,
+			       struct fwnode_handle *prev)
+{
+	struct fwnode_handle *endpoint = NULL;
+
+	if (is_of_node(fwnode)) {
+		struct device_node *node;
+
+		node = of_graph_get_next_endpoint(to_of_node(fwnode),
+						  to_of_node(prev));
+
+		if (node)
+			endpoint = &node->fwnode;
+	} else if (is_acpi_node(fwnode)) {
+		endpoint = acpi_graph_get_next_endpoint(fwnode, prev);
+		if (IS_ERR(endpoint))
+			endpoint = NULL;
+	}
+
+	return endpoint;
+
+}
+EXPORT_SYMBOL_GPL(fwnode_graph_get_next_endpoint);
+
+/**
+ * fwnode_graph_get_remote_port_parent - Return fwnode of a remote device
+ * @fwnode: Endpoint firmware node pointing to the remote endpoint
+ *
+ * Extracts firmware node of a remote device the @fwnode points to.
+ */
+struct fwnode_handle *
+fwnode_graph_get_remote_port_parent(struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *parent = NULL;
+
+	if (is_of_node(fwnode)) {
+		struct device_node *node;
+
+		node = of_graph_get_remote_port_parent(to_of_node(fwnode));
+		if (node)
+			parent = &node->fwnode;
+	} else if (is_acpi_node(fwnode)) {
+		int ret;
+
+		ret = acpi_graph_get_remote_endpoint(fwnode, &parent, NULL,
+						     NULL);
+		if (ret)
+			return NULL;
+	}
+
+	return parent;
+}
+EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port_parent);
+
+/**
+ * fwnode_graph_get_remote_port - Return fwnode of a remote port
+ * @fwnode: Endpoint firmware node pointing to the remote endpoint
+ *
+ * Extracts firmware node of a remote port the @fwnode points to.
+ */
+struct fwnode_handle *fwnode_graph_get_remote_port(struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *port = NULL;
+
+	if (is_of_node(fwnode)) {
+		struct device_node *node;
+
+		node = of_graph_get_remote_port(to_of_node(fwnode));
+		if (node)
+			port = &node->fwnode;
+	} else if (is_acpi_node(fwnode)) {
+		int ret;
+
+		ret = acpi_graph_get_remote_endpoint(fwnode, NULL, &port, NULL);
+		if (ret)
+			return NULL;
+	}
+
+	return port;
+}
+EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_port);
+
+/**
+ * fwnode_graph_get_remote_endpoint - Return fwnode of a remote endpoint
+ * @fwnode: Endpoint firmware node pointing to the remote endpoint
+ *
+ * Extracts firmware node of a remote endpoint the @fwnode points to.
+ */
+struct fwnode_handle *
+fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *endpoint = NULL;
+
+	if (is_of_node(fwnode)) {
+		struct device_node *node;
+
+		node = of_parse_phandle(to_of_node(fwnode), "remote-endpoint",
+					0);
+		if (node)
+			endpoint = &node->fwnode;
+	} else if (is_acpi_node(fwnode)) {
+		int ret;
+
+		ret = acpi_graph_get_remote_endpoint(fwnode, NULL, NULL,
+						     &endpoint);
+		if (ret)
+			return NULL;
+	}
+
+	return endpoint;
+}
+EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint);
diff --git a/include/linux/property.h b/include/linux/property.h
index 514b19559fbe..8d7809c2c42d 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -268,4 +268,13 @@ int device_get_phy_mode(struct device *dev);
 
 void *device_get_mac_address(struct device *dev, char *addr, int alen);
 
+struct fwnode_handle *fwnode_graph_get_next_endpoint(
+	struct fwnode_handle *fwnode, struct fwnode_handle *prev);
+struct fwnode_handle *fwnode_graph_get_remote_port_parent(
+	struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_graph_get_remote_port(
+	struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_graph_get_remote_endpoint(
+	struct fwnode_handle *fwnode);
+
 #endif /* _LINUX_PROPERTY_H_ */
-- 
cgit v1.3-6-gb490


From e7887c284969a23a98fe1aff2f631c5ccdcd1757 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:22 +0300
Subject: device property: Add fwnode_handle_get()

fwnode_handle_get() is used to obtain a reference to a fwnode_handle
container. In this case this is OF specific struct device_node.

This complements fwnode_handle_put() which is already implemented.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 11 +++++++++++
 include/linux/property.h |  1 +
 2 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 4e98a6fad33f..23514bf67933 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -1040,6 +1040,17 @@ struct fwnode_handle *device_get_named_child_node(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(device_get_named_child_node);
 
+/**
+ * fwnode_handle_get - Obtain a reference to a device node
+ * @fwnode: Pointer to the device node to obtain the reference to.
+ */
+void fwnode_handle_get(struct fwnode_handle *fwnode)
+{
+	if (is_of_node(fwnode))
+		of_node_get(to_of_node(fwnode));
+}
+EXPORT_SYMBOL_GPL(fwnode_handle_get);
+
 /**
  * fwnode_handle_put - Drop reference to a device node
  * @fwnode: Pointer to the device node to drop the reference to.
diff --git a/include/linux/property.h b/include/linux/property.h
index 8d7809c2c42d..0ae7d209f6c2 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -90,6 +90,7 @@ struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode,
 struct fwnode_handle *device_get_named_child_node(struct device *dev,
 						  const char *childname);
 
+void fwnode_handle_get(struct fwnode_handle *fwnode);
 void fwnode_handle_put(struct fwnode_handle *fwnode);
 
 unsigned int device_get_child_node_count(struct device *dev);
-- 
cgit v1.3-6-gb490


From 67831837e0b192fe0b8ee8b5e502d95ad2c497c0 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:23 +0300
Subject: of: Add of_fwnode_handle() to convert device nodes to fwnode_handle

of_fwnode_handle() returns a struct fwnode_handle of the struct
device_node. This may be used on the fwnode property API.

Use a macro instead of a function in order to support const and non-const
arguments.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/of.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/of.h b/include/linux/of.h
index 21e6323de0f3..e5d4225fda35 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -159,6 +159,8 @@ static inline struct device_node *to_of_node(struct fwnode_handle *fwnode)
 		container_of(fwnode, struct device_node, fwnode) : NULL;
 }
 
+#define of_fwnode_handle(node) (&(node)->fwnode)
+
 static inline bool of_have_populated_dt(void)
 {
 	return of_root != NULL;
@@ -602,6 +604,8 @@ static inline struct device_node *of_find_node_with_property(
 	return NULL;
 }
 
+#define of_fwnode_handle(node) NULL
+
 static inline bool of_have_populated_dt(void)
 {
 	return false;
-- 
cgit v1.3-6-gb490


From e44bb0cbdc88686c21e2175a990b40bf6db5d005 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:24 +0300
Subject: device property: Make dev_fwnode() public

The function to obtain a fwnode related to a struct device is useful for
drivers that use the fwnode property API: it allows not being aware of the
underlying firmware implementation.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 3 ++-
 include/linux/property.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 23514bf67933..22849a89a8b5 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -183,11 +183,12 @@ static int pset_prop_read_string(struct property_set *pset,
 	return 0;
 }
 
-static inline struct fwnode_handle *dev_fwnode(struct device *dev)
+struct fwnode_handle *dev_fwnode(struct device *dev)
 {
 	return IS_ENABLED(CONFIG_OF) && dev->of_node ?
 		&dev->of_node->fwnode : dev->fwnode;
 }
+EXPORT_SYMBOL_GPL(dev_fwnode);
 
 /**
  * device_property_present - check if a property of a device is present
diff --git a/include/linux/property.h b/include/linux/property.h
index 0ae7d209f6c2..6e20a12a2eec 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -33,6 +33,8 @@ enum dev_dma_attr {
 	DEV_DMA_COHERENT,
 };
 
+struct fwnode_handle *dev_fwnode(struct device *dev);
+
 bool device_property_present(struct device *dev, const char *propname);
 int device_property_read_u8_array(struct device *dev, const char *propname,
 				  u8 *val, size_t nval);
-- 
cgit v1.3-6-gb490


From 2bd5452d46df46d99b869b59a1532647e2981d75 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:25 +0300
Subject: device property: Add support for fwnode endpoints

Similar to OF endpoints, endpoint type nodes can be also supported on
ACPI. In order to make it possible for drivers to ignore the matter,
add a type for fwnode_endpoint and a function to parse them.

On ACPI, find the child node index instead of relying on the "endpoint"
property.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 32 ++++++++++++++++++++++++++++++++
 include/linux/fwnode.h   | 12 ++++++++++++
 include/linux/property.h |  3 +++
 3 files changed, 47 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 22849a89a8b5..8a53b8f1db66 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -1311,3 +1311,35 @@ fwnode_graph_get_remote_endpoint(struct fwnode_handle *fwnode)
 	return endpoint;
 }
 EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint);
+
+/**
+ * fwnode_graph_parse_endpoint - parse common endpoint node properties
+ * @fwnode: pointer to endpoint fwnode_handle
+ * @endpoint: pointer to the fwnode endpoint data structure
+ *
+ * Parse @fwnode representing a graph endpoint node and store the
+ * information in @endpoint. The caller must hold a reference to
+ * @fwnode.
+ */
+int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode,
+				struct fwnode_endpoint *endpoint)
+{
+	struct fwnode_handle *port_fwnode = fwnode_get_parent(fwnode);
+
+	memset(endpoint, 0, sizeof(*endpoint));
+
+	endpoint->local_fwnode = fwnode;
+
+	if (is_acpi_node(port_fwnode)) {
+		fwnode_property_read_u32(port_fwnode, "port", &endpoint->port);
+		fwnode_property_read_u32(fwnode, "endpoint", &endpoint->id);
+	} else {
+		fwnode_property_read_u32(port_fwnode, "reg", &endpoint->port);
+		fwnode_property_read_u32(fwnode, "reg", &endpoint->id);
+	}
+
+	fwnode_handle_put(port_fwnode);
+
+	return 0;
+}
+EXPORT_SYMBOL(fwnode_graph_parse_endpoint);
diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h
index 8bd28ce6d76e..3dff2398a5f0 100644
--- a/include/linux/fwnode.h
+++ b/include/linux/fwnode.h
@@ -27,4 +27,16 @@ struct fwnode_handle {
 	struct fwnode_handle *secondary;
 };
 
+/**
+ * struct fwnode_endpoint - Fwnode graph endpoint
+ * @port: Port number
+ * @id: Endpoint id
+ * @local_fwnode: reference to the related fwnode
+ */
+struct fwnode_endpoint {
+	unsigned int port;
+	unsigned int id;
+	const struct fwnode_handle *local_fwnode;
+};
+
 #endif
diff --git a/include/linux/property.h b/include/linux/property.h
index 6e20a12a2eec..3a4e43599e01 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -280,4 +280,7 @@ struct fwnode_handle *fwnode_graph_get_remote_port(
 struct fwnode_handle *fwnode_graph_get_remote_endpoint(
 	struct fwnode_handle *fwnode);
 
+int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode,
+				struct fwnode_endpoint *endpoint);
+
 #endif /* _LINUX_PROPERTY_H_ */
-- 
cgit v1.3-6-gb490


From 233872585de1cf26c3c3da5859ffb3aba45bd486 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 28 Mar 2017 10:52:26 +0300
Subject: device property: Add fwnode_get_next_parent()

In order to differentiate the functionality between dropping a reference
to the node (or not) for the benefit of OF, introduce
fwnode_get_next_parent().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/property.c  | 21 +++++++++++++++++++++
 include/linux/property.h |  1 +
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 8a53b8f1db66..627ebc9b570d 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -933,6 +933,27 @@ int device_add_properties(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(device_add_properties);
 
+/**
+ * fwnode_get_next_parent - Iterate to the node's parent
+ * @fwnode: Firmware whose parent is retrieved
+ *
+ * This is like fwnode_get_parent() except that it drops the refcount
+ * on the passed node, making it suitable for iterating through a
+ * node's parents.
+ *
+ * Returns a node pointer with refcount incremented, use
+ * fwnode_handle_node() on it when done.
+ */
+struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode)
+{
+	struct fwnode_handle *parent = fwnode_get_parent(fwnode);
+
+	fwnode_handle_put(fwnode);
+
+	return parent;
+}
+EXPORT_SYMBOL_GPL(fwnode_get_next_parent);
+
 /**
  * fwnode_get_parent - Return parent firwmare node
  * @fwnode: Firmware whose parent is retrieved
diff --git a/include/linux/property.h b/include/linux/property.h
index 3a4e43599e01..2f482616a2f2 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -73,6 +73,7 @@ int fwnode_property_match_string(struct fwnode_handle *fwnode,
 				 const char *propname, const char *string);
 
 struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode);
+struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode);
 struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode,
 						 struct fwnode_handle *child);
 
-- 
cgit v1.3-6-gb490


From ffaa42e8a40b7f1041e36b022cd28b7c45e2b564 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 20 Mar 2017 11:19:21 +0100
Subject: PM / Domains: Enable users of genpd to specify always on PM domains

The current way to implement an always on PM domain consists of returning
-EBUSY from the ->power_off() callback. This is a bit different compared to
using the always on genpd governor, which prevents the PM domain from being
powered off via runtime suspend, but not via system suspend.

The approach to return -EBUSY from the ->power_off() callback to support
always on PM domains in genpd is suboptimal. That is because it requires
genpd to follow the regular execution path of the power off sequence, which
ends by invoking the ->power_off() callback.

To enable genpd to early abort the power off sequence for always on PM
domains, it needs static information about these configurations. Therefore
let's add a new genpd configuration flag, GENPD_FLAG_ALWAYS_ON.

Users of the new GENPD_FLAG_ALWAYS_ON flag, are by genpd required to make
sure the PM domain is powered on before calling pm_genpd_init(). Moreover,
users don't need to implement the ->power_off() callback, as genpd doesn't
ever invoke it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 14 ++++++++++++--
 include/linux/pm_domain.h   |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 792fbab3dfc4..c71a7ef08b05 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -123,6 +123,7 @@ static const struct genpd_lock_ops genpd_spin_ops = {
 
 #define genpd_status_on(genpd)		(genpd->status == GPD_STATE_ACTIVE)
 #define genpd_is_irq_safe(genpd)	(genpd->flags & GENPD_FLAG_IRQ_SAFE)
+#define genpd_is_always_on(genpd)	(genpd->flags & GENPD_FLAG_ALWAYS_ON)
 
 static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev,
 		struct generic_pm_domain *genpd)
@@ -300,7 +301,12 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on,
 	if (!genpd_status_on(genpd) || genpd->prepared_count > 0)
 		return 0;
 
-	if (atomic_read(&genpd->sd_count) > 0)
+	/*
+	 * Abort power off for the PM domain in the following situations:
+	 * (1) The domain is configured as always on.
+	 * (2) When the domain has a subdomain being powered on.
+	 */
+	if (genpd_is_always_on(genpd) || atomic_read(&genpd->sd_count) > 0)
 		return -EBUSY;
 
 	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
@@ -752,7 +758,7 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock,
 {
 	struct gpd_link *link;
 
-	if (!genpd_status_on(genpd))
+	if (!genpd_status_on(genpd) || genpd_is_always_on(genpd))
 		return;
 
 	if (genpd->suspended_count != genpd->device_count
@@ -1491,6 +1497,10 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
 		genpd->dev_ops.start = pm_clk_resume;
 	}
 
+	/* Always-on domains must be powered on at initialization. */
+	if (genpd_is_always_on(genpd) && !genpd_status_on(genpd))
+		return -EINVAL;
+
 	/* Use only one "off" state if there were no states declared */
 	if (genpd->state_count == 0) {
 		ret = genpd_set_default_power_state(genpd);
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 5339ed5bd6f9..9b6abe632587 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -20,6 +20,7 @@
 /* Defines used for the flags field in the struct generic_pm_domain */
 #define GENPD_FLAG_PM_CLK	(1U << 0) /* PM domain uses PM clk */
 #define GENPD_FLAG_IRQ_SAFE	(1U << 1) /* PM domain operates in atomic */
+#define GENPD_FLAG_ALWAYS_ON	(1U << 2) /* PM domain is always powered on */
 
 enum gpd_status {
 	GPD_STATE_ACTIVE = 0,	/* PM domain is active */
-- 
cgit v1.3-6-gb490


From d597580d373774b1bdab84b3d26ff0b55162b916 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 20 Mar 2017 21:56:06 -0400
Subject: generic ...copy_..._user primitives

provide raw_copy_..._user() and select ARCH_HAS_RAW_COPY_USER to use those.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/Kconfig                  |   3 +
 include/asm-generic/uaccess.h |  11 +++
 include/linux/uaccess.h       | 187 ++++++++++++++++++++++++++++++++++++++++++
 lib/Makefile                  |   2 +
 lib/usercopy.c                |  26 ++++++
 5 files changed, 229 insertions(+)
 create mode 100644 lib/usercopy.c

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index cd211a14a88f..315d37626ddc 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -847,4 +847,7 @@ config STRICT_MODULE_RWX
 config ARCH_WANT_RELAX_ORDER
 	bool
 
+config ARCH_HAS_RAW_COPY_USER
+	bool
+
 source "kernel/gcov/Kconfig"
diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index bd7a05e9582b..d65c311eb128 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -86,7 +86,11 @@ static inline int __access_ok(unsigned long addr, unsigned long size)
 
 static inline int __put_user_fn(size_t size, void __user *ptr, void *x)
 {
+#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER
+	return unlikely(raw_copy_to_user(ptr, x, size)) ? -EFAULT : 0;
+#else
 	return unlikely(__copy_to_user(ptr, x, size)) ? -EFAULT : 0;
+#endif
 }
 
 #define __put_user_fn(sz, u, k)	__put_user_fn(sz, u, k)
@@ -147,7 +151,11 @@ extern int __put_user_bad(void) __attribute__((noreturn));
 #ifndef __get_user_fn
 static inline int __get_user_fn(size_t size, const void __user *ptr, void *x)
 {
+#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER
+	return unlikely(raw_copy_from_user(x, ptr, size)) ? -EFAULT : 0;
+#else
 	return unlikely(__copy_from_user(x, ptr, size)) ? -EFAULT : 0;
+#endif
 }
 
 #define __get_user_fn(sz, u, k)	__get_user_fn(sz, u, k)
@@ -156,6 +164,8 @@ static inline int __get_user_fn(size_t size, const void __user *ptr, void *x)
 
 extern int __get_user_bad(void) __attribute__((noreturn));
 
+#ifndef CONFIG_ARCH_HAS_RAW_COPY_USER
+
 #ifndef __copy_from_user_inatomic
 #define __copy_from_user_inatomic __copy_from_user
 #endif
@@ -185,6 +195,7 @@ static inline long copy_to_user(void __user *to,
 	else
 		return n;
 }
+#endif
 
 /*
  * Copy a null terminated string from userspace.
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 9c3ae8706e9d..5f76bc995d96 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -3,6 +3,7 @@
 
 #include <linux/sched.h>
 #include <linux/thread_info.h>
+#include <linux/kasan-checks.h>
 
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
@@ -11,6 +12,192 @@
 
 #include <asm/uaccess.h>
 
+#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER
+/*
+ * Architectures should provide two primitives (raw_copy_{to,from}_user())
+ * select ARCH_HAS_RAW_COPY_FROM_USER and get rid of their private instances
+ * of copy_{to,from}_user() and __copy_{to,from}_user{,_inatomic}().  Once
+ * all of them switch, this part of linux/uaccess.h will become unconditional.
+ *
+ * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
+ * return the amount left to copy.  They should assume that access_ok() has
+ * already been checked (and succeeded); they should *not* zero-pad anything.
+ * No KASAN or object size checks either - those belong here.
+ *
+ * Both of these functions should attempt to copy size bytes starting at from
+ * into the area starting at to.  They must not fetch or store anything
+ * outside of those areas.  Return value must be between 0 (everything
+ * copied successfully) and size (nothing copied).
+ *
+ * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
+ * at to must become equal to the bytes fetched from the corresponding area
+ * starting at from.  All data past to + size - N must be left unmodified.
+ *
+ * If copying succeeds, the return value must be 0.  If some data cannot be
+ * fetched, it is permitted to copy less than had been fetched; the only
+ * hard requirement is that not storing anything at all (i.e. returning size)
+ * should happen only when nothing could be copied.  In other words, you don't
+ * have to squeeze as much as possible - it is allowed, but not necessary.
+ *
+ * For raw_copy_from_user() to always points to kernel memory and no faults
+ * on store should happen.  Interpretation of from is affected by set_fs().
+ * For raw_copy_to_user() it's the other way round.
+ *
+ * Both can be inlined - it's up to architectures whether it wants to bother
+ * with that.  They should not be used directly; they are used to implement
+ * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic())
+ * that are used instead.  Out of those, __... ones are inlined.  Plain
+ * copy_{to,from}_user() might or might not be inlined.  If you want them
+ * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER.
+ *
+ * NOTE: only copy_from_user() zero-pads the destination in case of short copy.
+ * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
+ * at all; their callers absolutely must check the return value.
+ *
+ * Biarch ones should also provide raw_copy_in_user() - similar to the above,
+ * but both source and destination are __user pointers (affected by set_fs()
+ * as usual) and both source and destination can trigger faults.
+ */
+
+static __always_inline unsigned long
+__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
+{
+	kasan_check_write(to, n);
+	check_object_size(to, n, false);
+	return raw_copy_from_user(to, from, n);
+}
+
+static __always_inline unsigned long
+__copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	might_fault();
+	kasan_check_write(to, n);
+	check_object_size(to, n, false);
+	return raw_copy_from_user(to, from, n);
+}
+
+/**
+ * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
+ * @to:   Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n:    Number of bytes to copy.
+ *
+ * Context: User context only.
+ *
+ * Copy data from kernel space to user space.  Caller must check
+ * the specified block with access_ok() before calling this function.
+ * The caller should also make sure he pins the user space address
+ * so that we don't result in page fault and sleep.
+ */
+static __always_inline unsigned long
+__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
+{
+	kasan_check_read(from, n);
+	check_object_size(from, n, true);
+	return raw_copy_to_user(to, from, n);
+}
+
+static __always_inline unsigned long
+__copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	might_fault();
+	kasan_check_read(from, n);
+	check_object_size(from, n, true);
+	return raw_copy_to_user(to, from, n);
+}
+
+#ifdef INLINE_COPY_FROM_USER
+static inline unsigned long
+_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long res = n;
+	if (likely(access_ok(VERIFY_READ, from, n)))
+		res = raw_copy_from_user(to, from, n);
+	if (unlikely(res))
+		memset(to + (n - res), 0, res);
+	return res;
+}
+#else
+extern unsigned long
+_copy_from_user(void *, const void __user *, unsigned long);
+#endif
+
+#ifdef INLINE_COPY_TO_USER
+static inline unsigned long
+_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	if (access_ok(VERIFY_WRITE, to, n))
+		n = raw_copy_to_user(to, from, n);
+	return n;
+}
+#else
+extern unsigned long
+_copy_to_user(void __user *, const void *, unsigned long);
+#endif
+
+extern void __compiletime_error("usercopy buffer size is too small")
+__bad_copy_user(void);
+
+static inline void copy_user_overflow(int size, unsigned long count)
+{
+	WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
+}
+
+static __always_inline unsigned long __must_check
+copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	int sz = __compiletime_object_size(to);
+
+	might_fault();
+	kasan_check_write(to, n);
+
+	if (likely(sz < 0 || sz >= n)) {
+		check_object_size(to, n, false);
+		n = _copy_from_user(to, from, n);
+	} else if (!__builtin_constant_p(n))
+		copy_user_overflow(sz, n);
+	else
+		__bad_copy_user();
+
+	return n;
+}
+
+static __always_inline unsigned long __must_check
+copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	int sz = __compiletime_object_size(from);
+
+	kasan_check_read(from, n);
+	might_fault();
+
+	if (likely(sz < 0 || sz >= n)) {
+		check_object_size(from, n, true);
+		n = _copy_to_user(to, from, n);
+	} else if (!__builtin_constant_p(n))
+		copy_user_overflow(sz, n);
+	else
+		__bad_copy_user();
+
+	return n;
+}
+#ifdef CONFIG_COMPAT
+static __always_inline unsigned long __must_check
+__copy_in_user(void __user *to, const void *from, unsigned long n)
+{
+	might_fault();
+	return raw_copy_in_user(to, from, n);
+}
+static __always_inline unsigned long __must_check
+copy_in_user(void __user *to, const void *from, unsigned long n)
+{
+	might_fault();
+	if (access_ok(VERIFY_WRITE, to, n) && access_ok(VERIFY_READ, from, n))
+		n = raw_copy_in_user(to, from, n);
+	return n;
+}
+#endif
+#endif
+
 static __always_inline void pagefault_disabled_inc(void)
 {
 	current->pagefault_disabled++;
diff --git a/lib/Makefile b/lib/Makefile
index 320ac46a8725..7d875c389172 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -242,3 +242,5 @@ UBSAN_SANITIZE_ubsan.o := n
 obj-$(CONFIG_SBITMAP) += sbitmap.o
 
 obj-$(CONFIG_PARMAN) += parman.o
+
+obj-$(CONFIG_ARCH_HAS_RAW_COPY_USER) += usercopy.o
diff --git a/lib/usercopy.c b/lib/usercopy.c
new file mode 100644
index 000000000000..1b6010a3beb8
--- /dev/null
+++ b/lib/usercopy.c
@@ -0,0 +1,26 @@
+#include <linux/uaccess.h>
+
+/* out-of-line parts */
+
+#ifndef INLINE_COPY_FROM_USER
+unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long res = n;
+	if (likely(access_ok(VERIFY_READ, from, n)))
+		res = raw_copy_from_user(to, from, n);
+	if (unlikely(res))
+		memset(to + (n - res), 0, res);
+	return res;
+}
+EXPORT_SYMBOL(_copy_from_user);
+#endif
+
+#ifndef INLINE_COPY_TO_USER
+unsigned long _copy_to_user(void *to, const void __user *from, unsigned long n)
+{
+	if (likely(access_ok(VERIFY_WRITE, to, n)))
+		n = raw_copy_to_user(to, from, n);
+	return n;
+}
+EXPORT_SYMBOL(_copy_to_user);
+#endif
-- 
cgit v1.3-6-gb490


From 3f763453e6f27d82fa0ac58f8e1ac4094c1fb1f8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 25 Mar 2017 18:47:28 -0400
Subject: kill __copy_from_user_nocache()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/x86/include/asm/uaccess_32.h |  30 ----------
 arch/x86/include/asm/uaccess_64.h |   8 ---
 arch/x86/lib/usercopy_32.c        | 118 --------------------------------------
 include/linux/uaccess.h           |   6 --
 lib/iov_iter.c                    |   4 +-
 5 files changed, 2 insertions(+), 164 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 5268ecceea96..19e6c050c438 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -14,8 +14,6 @@ unsigned long __must_check __copy_from_user_ll
 		(void *to, const void __user *from, unsigned long n);
 unsigned long __must_check __copy_from_user_ll_nozero
 		(void *to, const void __user *from, unsigned long n);
-unsigned long __must_check __copy_from_user_ll_nocache
-		(void *to, const void __user *from, unsigned long n);
 unsigned long __must_check __copy_from_user_ll_nocache_nozero
 		(void *to, const void __user *from, unsigned long n);
 
@@ -119,34 +117,6 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
 	return __copy_from_user_ll(to, from, n);
 }
 
-static __always_inline unsigned long __copy_from_user_nocache(void *to,
-				const void __user *from, unsigned long n)
-{
-	might_fault();
-	if (__builtin_constant_p(n)) {
-		unsigned long ret;
-
-		switch (n) {
-		case 1:
-			__uaccess_begin();
-			__get_user_size(*(u8 *)to, from, 1, ret, 1);
-			__uaccess_end();
-			return ret;
-		case 2:
-			__uaccess_begin();
-			__get_user_size(*(u16 *)to, from, 2, ret, 2);
-			__uaccess_end();
-			return ret;
-		case 4:
-			__uaccess_begin();
-			__get_user_size(*(u32 *)to, from, 4, ret, 4);
-			__uaccess_end();
-			return ret;
-		}
-	}
-	return __copy_from_user_ll_nocache(to, from, n);
-}
-
 static __always_inline unsigned long
 __copy_from_user_inatomic_nocache(void *to, const void __user *from,
 				  unsigned long n)
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 142f0f1230be..242936b0cb4b 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -260,14 +260,6 @@ __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 extern long __copy_user_nocache(void *dst, const void __user *src,
 				unsigned size, int zerorest);
 
-static inline int
-__copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
-{
-	might_fault();
-	kasan_check_write(dst, size);
-	return __copy_user_nocache(dst, src, size, 1);
-}
-
 static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 				  unsigned size)
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f65ff6540f0..02aa7aa8b9f3 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -293,105 +293,6 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
 	return size;
 }
 
-/*
- * Non Temporal Hint version of __copy_user_zeroing_intel.  It is cache aware.
- * hyoshiok@miraclelinux.com
- */
-
-static unsigned long __copy_user_zeroing_intel_nocache(void *to,
-				const void __user *from, unsigned long size)
-{
-	int d0, d1;
-
-	__asm__ __volatile__(
-	       "        .align 2,0x90\n"
-	       "0:      movl 32(%4), %%eax\n"
-	       "        cmpl $67, %0\n"
-	       "        jbe 2f\n"
-	       "1:      movl 64(%4), %%eax\n"
-	       "        .align 2,0x90\n"
-	       "2:      movl 0(%4), %%eax\n"
-	       "21:     movl 4(%4), %%edx\n"
-	       "        movnti %%eax, 0(%3)\n"
-	       "        movnti %%edx, 4(%3)\n"
-	       "3:      movl 8(%4), %%eax\n"
-	       "31:     movl 12(%4),%%edx\n"
-	       "        movnti %%eax, 8(%3)\n"
-	       "        movnti %%edx, 12(%3)\n"
-	       "4:      movl 16(%4), %%eax\n"
-	       "41:     movl 20(%4), %%edx\n"
-	       "        movnti %%eax, 16(%3)\n"
-	       "        movnti %%edx, 20(%3)\n"
-	       "10:     movl 24(%4), %%eax\n"
-	       "51:     movl 28(%4), %%edx\n"
-	       "        movnti %%eax, 24(%3)\n"
-	       "        movnti %%edx, 28(%3)\n"
-	       "11:     movl 32(%4), %%eax\n"
-	       "61:     movl 36(%4), %%edx\n"
-	       "        movnti %%eax, 32(%3)\n"
-	       "        movnti %%edx, 36(%3)\n"
-	       "12:     movl 40(%4), %%eax\n"
-	       "71:     movl 44(%4), %%edx\n"
-	       "        movnti %%eax, 40(%3)\n"
-	       "        movnti %%edx, 44(%3)\n"
-	       "13:     movl 48(%4), %%eax\n"
-	       "81:     movl 52(%4), %%edx\n"
-	       "        movnti %%eax, 48(%3)\n"
-	       "        movnti %%edx, 52(%3)\n"
-	       "14:     movl 56(%4), %%eax\n"
-	       "91:     movl 60(%4), %%edx\n"
-	       "        movnti %%eax, 56(%3)\n"
-	       "        movnti %%edx, 60(%3)\n"
-	       "        addl $-64, %0\n"
-	       "        addl $64, %4\n"
-	       "        addl $64, %3\n"
-	       "        cmpl $63, %0\n"
-	       "        ja  0b\n"
-	       "        sfence \n"
-	       "5:      movl  %0, %%eax\n"
-	       "        shrl  $2, %0\n"
-	       "        andl $3, %%eax\n"
-	       "        cld\n"
-	       "6:      rep; movsl\n"
-	       "        movl %%eax,%0\n"
-	       "7:      rep; movsb\n"
-	       "8:\n"
-	       ".section .fixup,\"ax\"\n"
-	       "9:      lea 0(%%eax,%0,4),%0\n"
-	       "16:     pushl %0\n"
-	       "        pushl %%eax\n"
-	       "        xorl %%eax,%%eax\n"
-	       "        rep; stosb\n"
-	       "        popl %%eax\n"
-	       "        popl %0\n"
-	       "        jmp 8b\n"
-	       ".previous\n"
-	       _ASM_EXTABLE(0b,16b)
-	       _ASM_EXTABLE(1b,16b)
-	       _ASM_EXTABLE(2b,16b)
-	       _ASM_EXTABLE(21b,16b)
-	       _ASM_EXTABLE(3b,16b)
-	       _ASM_EXTABLE(31b,16b)
-	       _ASM_EXTABLE(4b,16b)
-	       _ASM_EXTABLE(41b,16b)
-	       _ASM_EXTABLE(10b,16b)
-	       _ASM_EXTABLE(51b,16b)
-	       _ASM_EXTABLE(11b,16b)
-	       _ASM_EXTABLE(61b,16b)
-	       _ASM_EXTABLE(12b,16b)
-	       _ASM_EXTABLE(71b,16b)
-	       _ASM_EXTABLE(13b,16b)
-	       _ASM_EXTABLE(81b,16b)
-	       _ASM_EXTABLE(14b,16b)
-	       _ASM_EXTABLE(91b,16b)
-	       _ASM_EXTABLE(6b,9b)
-	       _ASM_EXTABLE(7b,16b)
-	       : "=&c"(size), "=&D" (d0), "=&S" (d1)
-	       :  "1"(to), "2"(from), "0"(size)
-	       : "eax", "edx", "memory");
-	return size;
-}
-
 static unsigned long __copy_user_intel_nocache(void *to,
 				const void __user *from, unsigned long size)
 {
@@ -490,8 +391,6 @@ unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
 					unsigned long size);
 unsigned long __copy_user_intel(void __user *to, const void *from,
 					unsigned long size);
-unsigned long __copy_user_zeroing_intel_nocache(void *to,
-				const void __user *from, unsigned long size);
 #endif /* CONFIG_X86_INTEL_USERCOPY */
 
 /* Generic arbitrary sized copy.  */
@@ -607,23 +506,6 @@ unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
 }
 EXPORT_SYMBOL(__copy_from_user_ll_nozero);
 
-unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
-					unsigned long n)
-{
-	stac();
-#ifdef CONFIG_X86_INTEL_USERCOPY
-	if (n > 64 && static_cpu_has(X86_FEATURE_XMM2))
-		n = __copy_user_zeroing_intel_nocache(to, from, n);
-	else
-		__copy_user_zeroing(to, from, n);
-#else
-	__copy_user_zeroing(to, from, n);
-#endif
-	clac();
-	return n;
-}
-EXPORT_SYMBOL(__copy_from_user_ll_nocache);
-
 unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
 					unsigned long n)
 {
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 5f76bc995d96..7fc2104b88bc 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -261,12 +261,6 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to,
 	return __copy_from_user_inatomic(to, from, n);
 }
 
-static inline unsigned long __copy_from_user_nocache(void *to,
-				const void __user *from, unsigned long n)
-{
-	return __copy_from_user(to, from, n);
-}
-
 #endif		/* ARCH_HAS_NOCACHE_UACCESS */
 
 /*
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 97db876c6862..672c32f9f960 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -604,7 +604,7 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
 		return 0;
 	}
 	iterate_and_advance(i, bytes, v,
-		__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
+		__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
 					 v.iov_base, v.iov_len),
 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
 				 v.bv_offset, v.bv_len),
@@ -625,7 +625,7 @@ bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
 	if (unlikely(i->count < bytes))
 		return false;
 	iterate_all_kinds(i, bytes, v, ({
-		if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
+		if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
 					     v.iov_base, v.iov_len))
 			return false;
 		0;}),
-- 
cgit v1.3-6-gb490


From 1671d522cdd9933dee7dddfcf9f62c561283824a Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Mon, 27 Mar 2017 20:06:57 +0800
Subject: block: rename blk_mq_freeze_queue_start()

As the .q_usage_counter is used by both legacy and
mq path, we need to block new I/O if queue becomes
dead in blk_queue_enter().

So rename it and we can use this function in both
paths.

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c                  |  2 +-
 block/blk-mq.c                    | 10 +++++-----
 drivers/block/mtip32xx/mtip32xx.c |  2 +-
 drivers/nvme/host/core.c          |  2 +-
 include/linux/blk-mq.h            |  2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6373febc7716..7b66f76f9cff 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -670,7 +670,7 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
 			return -EBUSY;
 
 		/*
-		 * read pair of barrier in blk_mq_freeze_queue_start(),
+		 * read pair of barrier in blk_freeze_queue_start(),
 		 * we need to order reading __PERCPU_REF_DEAD flag of
 		 * .q_usage_counter and reading .mq_freeze_depth,
 		 * otherwise the following wait may never return if the
diff --git a/block/blk-mq.c b/block/blk-mq.c
index baebd6c8210e..0ed00eca4d5a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -68,7 +68,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
 }
 
-void blk_mq_freeze_queue_start(struct request_queue *q)
+void blk_freeze_queue_start(struct request_queue *q)
 {
 	int freeze_depth;
 
@@ -78,7 +78,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 		blk_mq_run_hw_queues(q, false);
 	}
 }
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 
 void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
@@ -108,7 +108,7 @@ void blk_freeze_queue(struct request_queue *q)
 	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
 	 * exported to drivers as the only user for unfreeze is blk_mq.
 	 */
-	blk_mq_freeze_queue_start(q);
+	blk_freeze_queue_start(q);
 	blk_mq_freeze_queue_wait(q);
 }
 
@@ -746,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	 * percpu_ref_tryget directly, because we need to be able to
 	 * obtain a reference even in the short window between the queue
 	 * starting to freeze, by dropping the first reference in
-	 * blk_mq_freeze_queue_start, and the moment the last request is
+	 * blk_freeze_queue_start, and the moment the last request is
 	 * consumed, marked by the instant q_usage_counter reaches
 	 * zero.
 	 */
@@ -2376,7 +2376,7 @@ static void blk_mq_queue_reinit_work(void)
 	 * take place in parallel.
 	 */
 	list_for_each_entry(q, &all_q_list, all_q_node)
-		blk_mq_freeze_queue_start(q);
+		blk_freeze_queue_start(q);
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_freeze_queue_wait(q);
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index f96ab717534c..c96c35ab39df 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4162,7 +4162,7 @@ static int mtip_block_remove(struct driver_data *dd)
 		dev_info(&dd->pdev->dev, "device %s surprise removal\n",
 						dd->disk->disk_name);
 
-	blk_mq_freeze_queue_start(dd->queue);
+	blk_freeze_queue_start(dd->queue);
 	blk_mq_stop_hw_queues(dd->queue);
 	blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9b3b57fef446..4a6d7f408769 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2386,7 +2386,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 
 	mutex_lock(&ctrl->namespaces_mutex);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_freeze_queue_start(ns->queue);
+		blk_freeze_queue_start(ns->queue);
 	mutex_unlock(&ctrl->namespaces_mutex);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5b3e201c8d4f..ea2e9dcd3aef 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -243,7 +243,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 		busy_tag_iter_fn *fn, void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
-void blk_mq_freeze_queue_start(struct request_queue *q);
+void blk_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
-- 
cgit v1.3-6-gb490


From 334335d2f7a077a5ff561d86b0ad43bedd83ca05 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Tue, 28 Mar 2017 16:12:15 -0700
Subject: block: warn if sharing request queue across gendisks

Now that the remaining drivers have been converted to one request queue
per gendisk, let's warn if a request queue gets registered more than
once. This will catch future drivers which might do it inadvertently or
any old drivers that I may have missed.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-sysfs.c      | 7 +++++++
 include/linux/blkdev.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 7f090dd15ca6..833fb7f9ce9d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -871,6 +871,11 @@ int blk_register_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return -ENXIO;
 
+	WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+		  "%s is registering an already registered queue\n",
+		  kobject_name(&dev->kobj));
+	queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
+
 	/*
 	 * SCSI probing may synchronously create and destroy a lot of
 	 * request_queues for non-existent devices.  Shutting down a fully
@@ -931,6 +936,8 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return;
 
+	queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
+
 	if (q->mq_ops)
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1a7dc42a8918..a2dc6b390d48 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -617,6 +617,7 @@ struct request_queue {
 #define QUEUE_FLAG_STATS       27	/* track rq completion times */
 #define QUEUE_FLAG_RESTART     28	/* queue needs restart at completion */
 #define QUEUE_FLAG_POLL_STATS  29	/* collecting stats for hybrid polling */
+#define QUEUE_FLAG_REGISTERED  30	/* queue has been registered to a disk */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
-- 
cgit v1.3-6-gb490


From d708f0d5026f48081debdd1c5b0a5636455a9589 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 29 Mar 2017 11:25:48 -0600
Subject: Revert "blkcg: allocate struct blkcg_gq outside request queue
 spinlock"

I inadvertently applied the v5 version of this patch, whereas
the agreed upon version was v5. Revert this one so we can apply
the right one.

This reverts commit 7fc6b87a9ff537e7df32b1278118ce9c5bcd6788.
---
 block/blk-cgroup.c         | 138 +++++++++++++++++----------------------------
 include/linux/blk-cgroup.h |   6 +-
 2 files changed, 53 insertions(+), 91 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bdf87f0c1b1b..bbe7ee00bd3d 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -165,18 +165,16 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 
 /*
- * If gfp mask allows blocking, this function temporarily drops rcu and queue
- * locks to allocate memory.
+ * If @new_blkg is %NULL, this function tries to allocate a new one as
+ * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
  */
 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
-				    struct request_queue *q, gfp_t gfp,
-				    const struct blkcg_policy *pol)
+				    struct request_queue *q,
+				    struct blkcg_gq *new_blkg)
 {
-	struct blkcg_gq *blkg = NULL;
+	struct blkcg_gq *blkg;
 	struct bdi_writeback_congested *wb_congested;
 	int i, ret;
-	const bool drop_locks = gfpflags_allow_blocking(gfp);
-	bool preloaded = false;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
@@ -187,53 +185,31 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		goto err_free_blkg;
 	}
 
-	if (drop_locks) {
-		spin_unlock_irq(q->queue_lock);
-		rcu_read_unlock();
-	}
-
 	wb_congested = wb_congested_get_create(q->backing_dev_info,
-					       blkcg->css.id, gfp);
-	blkg = blkg_alloc(blkcg, q, gfp);
-
-	if (drop_locks) {
-		preloaded = !radix_tree_preload(gfp);
-		rcu_read_lock();
-		spin_lock_irq(q->queue_lock);
-	}
-
-	if (unlikely(!wb_congested || !blkg)) {
+					       blkcg->css.id,
+					       GFP_NOWAIT | __GFP_NOWARN);
+	if (!wb_congested) {
 		ret = -ENOMEM;
-		goto err_put;
+		goto err_put_css;
 	}
 
-	blkg->wb_congested = wb_congested;
-
-	if (pol) {
-		WARN_ON(!drop_locks);
-
-		if (!blkcg_policy_enabled(q, pol)) {
-			ret = -EOPNOTSUPP;
-			goto err_put;
-		}
-
-		/*
-		 * This could be the first entry point of blkcg implementation
-		 * and we shouldn't allow anything to go through for a bypassing
-		 * queue.
-		 */
-		if (unlikely(blk_queue_bypass(q))) {
-			ret = blk_queue_dying(q) ? -ENODEV : -EBUSY;
-			goto err_put;
+	/* allocate */
+	if (!new_blkg) {
+		new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
+		if (unlikely(!new_blkg)) {
+			ret = -ENOMEM;
+			goto err_put_congested;
 		}
 	}
+	blkg = new_blkg;
+	blkg->wb_congested = wb_congested;
 
 	/* link parent */
 	if (blkcg_parent(blkcg)) {
 		blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
 		if (WARN_ON_ONCE(!blkg->parent)) {
 			ret = -ENODEV;
-			goto err_put;
+			goto err_put_congested;
 		}
 		blkg_get(blkg->parent);
 	}
@@ -260,9 +236,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 				pol->pd_online_fn(blkg->pd[i]);
 		}
 	}
-
-	if (preloaded)
-		radix_tree_preload_end();
 	blkg->online = true;
 	spin_unlock(&blkcg->lock);
 
@@ -273,45 +246,44 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	blkg_put(blkg);
 	return ERR_PTR(ret);
 
-err_put:
-	if (preloaded)
-		radix_tree_preload_end();
-	if (wb_congested)
-		wb_congested_put(wb_congested);
+err_put_congested:
+	wb_congested_put(wb_congested);
+err_put_css:
 	css_put(&blkcg->css);
 err_free_blkg:
-	blkg_free(blkg);
+	blkg_free(new_blkg);
 	return ERR_PTR(ret);
 }
 
 /**
- * __blkg_lookup_create - lookup blkg, try to create one if not there
+ * blkg_lookup_create - lookup blkg, try to create one if not there
  * @blkcg: blkcg of interest
  * @q: request_queue of interest
- * @gfp: gfp mask
- * @pol: blkcg policy (optional)
  *
  * Lookup blkg for the @blkcg - @q pair.  If it doesn't exist, try to
  * create one.  blkg creation is performed recursively from blkcg_root such
  * that all non-root blkg's have access to the parent blkg.  This function
  * should be called under RCU read lock and @q->queue_lock.
  *
- * When gfp mask allows blocking, rcu and queue locks may be dropped for
- * allocating memory. In this case, the locks will be reacquired on return.
- *
  * Returns pointer to the looked up or created blkg on success, ERR_PTR()
  * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
  * dead and bypassing, returns ERR_PTR(-EBUSY).
  */
-struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
-				      struct request_queue *q, gfp_t gfp,
-				      const struct blkcg_policy *pol)
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q)
 {
 	struct blkcg_gq *blkg;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
 
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q)))
+		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
 	blkg = __blkg_lookup(blkcg, q, true);
 	if (blkg)
 		return blkg;
@@ -329,35 +301,12 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 			parent = blkcg_parent(parent);
 		}
 
-		blkg = blkg_create(pos, q, gfp, pol);
+		blkg = blkg_create(pos, q, NULL);
 		if (pos == blkcg || IS_ERR(blkg))
 			return blkg;
 	}
 }
 
-/**
- * blkg_lookup_create - lookup blkg, try to create one if not there
- *
- * Performs an initial queue bypass check and then passes control to
- * __blkg_lookup_create().
- */
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-				    struct request_queue *q, gfp_t gfp,
-				    const struct blkcg_policy *pol)
-{
-	WARN_ON_ONCE(!rcu_read_lock_held());
-	lockdep_assert_held(q->queue_lock);
-
-	/*
-	 * This could be the first entry point of blkcg implementation and
-	 * we shouldn't allow anything to go through for a bypassing queue.
-	 */
-	if (unlikely(blk_queue_bypass(q)))
-		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
-
-	return __blkg_lookup_create(blkcg, q, gfp, pol);
-}
-
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
 	struct blkcg *blkcg = blkg->blkcg;
@@ -868,7 +817,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	spin_lock_irq(disk->queue->queue_lock);
 
 	if (blkcg_policy_enabled(disk->queue, pol))
-		blkg = blkg_lookup_create(blkcg, disk->queue, GFP_KERNEL, pol);
+		blkg = blkg_lookup_create(blkcg, disk->queue);
 	else
 		blkg = ERR_PTR(-EOPNOTSUPP);
 
@@ -1107,15 +1056,30 @@ free_blkcg:
  */
 int blkcg_init_queue(struct request_queue *q)
 {
-	struct blkcg_gq *blkg;
+	struct blkcg_gq *new_blkg, *blkg;
+	bool preloaded;
 	int ret;
 
+	new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+	if (!new_blkg)
+		return -ENOMEM;
+
+	preloaded = !radix_tree_preload(GFP_KERNEL);
+
+	/*
+	 * Make sure the root blkg exists and count the existing blkgs.  As
+	 * @q is bypassing at this point, blkg_lookup_create() can't be
+	 * used.  Open code insertion.
+	 */
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
-	blkg = __blkg_lookup_create(&blkcg_root, q, GFP_KERNEL, NULL);
+	blkg = blkg_create(&blkcg_root, q, new_blkg);
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 
+	if (preloaded)
+		radix_tree_preload_end();
+
 	if (IS_ERR(blkg))
 		return PTR_ERR(blkg);
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 955903a8f6cb..01b62e7bac74 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -172,8 +172,7 @@ extern struct cgroup_subsys_state * const blkcg_root_css;
 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 				      struct request_queue *q, bool update_hint);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-				    struct request_queue *q, gfp_t gfp,
-				    const struct blkcg_policy *pol);
+				    struct request_queue *q);
 int blkcg_init_queue(struct request_queue *q);
 void blkcg_drain_queue(struct request_queue *q);
 void blkcg_exit_queue(struct request_queue *q);
@@ -695,8 +694,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
 		spin_lock_irq(q->queue_lock);
-		blkg = blkg_lookup_create(blkcg, q, GFP_NOWAIT | __GFP_NOWARN,
-					  NULL);
+		blkg = blkg_lookup_create(blkcg, q);
 		if (IS_ERR(blkg))
 			blkg = NULL;
 		spin_unlock_irq(q->queue_lock);
-- 
cgit v1.3-6-gb490


From 44fe84459faf1a7781595b7c64cd36daf2f2827d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 27 Mar 2017 13:54:38 +0200
Subject: locking/atomic: Fix atomic_try_cmpxchg() semantics

Dmitry noted that the new atomic_try_cmpxchg() primitive is broken when
the old pointer doesn't point to the local stack.

He writes:

  "Consider a classical lock-free stack push:

    node->next = atomic_read(&head);
    do {
    } while (!atomic_try_cmpxchg(&head, &node->next, node));

  This code is broken with the current implementation, the problem is
  with unconditional update of *__po.

  In case of success it writes the same value back into *__po, but in
  case of cmpxchg success we might have lose ownership of some memory
  locations and potentially over what __po has pointed to. The same
  holds for the re-read of *__po. "

He also points out that this makes it surprisingly different from the
similar C/C++ atomic operation.

After investigating the code-gen differences caused by this patch; and
a number of alternatives (Linus dislikes this interface lots), we
arrived at these results (size x86_64-defconfig/vmlinux):

  GCC-6.3.0:

  10735757        cmpxchg
  10726413        try_cmpxchg
  10730509        try_cmpxchg + patch
  10730445        try_cmpxchg-linus

  GCC-7 (20170327):

  10709514        cmpxchg
  10704266        try_cmpxchg
  10704266        try_cmpxchg + patch
  10704394        try_cmpxchg-linus

From this we see that the patch has the advantage of better code-gen
on GCC-7 and keeps the interface roughly consistent with the C
language variant.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Fixes: a9ebf306f52c ("locking/atomic: Introduce atomic_try_cmpxchg()")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cmpxchg.h |  5 +++--
 include/linux/atomic.h         | 16 ++++++++++------
 2 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index fb961db51a2a..d90296d061e8 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -212,8 +212,9 @@ extern void __add_wrong_size(void)
 	default:							\
 		__cmpxchg_wrong_size();					\
 	}								\
-	*_old = __old;							\
-	success;							\
+	if (unlikely(!success))						\
+		*_old = __old;						\
+	likely(success);						\
 })
 
 #define __try_cmpxchg(ptr, pold, new, size)				\
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index aae5953817d6..c56be7410130 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -428,9 +428,11 @@
 #define __atomic_try_cmpxchg(type, _p, _po, _n)				\
 ({									\
 	typeof(_po) __po = (_po);					\
-	typeof(*(_po)) __o = *__po;					\
-	*__po = atomic_cmpxchg##type((_p), __o, (_n));			\
-	(*__po == __o);							\
+	typeof(*(_po)) __r, __o = *__po;				\
+	__r = atomic_cmpxchg##type((_p), __o, (_n));			\
+	if (unlikely(__r != __o))					\
+		*__po = __r;						\
+	likely(__r == __o);						\
 })
 
 #define atomic_try_cmpxchg(_p, _po, _n)		__atomic_try_cmpxchg(, _p, _po, _n)
@@ -1022,9 +1024,11 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 #define __atomic64_try_cmpxchg(type, _p, _po, _n)			\
 ({									\
 	typeof(_po) __po = (_po);					\
-	typeof(*(_po)) __o = *__po;					\
-	*__po = atomic64_cmpxchg##type((_p), __o, (_n));		\
-	(*__po == __o);							\
+	typeof(*(_po)) __r, __o = *__po;				\
+	__r = atomic64_cmpxchg##type((_p), __o, (_n));			\
+	if (unlikely(__r != __o))					\
+		*__po = __r;						\
+	likely(__r == __o);						\
 })
 
 #define atomic64_try_cmpxchg(_p, _po, _n)		__atomic64_try_cmpxchg(, _p, _po, _n)
-- 
cgit v1.3-6-gb490


From 19d436268dde95389c616bb3819da73f0a8b28a8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 25 Feb 2017 08:56:53 +0100
Subject: debug: Add _ONCE() logic to report_bug()

Josh suggested moving the _ONCE logic inside the trap handler, using a
bit in the bug_entry::flags field, avoiding the need for the extra
variable.

Sadly this only works for WARN_ON_ONCE(), since the others have
printk() statements prior to triggering the trap.

Still, this saves a fair amount of text and some data:

  text         data       filename
  10682460     4530992    defconfig-build/vmlinux.orig
  10665111     4530096    defconfig-build/vmlinux.patched

Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm64/include/asm/bug.h      |  2 +-
 arch/parisc/include/asm/bug.h     |  8 ++++----
 arch/powerpc/include/asm/bug.h    |  4 ++--
 arch/s390/include/asm/bug.h       |  4 ++--
 arch/sh/include/asm/bug.h         |  4 ++--
 arch/x86/include/asm/bug.h        |  2 +-
 include/asm-generic/bug.h         | 18 +++++++++++++++++-
 include/asm-generic/vmlinux.lds.h |  5 ++---
 include/linux/bug.h               |  2 +-
 lib/bug.c                         | 28 ++++++++++++++++++++--------
 10 files changed, 52 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h
index 561190d15881..a9be1072933c 100644
--- a/arch/arm64/include/asm/bug.h
+++ b/arch/arm64/include/asm/bug.h
@@ -55,7 +55,7 @@ _BUGVERBOSE_LOCATION(__FILE__, __LINE__)		\
 	unreachable();				\
 } while (0)
 
-#define __WARN_TAINT(taint) _BUG_FLAGS(BUGFLAG_TAINT(taint))
+#define __WARN_FLAGS(flags) _BUG_FLAGS(BUGFLAG_WARNING|(flags))
 
 #endif /* ! CONFIG_GENERIC_BUG */
 
diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h
index 62a33338549c..d2742273a685 100644
--- a/arch/parisc/include/asm/bug.h
+++ b/arch/parisc/include/asm/bug.h
@@ -46,7 +46,7 @@
 #endif
 
 #ifdef CONFIG_DEBUG_BUGVERBOSE
-#define __WARN_TAINT(taint)						\
+#define __WARN_FLAGS(flags)						\
 	do {								\
 		asm volatile("\n"					\
 			     "1:\t" PARISC_BUG_BREAK_ASM "\n"		\
@@ -56,11 +56,11 @@
 			     "\t.org 2b+%c3\n"				\
 			     "\t.popsection"				\
 			     : : "i" (__FILE__), "i" (__LINE__),	\
-			     "i" (BUGFLAG_TAINT(taint)), 		\
+			     "i" (BUGFLAG_WARNING|(flags)),		\
 			     "i" (sizeof(struct bug_entry)) );		\
 	} while(0)
 #else
-#define __WARN_TAINT(taint)						\
+#define __WARN_FLAGS(flags)						\
 	do {								\
 		asm volatile("\n"					\
 			     "1:\t" PARISC_BUG_BREAK_ASM "\n"		\
@@ -69,7 +69,7 @@
 			     "\t.short %c0\n"				\
 			     "\t.org 2b+%c1\n"				\
 			     "\t.popsection"				\
-			     : : "i" (BUGFLAG_TAINT(taint)),		\
+			     : : "i" (BUGFLAG_WARNING|(flags)),		\
 			     "i" (sizeof(struct bug_entry)) );		\
 	} while(0)
 #endif
diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 3a39283333c3..f2c562a0a427 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -85,12 +85,12 @@
 	}							\
 } while (0)
 
-#define __WARN_TAINT(taint) do {				\
+#define __WARN_FLAGS(flags) do {				\
 	__asm__ __volatile__(					\
 		"1:	twi 31,0,0\n"				\
 		_EMIT_BUG_ENTRY					\
 		: : "i" (__FILE__), "i" (__LINE__),		\
-		  "i" (BUGFLAG_TAINT(taint)),			\
+		  "i" (BUGFLAG_WARNING|(flags)),		\
 		  "i" (sizeof(struct bug_entry)));		\
 } while (0)
 
diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h
index bf90d1fd97a5..1bbd9dbfe4e0 100644
--- a/arch/s390/include/asm/bug.h
+++ b/arch/s390/include/asm/bug.h
@@ -46,8 +46,8 @@
 	unreachable();					\
 } while (0)
 
-#define __WARN_TAINT(taint) do {			\
-	__EMIT_BUG(BUGFLAG_TAINT(taint));		\
+#define __WARN_FLAGS(flags) do {			\
+	__EMIT_BUG(BUGFLAG_WARNING|(flags));		\
 } while (0)
 
 #define WARN_ON(x) ({					\
diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h
index dcf278075429..1b77f068be2b 100644
--- a/arch/sh/include/asm/bug.h
+++ b/arch/sh/include/asm/bug.h
@@ -50,7 +50,7 @@ do {							\
 		   "i" (sizeof(struct bug_entry)));	\
 } while (0)
 
-#define __WARN_TAINT(taint)				\
+#define __WARN_FLAGS(flags)				\
 do {							\
 	__asm__ __volatile__ (				\
 		"1:\t.short %O0\n"			\
@@ -59,7 +59,7 @@ do {							\
 		 : "n" (TRAPA_BUG_OPCODE),		\
 		   "i" (__FILE__),			\
 		   "i" (__LINE__),			\
-		   "i" (BUGFLAG_TAINT(taint)),		\
+		   "i" (BUGFLAG_WARNING|(flags)),	\
 		   "i" (sizeof(struct bug_entry)));	\
 } while (0)
 
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index cecf559d0012..39e702d90cdb 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -76,7 +76,7 @@ do {								\
 	unreachable();						\
 } while (0)
 
-#define __WARN_TAINT(taint)	_BUG_FLAGS(ASM_UD0, BUGFLAG_TAINT(taint))
+#define __WARN_FLAGS(flags)	_BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags))
 
 #include <asm-generic/bug.h>
 
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 6f96247226a4..5fe9f39cec44 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -5,6 +5,8 @@
 
 #ifdef CONFIG_GENERIC_BUG
 #define BUGFLAG_WARNING		(1 << 0)
+#define BUGFLAG_ONCE		(1 << 1)
+#define BUGFLAG_DONE		(1 << 2)
 #define BUGFLAG_TAINT(taint)	(BUGFLAG_WARNING | ((taint) << 8))
 #define BUG_GET_TAINT(bug)	((bug)->flags >> 8)
 #endif
@@ -55,6 +57,18 @@ struct bug_entry {
 #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0)
 #endif
 
+#ifdef __WARN_FLAGS
+#define __WARN_TAINT(taint)		__WARN_FLAGS(BUGFLAG_TAINT(taint))
+#define __WARN_ONCE_TAINT(taint)	__WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint))
+
+#define WARN_ON_ONCE(condition) ({				\
+	int __ret_warn_on = !!(condition);			\
+	if (unlikely(__ret_warn_on))				\
+		__WARN_ONCE_TAINT(TAINT_WARN);			\
+	unlikely(__ret_warn_on);				\
+})
+#endif
+
 /*
  * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
  * significant issues that need prompt attention if they should ever
@@ -97,7 +111,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 #endif
 
 #ifndef WARN
-#define WARN(condition, format...) ({						\
+#define WARN(condition, format...) ({					\
 	int __ret_warn_on = !!(condition);				\
 	if (unlikely(__ret_warn_on))					\
 		__WARN_printf(format);					\
@@ -112,6 +126,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 	unlikely(__ret_warn_on);					\
 })
 
+#ifndef WARN_ON_ONCE
 #define WARN_ON_ONCE(condition)	({				\
 	static bool __section(.data.unlikely) __warned;		\
 	int __ret_warn_once = !!(condition);			\
@@ -122,6 +137,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 	}							\
 	unlikely(__ret_warn_once);				\
 })
+#endif
 
 #define WARN_ONCE(condition, format...)	({			\
 	static bool __section(.data.unlikely) __warned;		\
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 0968d13b3885..51c8dbe8e8d9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -286,8 +286,6 @@
 		*(.rodata1)						\
 	}								\
 									\
-	BUG_TABLE							\
-									\
 	/* PCI quirks */						\
 	.pci_fixup        : AT(ADDR(.pci_fixup) - LOAD_OFFSET) {	\
 		VMLINUX_SYMBOL(__start_pci_fixups_early) = .;		\
@@ -855,7 +853,8 @@
 		READ_MOSTLY_DATA(cacheline)				\
 		DATA_DATA						\
 		CONSTRUCTORS						\
-	}
+	}								\
+	BUG_TABLE
 
 #define INIT_TEXT_SECTION(inittext_align)				\
 	. = ALIGN(inittext_align);					\
diff --git a/include/linux/bug.h b/include/linux/bug.h
index 5828489309bb..687b557fc5eb 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -105,7 +105,7 @@ static inline int is_warning_bug(const struct bug_entry *bug)
 	return bug->flags & BUGFLAG_WARNING;
 }
 
-const struct bug_entry *find_bug(unsigned long bugaddr);
+struct bug_entry *find_bug(unsigned long bugaddr);
 
 enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
 
diff --git a/lib/bug.c b/lib/bug.c
index 06edbbef0623..a6a1137d06db 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -47,7 +47,7 @@
 #include <linux/sched.h>
 #include <linux/rculist.h>
 
-extern const struct bug_entry __start___bug_table[], __stop___bug_table[];
+extern struct bug_entry __start___bug_table[], __stop___bug_table[];
 
 static inline unsigned long bug_addr(const struct bug_entry *bug)
 {
@@ -62,10 +62,10 @@ static inline unsigned long bug_addr(const struct bug_entry *bug)
 /* Updates are protected by module mutex */
 static LIST_HEAD(module_bug_list);
 
-static const struct bug_entry *module_find_bug(unsigned long bugaddr)
+static struct bug_entry *module_find_bug(unsigned long bugaddr)
 {
 	struct module *mod;
-	const struct bug_entry *bug = NULL;
+	struct bug_entry *bug = NULL;
 
 	rcu_read_lock_sched();
 	list_for_each_entry_rcu(mod, &module_bug_list, bug_list) {
@@ -122,15 +122,15 @@ void module_bug_cleanup(struct module *mod)
 
 #else
 
-static inline const struct bug_entry *module_find_bug(unsigned long bugaddr)
+static inline struct bug_entry *module_find_bug(unsigned long bugaddr)
 {
 	return NULL;
 }
 #endif
 
-const struct bug_entry *find_bug(unsigned long bugaddr)
+struct bug_entry *find_bug(unsigned long bugaddr)
 {
-	const struct bug_entry *bug;
+	struct bug_entry *bug;
 
 	for (bug = __start___bug_table; bug < __stop___bug_table; ++bug)
 		if (bugaddr == bug_addr(bug))
@@ -141,9 +141,9 @@ const struct bug_entry *find_bug(unsigned long bugaddr)
 
 enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 {
-	const struct bug_entry *bug;
+	struct bug_entry *bug;
 	const char *file;
-	unsigned line, warning;
+	unsigned line, warning, once, done;
 
 	if (!is_valid_bugaddr(bugaddr))
 		return BUG_TRAP_TYPE_NONE;
@@ -164,6 +164,18 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 		line = bug->line;
 #endif
 		warning = (bug->flags & BUGFLAG_WARNING) != 0;
+		once = (bug->flags & BUGFLAG_ONCE) != 0;
+		done = (bug->flags & BUGFLAG_DONE) != 0;
+
+		if (warning && once) {
+			if (done)
+				return BUG_TRAP_TYPE_WARN;
+
+			/*
+			 * Since this is the only store, concurrency is not an issue.
+			 */
+			bug->flags |= BUGFLAG_DONE;
+		}
 	}
 
 	if (warning) {
-- 
cgit v1.3-6-gb490


From cf25f904ef75aa7c25097eb4981bbc634bf5ff9e Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Fri, 24 Feb 2017 02:48:21 -0600
Subject: x86/events/amd/iommu: Add IOMMU-specific hw_perf_event struct
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Current AMD IOMMU perf PMU inappropriately uses the hardware struct
inside the union in struct hw_perf_event, extra_reg in particular.

Instead, introduce an AMD IOMMU-specific struct with required parameters
to be programmed into the IOMMU performance counter control register.

Update the pasid field from 16 to 20 bits while at it.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
[ Fixup macros, shorten get_next_avail_iommu_bnk_cntr() local vars, massage commit message. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Jörg Rödel <joro@8bytes.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: iommu@lists.linux-foundation.org
Link: http://lkml.kernel.org/r/1487926102-13073-10-git-send-email-Suravee.Suthikulpanit@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/amd/iommu.c | 113 ++++++++++++++++++++------------------------
 include/linux/perf_event.h  |   7 +++
 2 files changed, 57 insertions(+), 63 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index 7ac8138023cc..f0d94c8b382a 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -23,17 +23,16 @@
 
 #define COUNTER_SHIFT		16
 
-#define _GET_BANK(ev)       ((u8)(ev->hw.extra_reg.reg >> 8))
-#define _GET_CNTR(ev)       ((u8)(ev->hw.extra_reg.reg))
-
-/* iommu pmu config masks */
-#define _GET_CSOURCE(ev)    ((ev->hw.config & 0xFFULL))
-#define _GET_DEVID(ev)      ((ev->hw.config >> 8)  & 0xFFFFULL)
-#define _GET_PASID(ev)      ((ev->hw.config >> 24) & 0xFFFFULL)
-#define _GET_DOMID(ev)      ((ev->hw.config >> 40) & 0xFFFFULL)
-#define _GET_DEVID_MASK(ev) ((ev->hw.extra_reg.config)  & 0xFFFFULL)
-#define _GET_PASID_MASK(ev) ((ev->hw.extra_reg.config >> 16) & 0xFFFFULL)
-#define _GET_DOMID_MASK(ev) ((ev->hw.extra_reg.config >> 32) & 0xFFFFULL)
+/* iommu pmu conf masks */
+#define GET_CSOURCE(x)     ((x)->conf & 0xFFULL)
+#define GET_DEVID(x)       (((x)->conf >> 8)  & 0xFFFFULL)
+#define GET_DOMID(x)       (((x)->conf >> 24) & 0xFFFFULL)
+#define GET_PASID(x)       (((x)->conf >> 40) & 0xFFFFFULL)
+
+/* iommu pmu conf1 masks */
+#define GET_DEVID_MASK(x)  ((x)->conf1  & 0xFFFFULL)
+#define GET_DOMID_MASK(x)  (((x)->conf1 >> 16) & 0xFFFFULL)
+#define GET_PASID_MASK(x)  (((x)->conf1 >> 32) & 0xFFFFFULL)
 
 static struct perf_amd_iommu __perf_iommu;
 
@@ -50,11 +49,11 @@ struct perf_amd_iommu {
  *---------------------------------------------*/
 PMU_FORMAT_ATTR(csource,    "config:0-7");
 PMU_FORMAT_ATTR(devid,      "config:8-23");
-PMU_FORMAT_ATTR(pasid,      "config:24-39");
-PMU_FORMAT_ATTR(domid,      "config:40-55");
+PMU_FORMAT_ATTR(domid,      "config:24-39");
+PMU_FORMAT_ATTR(pasid,      "config:40-59");
 PMU_FORMAT_ATTR(devid_mask, "config1:0-15");
-PMU_FORMAT_ATTR(pasid_mask, "config1:16-31");
-PMU_FORMAT_ATTR(domid_mask, "config1:32-47");
+PMU_FORMAT_ATTR(domid_mask, "config1:16-31");
+PMU_FORMAT_ATTR(pasid_mask, "config1:32-51");
 
 static struct attribute *iommu_format_attrs[] = {
 	&format_attr_csource.attr,
@@ -150,30 +149,34 @@ static struct attribute_group amd_iommu_cpumask_group = {
 
 /*---------------------------------------------*/
 
-static int get_next_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu)
+static int get_next_avail_iommu_bnk_cntr(struct perf_event *event)
 {
+	struct perf_amd_iommu *piommu = container_of(event->pmu, struct perf_amd_iommu, pmu);
+	int max_cntrs = piommu->max_counters;
+	int max_banks = piommu->max_banks;
+	u32 shift, bank, cntr;
 	unsigned long flags;
-	int shift, bank, cntr, retval;
-	int max_banks = perf_iommu->max_banks;
-	int max_cntrs = perf_iommu->max_counters;
+	int retval;
 
-	raw_spin_lock_irqsave(&perf_iommu->lock, flags);
+	raw_spin_lock_irqsave(&piommu->lock, flags);
 
 	for (bank = 0, shift = 0; bank < max_banks; bank++) {
 		for (cntr = 0; cntr < max_cntrs; cntr++) {
 			shift = bank + (bank*3) + cntr;
-			if (perf_iommu->cntr_assign_mask & BIT_ULL(shift)) {
+			if (piommu->cntr_assign_mask & BIT_ULL(shift)) {
 				continue;
 			} else {
-				perf_iommu->cntr_assign_mask |= BIT_ULL(shift);
-				retval = ((bank & 0xFF) << 8) | (cntr & 0xFF);
+				piommu->cntr_assign_mask |= BIT_ULL(shift);
+				event->hw.iommu_bank = bank;
+				event->hw.iommu_cntr = cntr;
+				retval = 0;
 				goto out;
 			}
 		}
 	}
 	retval = -ENOSPC;
 out:
-	raw_spin_unlock_irqrestore(&perf_iommu->lock, flags);
+	raw_spin_unlock_irqrestore(&piommu->lock, flags);
 	return retval;
 }
 
@@ -202,8 +205,6 @@ static int clear_avail_iommu_bnk_cntr(struct perf_amd_iommu *perf_iommu,
 static int perf_iommu_event_init(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	struct perf_amd_iommu *perf_iommu;
-	u64 config, config1;
 
 	/* test the event attr type check for PMU enumeration */
 	if (event->attr.type != event->pmu->type)
@@ -225,21 +226,9 @@ static int perf_iommu_event_init(struct perf_event *event)
 	if (event->cpu < 0)
 		return -EINVAL;
 
-	perf_iommu = &__perf_iommu;
-
-	if (event->pmu != &perf_iommu->pmu)
-		return -ENOENT;
-
-	if (perf_iommu) {
-		config = event->attr.config;
-		config1 = event->attr.config1;
-	} else {
-		return -EINVAL;
-	}
-
 	/* update the hw_perf_event struct with the iommu config data */
-	hwc->config = config;
-	hwc->extra_reg.config = config1;
+	hwc->conf  = event->attr.config;
+	hwc->conf1 = event->attr.config1;
 
 	return 0;
 }
@@ -247,26 +236,28 @@ static int perf_iommu_event_init(struct perf_event *event)
 static void perf_iommu_enable_event(struct perf_event *ev)
 {
 	struct amd_iommu *iommu = get_amd_iommu(0);
-	u8 csource = _GET_CSOURCE(ev);
-	u16 devid = _GET_DEVID(ev);
-	u8 bank = _GET_BANK(ev);
-	u8 cntr = _GET_CNTR(ev);
+	struct hw_perf_event *hwc = &ev->hw;
+	u8 bank = hwc->iommu_bank;
+	u8 cntr = hwc->iommu_cntr;
 	u64 reg = 0ULL;
 
-	reg = csource;
+	reg = GET_CSOURCE(hwc);
 	amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_COUNTER_SRC_REG, &reg);
 
-	reg = devid | (_GET_DEVID_MASK(ev) << 32);
+	reg = GET_DEVID_MASK(hwc);
+	reg = GET_DEVID(hwc) | (reg << 32);
 	if (reg)
 		reg |= BIT(31);
 	amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DEVID_MATCH_REG, &reg);
 
-	reg = _GET_PASID(ev) | (_GET_PASID_MASK(ev) << 32);
+	reg = GET_PASID_MASK(hwc);
+	reg = GET_PASID(hwc) | (reg << 32);
 	if (reg)
 		reg |= BIT(31);
 	amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_PASID_MATCH_REG, &reg);
 
-	reg = _GET_DOMID(ev) | (_GET_DOMID_MASK(ev) << 32);
+	reg = GET_DOMID_MASK(hwc);
+	reg = GET_DOMID(hwc) | (reg << 32);
 	if (reg)
 		reg |= BIT(31);
 	amd_iommu_pc_set_reg(iommu, bank, cntr, IOMMU_PC_DOMID_MATCH_REG, &reg);
@@ -275,16 +266,16 @@ static void perf_iommu_enable_event(struct perf_event *ev)
 static void perf_iommu_disable_event(struct perf_event *event)
 {
 	struct amd_iommu *iommu = get_amd_iommu(0);
+	struct hw_perf_event *hwc = &event->hw;
 	u64 reg = 0ULL;
 
-	amd_iommu_pc_set_reg(iommu, _GET_BANK(event), _GET_CNTR(event),
+	amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
 			     IOMMU_PC_COUNTER_SRC_REG, &reg);
 }
 
 static void perf_iommu_start(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	struct amd_iommu *iommu = get_amd_iommu(0);
 
 	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
 		return;
@@ -293,8 +284,10 @@ static void perf_iommu_start(struct perf_event *event, int flags)
 	hwc->state = 0;
 
 	if (flags & PERF_EF_RELOAD) {
-		u64 prev_raw_count =  local64_read(&hwc->prev_count);
-		amd_iommu_pc_set_reg(iommu, _GET_BANK(event), _GET_CNTR(event),
+		u64 prev_raw_count = local64_read(&hwc->prev_count);
+		struct amd_iommu *iommu = get_amd_iommu(0);
+
+		amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
 				     IOMMU_PC_COUNTER_REG, &prev_raw_count);
 	}
 
@@ -309,7 +302,7 @@ static void perf_iommu_read(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	struct amd_iommu *iommu = get_amd_iommu(0);
 
-	if (amd_iommu_pc_get_reg(iommu, _GET_BANK(event), _GET_CNTR(event),
+	if (amd_iommu_pc_get_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr,
 				 IOMMU_PC_COUNTER_REG, &count))
 		return;
 
@@ -329,7 +322,6 @@ static void perf_iommu_read(struct perf_event *event)
 static void perf_iommu_stop(struct perf_event *event, int flags)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	u64 config;
 
 	if (hwc->state & PERF_HES_UPTODATE)
 		return;
@@ -341,7 +333,6 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
 	if (hwc->state & PERF_HES_UPTODATE)
 		return;
 
-	config = hwc->config;
 	perf_iommu_read(event);
 	hwc->state |= PERF_HES_UPTODATE;
 }
@@ -349,16 +340,12 @@ static void perf_iommu_stop(struct perf_event *event, int flags)
 static int perf_iommu_add(struct perf_event *event, int flags)
 {
 	int retval;
-	struct perf_amd_iommu *perf_iommu =
-			container_of(event->pmu, struct perf_amd_iommu, pmu);
 
 	event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 
 	/* request an iommu bank/counter */
-	retval = get_next_avail_iommu_bnk_cntr(perf_iommu);
-	if (retval != -ENOSPC)
-		event->hw.extra_reg.reg = (u16)retval;
-	else
+	retval = get_next_avail_iommu_bnk_cntr(event);
+	if (retval)
 		return retval;
 
 	if (flags & PERF_EF_START)
@@ -369,6 +356,7 @@ static int perf_iommu_add(struct perf_event *event, int flags)
 
 static void perf_iommu_del(struct perf_event *event, int flags)
 {
+	struct hw_perf_event *hwc = &event->hw;
 	struct perf_amd_iommu *perf_iommu =
 			container_of(event->pmu, struct perf_amd_iommu, pmu);
 
@@ -376,8 +364,7 @@ static void perf_iommu_del(struct perf_event *event, int flags)
 
 	/* clear the assigned iommu bank/counter */
 	clear_avail_iommu_bnk_cntr(perf_iommu,
-				     _GET_BANK(event),
-				     _GET_CNTR(event));
+				   hwc->iommu_bank, hwc->iommu_cntr);
 
 	perf_event_update_userpage(event);
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b6e75c9d4791..24a635887f28 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -165,6 +165,13 @@ struct hw_perf_event {
 			struct list_head		bp_list;
 		};
 #endif
+		struct { /* amd_iommu */
+			u8	iommu_bank;
+			u8	iommu_cntr;
+			u16	padding;
+			u64	conf;
+			u64	conf1;
+		};
 	};
 	/*
 	 * If the event is a per task event, this will point to the task in
-- 
cgit v1.3-6-gb490


From 5ba6bcbc335771c37d05b88cbfcad5441b57130b Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Tue, 7 Mar 2017 06:15:15 -0800
Subject: hwmon: Constify str parameter of hwmon_ops->read_string

The read_string callback is supposed to retrieve a pointer to a
constant string.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Reviewed-by: Peter Huewe <peterhuewe@gmx.de>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/hwmon.c | 2 +-
 include/linux/hwmon.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index 28375d59cc36..dd6e17c1076b 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -186,7 +186,7 @@ static ssize_t hwmon_attr_show_string(struct device *dev,
 				      char *buf)
 {
 	struct hwmon_device_attribute *hattr = to_hwmon_attr(devattr);
-	char *s;
+	const char *s;
 	int ret;
 
 	ret = hattr->ops->read_string(dev, hattr->type, hattr->attr,
diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h
index 88b673749121..ceb751987c40 100644
--- a/include/linux/hwmon.h
+++ b/include/linux/hwmon.h
@@ -337,7 +337,7 @@ struct hwmon_ops {
 	int (*read)(struct device *dev, enum hwmon_sensor_types type,
 		    u32 attr, int channel, long *val);
 	int (*read_string)(struct device *dev, enum hwmon_sensor_types type,
-		    u32 attr, int channel, char **str);
+		    u32 attr, int channel, const char **str);
 	int (*write)(struct device *dev, enum hwmon_sensor_types type,
 		     u32 attr, int channel, long val);
 };
-- 
cgit v1.3-6-gb490


From 815429b39d94c64f6d05eed9e7c1a9bdfdd5bd70 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Wed, 29 Mar 2017 19:30:17 +0900
Subject: extcon: Add new extcon_register_notifier_all() to monitor all
 external connectors

The extcon core already provides the extcon_register_notifier() function
in order to register the notifier block which is used to monitor
the state change for the specific external connector such as EXTCON_USB,
EXTCON_USB_HOST and so on. The extcon consumer uses the this function.

The extcon consumer might need to monitor the all supported external
connectors from the extcon device. In this case, The extcon consumer
should have each notifier_block structure for each external connector.

This patch adds the new extcon_register_notifier_all() function
that extcon consumer is able to monitor the state change of all
supported external connectors by using only one notifier_block structure.

- List of new added functions:
int extcon_register_notifier_all(struct extcon_dev *edev,
			struct notifier_block *nb);
int extcon_unregister_notifier_all(struct extcon_dev *edev,
			struct notifier_block *nb);
int devm_extcon_register_notifier_all(struct device *dev,
			struct extcon_dev *edev, struct notifier_block *nb);
void devm_extcon_unregister_notifier_all(struct device *dev,
			struct extcon_dev *edev, struct notifier_block *nb);

Suggested-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Tested-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/extcon/devres.c | 61 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/extcon/extcon.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/extcon/extcon.h |  3 +++
 include/linux/extcon.h  | 21 ++++++++++++----
 4 files changed, 146 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/extcon/devres.c b/drivers/extcon/devres.c
index b40eb1805927..186fd735eb28 100644
--- a/drivers/extcon/devres.c
+++ b/drivers/extcon/devres.c
@@ -50,6 +50,13 @@ static void devm_extcon_dev_notifier_unreg(struct device *dev, void *res)
 	extcon_unregister_notifier(this->edev, this->id, this->nb);
 }
 
+static void devm_extcon_dev_notifier_all_unreg(struct device *dev, void *res)
+{
+	struct extcon_dev_notifier_devres *this = res;
+
+	extcon_unregister_notifier_all(this->edev, this->nb);
+}
+
 /**
  * devm_extcon_dev_allocate - Allocate managed extcon device
  * @dev:		device owning the extcon device being created
@@ -214,3 +221,57 @@ void devm_extcon_unregister_notifier(struct device *dev,
 			       devm_extcon_dev_match, edev));
 }
 EXPORT_SYMBOL(devm_extcon_unregister_notifier);
+
+/**
+ * devm_extcon_register_notifier_all()
+ *		- Resource-managed extcon_register_notifier_all()
+ * @dev:	device to allocate extcon device
+ * @edev:	the extcon device that has the external connecotr.
+ * @nb:		a notifier block to be registered.
+ *
+ * This function manages automatically the notifier of extcon device using
+ * device resource management and simplify the control of unregistering
+ * the notifier of extcon device. To get more information, refer that function.
+ *
+ * Returns 0 if success or negaive error number if failure.
+ */
+int devm_extcon_register_notifier_all(struct device *dev, struct extcon_dev *edev,
+				struct notifier_block *nb)
+{
+	struct extcon_dev_notifier_devres *ptr;
+	int ret;
+
+	ptr = devres_alloc(devm_extcon_dev_notifier_all_unreg, sizeof(*ptr),
+				GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	ret = extcon_register_notifier_all(edev, nb);
+	if (ret) {
+		devres_free(ptr);
+		return ret;
+	}
+
+	ptr->edev = edev;
+	ptr->nb = nb;
+	devres_add(dev, ptr);
+
+	return 0;
+}
+EXPORT_SYMBOL(devm_extcon_register_notifier_all);
+
+/**
+ * devm_extcon_unregister_notifier_all()
+ *		- Resource-managed extcon_unregister_notifier_all()
+ * @dev:	device to allocate extcon device
+ * @edev:	the extcon device that has the external connecotr.
+ * @nb:		a notifier block to be registered.
+ */
+void devm_extcon_unregister_notifier_all(struct device *dev,
+				struct extcon_dev *edev,
+				struct notifier_block *nb)
+{
+	WARN_ON(devres_release(dev, devm_extcon_dev_notifier_all_unreg,
+			       devm_extcon_dev_match, edev));
+}
+EXPORT_SYMBOL(devm_extcon_unregister_notifier_all);
diff --git a/drivers/extcon/extcon.c b/drivers/extcon/extcon.c
index 09ac5e70c2f3..e7750545469f 100644
--- a/drivers/extcon/extcon.c
+++ b/drivers/extcon/extcon.c
@@ -448,8 +448,19 @@ int extcon_sync(struct extcon_dev *edev, unsigned int id)
 	spin_lock_irqsave(&edev->lock, flags);
 
 	state = !!(edev->state & BIT(index));
+
+	/*
+	 * Call functions in a raw notifier chain for the specific one
+	 * external connector.
+	 */
 	raw_notifier_call_chain(&edev->nh[index], state, edev);
 
+	/*
+	 * Call functions in a raw notifier chain for the all supported
+	 * external connectors.
+	 */
+	raw_notifier_call_chain(&edev->nh_all, state, edev);
+
 	/* This could be in interrupt handler */
 	prop_buf = (char *)get_zeroed_page(GFP_ATOMIC);
 	if (!prop_buf) {
@@ -954,6 +965,59 @@ int extcon_unregister_notifier(struct extcon_dev *edev, unsigned int id,
 }
 EXPORT_SYMBOL_GPL(extcon_unregister_notifier);
 
+/**
+ * extcon_register_notifier_all() - Register a notifier block for all connectors
+ * @edev:	the extcon device that has the external connecotr.
+ * @nb:		a notifier block to be registered.
+ *
+ * This fucntion registers a notifier block in order to receive the state
+ * change of all supported external connectors from extcon device.
+ * And The second parameter given to the callback of nb (val) is
+ * the current state and third parameter is the edev pointer.
+ *
+ * Returns 0 if success or error number if fail
+ */
+int extcon_register_notifier_all(struct extcon_dev *edev,
+				struct notifier_block *nb)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!edev || !nb)
+		return -EINVAL;
+
+	spin_lock_irqsave(&edev->lock, flags);
+	ret = raw_notifier_chain_register(&edev->nh_all, nb);
+	spin_unlock_irqrestore(&edev->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(extcon_register_notifier_all);
+
+/**
+ * extcon_unregister_notifier_all() - Unregister a notifier block from extcon.
+ * @edev:	the extcon device that has the external connecotr.
+ * @nb:		a notifier block to be registered.
+ *
+ * Returns 0 if success or error number if fail
+ */
+int extcon_unregister_notifier_all(struct extcon_dev *edev,
+				struct notifier_block *nb)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!edev || !nb)
+		return -EINVAL;
+
+	spin_lock_irqsave(&edev->lock, flags);
+	ret = raw_notifier_chain_unregister(&edev->nh_all, nb);
+	spin_unlock_irqrestore(&edev->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(extcon_unregister_notifier_all);
+
 static struct attribute *extcon_attrs[] = {
 	&dev_attr_state.attr,
 	&dev_attr_name.attr,
@@ -1212,6 +1276,8 @@ int extcon_dev_register(struct extcon_dev *edev)
 	for (index = 0; index < edev->max_supported; index++)
 		RAW_INIT_NOTIFIER_HEAD(&edev->nh[index]);
 
+	RAW_INIT_NOTIFIER_HEAD(&edev->nh_all);
+
 	dev_set_drvdata(&edev->dev, edev);
 	edev->state = 0;
 
diff --git a/drivers/extcon/extcon.h b/drivers/extcon/extcon.h
index 993ddccafe11..dddddcfa0587 100644
--- a/drivers/extcon/extcon.h
+++ b/drivers/extcon/extcon.h
@@ -21,6 +21,8 @@
  * @dev:		Device of this extcon.
  * @state:		Attach/detach state of this extcon. Do not provide at
  *			register-time.
+ * @nh_all:		Notifier for the state change events for all supported
+ *			external connectors from this extcon.
  * @nh:			Notifier for the state change events from this extcon
  * @entry:		To support list of extcon devices so that users can
  *			search for extcon devices based on the extcon name.
@@ -43,6 +45,7 @@ struct extcon_dev {
 
 	/* Internal data. Please do not set. */
 	struct device dev;
+	struct raw_notifier_head nh_all;
 	struct raw_notifier_head *nh;
 	struct list_head entry;
 	int max_supported;
diff --git a/include/linux/extcon.h b/include/linux/extcon.h
index 7010fb01a81a..7e206a9f88db 100644
--- a/include/linux/extcon.h
+++ b/include/linux/extcon.h
@@ -236,11 +236,11 @@ extern int extcon_set_property_capability(struct extcon_dev *edev,
 				unsigned int id, unsigned int prop);
 
 /*
- * Following APIs are to monitor every action of a notifier.
- * Registrar gets notified for every external port of a connection device.
- * Probably this could be used to debug an action of notifier; however,
- * we do not recommend to use this for normal 'notifiee' device drivers who
- * want to be notified by a specific external port of the notifier.
+ * Following APIs are to monitor the status change of the external connectors.
+ * extcon_register_notifier(*edev, id, *nb) : Register a notifier block
+ *			for specific external connector of the extcon.
+ * extcon_register_notifier_all(*edev, *nb) : Register a notifier block
+ *			for all supported external connectors of the extcon.
  */
 extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id,
 				    struct notifier_block *nb);
@@ -253,6 +253,17 @@ extern void devm_extcon_unregister_notifier(struct device *dev,
 				struct extcon_dev *edev, unsigned int id,
 				struct notifier_block *nb);
 
+extern int extcon_register_notifier_all(struct extcon_dev *edev,
+				struct notifier_block *nb);
+extern int extcon_unregister_notifier_all(struct extcon_dev *edev,
+				struct notifier_block *nb);
+extern int devm_extcon_register_notifier_all(struct device *dev,
+				struct extcon_dev *edev,
+				struct notifier_block *nb);
+extern void devm_extcon_unregister_notifier_all(struct device *dev,
+				struct extcon_dev *edev,
+				struct notifier_block *nb);
+
 /*
  * Following API get the extcon device from devicetree.
  * This function use phandle of devicetree to get extcon device directly.
-- 
cgit v1.3-6-gb490


From e96a7705e7d3fef96aec9b590c63b2f6f7d2ba22 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Thu, 23 Mar 2017 15:56:08 +0100
Subject: sched/rtmutex/deadline: Fix a PI crash for deadline tasks

A crash happened while I was playing with deadline PI rtmutex.

    BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
    IP: [<ffffffff810eeb8f>] rt_mutex_get_top_task+0x1f/0x30
    PGD 232a75067 PUD 230947067 PMD 0
    Oops: 0000 [#1] SMP
    CPU: 1 PID: 10994 Comm: a.out Not tainted

    Call Trace:
    [<ffffffff810b658c>] enqueue_task+0x2c/0x80
    [<ffffffff810ba763>] activate_task+0x23/0x30
    [<ffffffff810d0ab5>] pull_dl_task+0x1d5/0x260
    [<ffffffff810d0be6>] pre_schedule_dl+0x16/0x20
    [<ffffffff8164e783>] __schedule+0xd3/0x900
    [<ffffffff8164efd9>] schedule+0x29/0x70
    [<ffffffff8165035b>] __rt_mutex_slowlock+0x4b/0xc0
    [<ffffffff81650501>] rt_mutex_slowlock+0xd1/0x190
    [<ffffffff810eeb33>] rt_mutex_timed_lock+0x53/0x60
    [<ffffffff810ecbfc>] futex_lock_pi.isra.18+0x28c/0x390
    [<ffffffff810ed8b0>] do_futex+0x190/0x5b0
    [<ffffffff810edd50>] SyS_futex+0x80/0x180

This is because rt_mutex_enqueue_pi() and rt_mutex_dequeue_pi()
are only protected by pi_lock when operating pi waiters, while
rt_mutex_get_top_task(), will access them with rq lock held but
not holding pi_lock.

In order to tackle it, we introduce new "pi_top_task" pointer
cached in task_struct, and add new rt_mutex_update_top_task()
to update its value, it can be called by rt_mutex_setprio()
which held both owner's pi_lock and rq lock. Thus "pi_top_task"
can be safely accessed by enqueue_task_dl() under rq lock.

Originally-From: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.157682758@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/init_task.h |  1 +
 include/linux/sched.h     |  2 ++
 include/linux/sched/rt.h  |  1 +
 kernel/fork.c             |  1 +
 kernel/locking/rtmutex.c  | 29 +++++++++++++++++++++--------
 kernel/sched/core.c       |  2 ++
 6 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 91d9049f0039..2c487e0879d5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -181,6 +181,7 @@ extern struct cred init_cred;
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
 	.pi_waiters = RB_ROOT,						\
+	.pi_top_task = NULL,						\
 	.pi_waiters_leftmost = NULL,
 #else
 # define INIT_RT_MUTEXES(tsk)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee84fd43..1ea2eee7bc4f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -775,6 +775,8 @@ struct task_struct {
 	/* PI waiters blocked on a rt_mutex held by this task: */
 	struct rb_root			pi_waiters;
 	struct rb_node			*pi_waiters_leftmost;
+	/* Updated under owner's pi_lock and rq lock */
+	struct task_struct		*pi_top_task;
 	/* Deadlock detection and priority inheritance handling: */
 	struct rt_mutex_waiter		*pi_blocked_on;
 #endif
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 3bd668414f61..10ee7eeb0ee2 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -21,6 +21,7 @@ static inline int rt_task(struct task_struct *p)
 extern int rt_mutex_getprio(struct task_struct *p);
 extern void rt_mutex_setprio(struct task_struct *p, int prio);
 extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
+extern void rt_mutex_update_top_task(struct task_struct *p);
 extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
diff --git a/kernel/fork.c b/kernel/fork.c
index 6c463c80e93d..b30196a00b0d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,6 +1438,7 @@ static void rt_mutex_init_task(struct task_struct *p)
 #ifdef CONFIG_RT_MUTEXES
 	p->pi_waiters = RB_ROOT;
 	p->pi_waiters_leftmost = NULL;
+	p->pi_top_task = NULL;
 	p->pi_blocked_on = NULL;
 #endif
 }
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 71ecf0624410..bc05b104eaed 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -322,6 +322,19 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 	RB_CLEAR_NODE(&waiter->pi_tree_entry);
 }
 
+/*
+ * Must hold both p->pi_lock and task_rq(p)->lock.
+ */
+void rt_mutex_update_top_task(struct task_struct *p)
+{
+	if (!task_has_pi_waiters(p)) {
+		p->pi_top_task = NULL;
+		return;
+	}
+
+	p->pi_top_task = task_top_pi_waiter(p)->task;
+}
+
 /*
  * Calculate task priority from the waiter tree priority
  *
@@ -337,12 +350,12 @@ int rt_mutex_getprio(struct task_struct *task)
 		   task->normal_prio);
 }
 
+/*
+ * Must hold either p->pi_lock or task_rq(p)->lock.
+ */
 struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
-	if (likely(!task_has_pi_waiters(task)))
-		return NULL;
-
-	return task_top_pi_waiter(task)->task;
+	return task->pi_top_task;
 }
 
 /*
@@ -351,12 +364,12 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
  */
 int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
 {
-	if (!task_has_pi_waiters(task))
+	struct task_struct *top_task = rt_mutex_get_top_task(task);
+
+	if (!top_task)
 		return newprio;
 
-	if (task_top_pi_waiter(task)->task->prio <= newprio)
-		return task_top_pi_waiter(task)->task->prio;
-	return newprio;
+	return min(top_task->prio, newprio);
 }
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ab9f6ac099a7..e1f44ec701c8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3713,6 +3713,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		goto out_unlock;
 	}
 
+	rt_mutex_update_top_task(p);
+
 	trace_sched_pi_setprio(p, prio);
 	oldprio = p->prio;
 
-- 
cgit v1.3-6-gb490


From acd58620e415aee4a43a808d7d2fd87259ee0001 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Mar 2017 15:56:11 +0100
Subject: sched/rtmutex: Refactor rt_mutex_setprio()

With the introduction of SCHED_DEADLINE the whole notion that priority
is a single number is gone, therefore the @prio argument to
rt_mutex_setprio() doesn't make sense anymore.

So rework the code to pass a pi_task instead.

Note this also fixes a problem with pi_top_task caching; previously we
would not set the pointer (call rt_mutex_update_top_task) if the
priority didn't change, this could lead to a stale pointer.

As for the XXX, I think its fine to use pi_task->prio, because if it
differs from waiter->prio, a PI chain update is immenent.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170323150216.303827095@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched/rt.h |  24 ++++------
 kernel/locking/rtmutex.c | 112 +++++++++++++----------------------------------
 kernel/sched/core.c      |  66 ++++++++++++++++++++++------
 3 files changed, 91 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 10ee7eeb0ee2..f93329aba31a 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,28 +18,20 @@ static inline int rt_task(struct task_struct *p)
 }
 
 #ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern void rt_mutex_update_top_task(struct task_struct *p);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+/*
+ * Must hold either p->pi_lock or task_rq(p)->lock.
+ */
+static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
+{
+	return p->pi_top_task;
+}
+extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 {
 	return tsk->pi_blocked_on != NULL;
 }
 #else
-static inline int rt_mutex_getprio(struct task_struct *p)
-{
-	return p->normal_prio;
-}
-
-static inline int rt_mutex_get_effective_prio(struct task_struct *task,
-					      int newprio)
-{
-	return newprio;
-}
-
 static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 {
 	return NULL;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 4b1015ef0dc7..00b49cdbb4e0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -322,67 +322,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter)
 	RB_CLEAR_NODE(&waiter->pi_tree_entry);
 }
 
-/*
- * Must hold both p->pi_lock and task_rq(p)->lock.
- */
-void rt_mutex_update_top_task(struct task_struct *p)
-{
-	if (!task_has_pi_waiters(p)) {
-		p->pi_top_task = NULL;
-		return;
-	}
-
-	p->pi_top_task = task_top_pi_waiter(p)->task;
-}
-
-/*
- * Calculate task priority from the waiter tree priority
- *
- * Return task->normal_prio when the waiter tree is empty or when
- * the waiter is not allowed to do priority boosting
- */
-int rt_mutex_getprio(struct task_struct *task)
-{
-	if (likely(!task_has_pi_waiters(task)))
-		return task->normal_prio;
-
-	return min(task_top_pi_waiter(task)->prio,
-		   task->normal_prio);
-}
-
-/*
- * Must hold either p->pi_lock or task_rq(p)->lock.
- */
-struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
-{
-	return task->pi_top_task;
-}
-
-/*
- * Called by sched_setscheduler() to get the priority which will be
- * effective after the change.
- */
-int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
+static void rt_mutex_adjust_prio(struct task_struct *p)
 {
-	struct task_struct *top_task = rt_mutex_get_top_task(task);
+	struct task_struct *pi_task = NULL;
 
-	if (!top_task)
-		return newprio;
+	lockdep_assert_held(&p->pi_lock);
 
-	return min(top_task->prio, newprio);
-}
+	if (task_has_pi_waiters(p))
+		pi_task = task_top_pi_waiter(p)->task;
 
-/*
- * Adjust the priority of a task, after its pi_waiters got modified.
- *
- * This can be both boosting and unboosting. task->pi_lock must be held.
- */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
-{
-	int prio = rt_mutex_getprio(task);
-
-	if (task->prio != prio || dl_prio(prio))
-		rt_mutex_setprio(task, prio);
+	rt_mutex_setprio(p, pi_task);
 }
 
 /*
@@ -742,7 +691,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		 */
 		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
 		rt_mutex_enqueue_pi(task, waiter);
-		__rt_mutex_adjust_prio(task);
+		rt_mutex_adjust_prio(task);
 
 	} else if (prerequeue_top_waiter == waiter) {
 		/*
@@ -758,7 +707,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
 		rt_mutex_dequeue_pi(task, waiter);
 		waiter = rt_mutex_top_waiter(lock);
 		rt_mutex_enqueue_pi(task, waiter);
-		__rt_mutex_adjust_prio(task);
+		rt_mutex_adjust_prio(task);
 	} else {
 		/*
 		 * Nothing changed. No need to do any priority
@@ -966,7 +915,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		return -EDEADLK;
 
 	raw_spin_lock(&task->pi_lock);
-	__rt_mutex_adjust_prio(task);
+	rt_mutex_adjust_prio(task);
 	waiter->task = task;
 	waiter->lock = lock;
 	waiter->prio = task->prio;
@@ -988,7 +937,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 		rt_mutex_dequeue_pi(owner, top_waiter);
 		rt_mutex_enqueue_pi(owner, waiter);
 
-		__rt_mutex_adjust_prio(owner);
+		rt_mutex_adjust_prio(owner);
 		if (owner->pi_blocked_on)
 			chain_walk = 1;
 	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
@@ -1040,13 +989,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
 	waiter = rt_mutex_top_waiter(lock);
 
 	/*
-	 * Remove it from current->pi_waiters. We do not adjust a
-	 * possible priority boost right now. We execute wakeup in the
-	 * boosted mode and go back to normal after releasing
-	 * lock->wait_lock.
+	 * Remove it from current->pi_waiters and deboost.
+	 *
+	 * We must in fact deboost here in order to ensure we call
+	 * rt_mutex_setprio() to update p->pi_top_task before the
+	 * task unblocks.
 	 */
 	rt_mutex_dequeue_pi(current, waiter);
-	__rt_mutex_adjust_prio(current);
+	rt_mutex_adjust_prio(current);
 
 	/*
 	 * As we are waking up the top waiter, and the waiter stays
@@ -1058,9 +1008,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
 	 */
 	lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
 
-	raw_spin_unlock(&current->pi_lock);
-
+	/*
+	 * We deboosted before waking the top waiter task such that we don't
+	 * run two tasks with the 'same' priority (and ensure the
+	 * p->pi_top_task pointer points to a blocked task). This however can
+	 * lead to priority inversion if we would get preempted after the
+	 * deboost but before waking our donor task, hence the preempt_disable()
+	 * before unlock.
+	 *
+	 * Pairs with preempt_enable() in rt_mutex_postunlock();
+	 */
+	preempt_disable();
 	wake_q_add(wake_q, waiter->task);
+	raw_spin_unlock(&current->pi_lock);
 }
 
 /*
@@ -1095,7 +1055,7 @@ static void remove_waiter(struct rt_mutex *lock,
 	if (rt_mutex_has_waiters(lock))
 		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
 
-	__rt_mutex_adjust_prio(owner);
+	rt_mutex_adjust_prio(owner);
 
 	/* Store the lock on which owner is blocked or NULL */
 	next_lock = task_blocked_on_lock(owner);
@@ -1134,8 +1094,7 @@ void rt_mutex_adjust_pi(struct task_struct *task)
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
 
 	waiter = task->pi_blocked_on;
-	if (!waiter || (waiter->prio == task->prio &&
-			!dl_prio(task->prio))) {
+	if (!waiter || (waiter->prio == task->prio && !dl_prio(task->prio))) {
 		raw_spin_unlock_irqrestore(&task->pi_lock, flags);
 		return;
 	}
@@ -1389,17 +1348,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
 	 * Queue the next waiter for wakeup once we release the wait_lock.
 	 */
 	mark_wakeup_next_waiter(wake_q, lock);
-
-	/*
-	 * We should deboost before waking the top waiter task such that
-	 * we don't run two tasks with the 'same' priority. This however
-	 * can lead to prio-inversion if we would get preempted after
-	 * the deboost but before waking our high-prio task, hence the
-	 * preempt_disable before unlock. Pairs with preempt_enable() in
-	 * rt_mutex_postunlock();
-	 */
-	preempt_disable();
-
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
 	return true; /* call rt_mutex_postunlock() */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e1f44ec701c8..37b152a782ae 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function);
 
 #ifdef CONFIG_RT_MUTEXES
 
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+	if (pi_task)
+		prio = min(prio, pi_task->prio);
+
+	return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+	struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+	return __rt_effective_prio(pi_task, prio);
+}
+
 /*
  * rt_mutex_setprio - set the current priority of a task
- * @p: task
- * @prio: prio value (kernel-internal form)
+ * @p: task to boost
+ * @pi_task: donor task
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
@@ -3682,18 +3697,42 @@ EXPORT_SYMBOL(default_wake_function);
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
-void rt_mutex_setprio(struct task_struct *p, int prio)
+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
-	int oldprio, queued, running, queue_flag =
+	int prio, oldprio, queued, running, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 	const struct sched_class *prev_class;
 	struct rq_flags rf;
 	struct rq *rq;
 
-	BUG_ON(prio > MAX_PRIO);
+	/* XXX used to be waiter->prio, not waiter->task->prio */
+	prio = __rt_effective_prio(pi_task, p->normal_prio);
+
+	/*
+	 * If nothing changed; bail early.
+	 */
+	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
+		return;
 
 	rq = __task_rq_lock(p, &rf);
 	update_rq_clock(rq);
+	/*
+	 * Set under pi_lock && rq->lock, such that the value can be used under
+	 * either lock.
+	 *
+	 * Note that there is loads of tricky to make this pointer cache work
+	 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
+	 * ensure a task is de-boosted (pi_task is set to NULL) before the
+	 * task is allowed to run again (and can exit). This ensures the pointer
+	 * points to a blocked task -- which guaratees the task is present.
+	 */
+	p->pi_top_task = pi_task;
+
+	/*
+	 * For FIFO/RR we only need to set prio, if that matches we're done.
+	 */
+	if (prio == p->prio && !dl_prio(prio))
+		goto out_unlock;
 
 	/*
 	 * Idle task boosting is a nono in general. There is one
@@ -3713,9 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 		goto out_unlock;
 	}
 
-	rt_mutex_update_top_task(p);
-
-	trace_sched_pi_setprio(p, prio);
+	trace_sched_pi_setprio(p, prio); /* broken */
 	oldprio = p->prio;
 
 	if (oldprio == prio)
@@ -3739,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	 *          running task
 	 */
 	if (dl_prio(prio)) {
-		struct task_struct *pi_task = rt_mutex_get_top_task(p);
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
@@ -3777,6 +3813,11 @@ out_unlock:
 	balance_callback(rq);
 	preempt_enable();
 }
+#else
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+	return prio;
+}
 #endif
 
 void set_user_nice(struct task_struct *p, long nice)
@@ -4023,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
 	 * Keep a potential priority boosting if called from
 	 * sched_setscheduler().
 	 */
+	p->prio = normal_prio(p);
 	if (keep_boost)
-		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
-	else
-		p->prio = normal_prio(p);
+		p->prio = rt_effective_prio(p, p->prio);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -4313,7 +4353,7 @@ change:
 		 * the runqueue. This will be done when the task deboost
 		 * itself.
 		 */
-		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		new_effective_prio = rt_effective_prio(p, newprio);
 		if (new_effective_prio == oldprio)
 			queue_flags &= ~DEQUEUE_MOVE;
 	}
-- 
cgit v1.3-6-gb490


From b1a951fe469eb51646bf2e6d2c5f4a19fe1d4627 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Sun, 5 Feb 2017 21:47:22 +0200
Subject: net/utils: generic inet_pton_with_scope helper

Several locations in the stack need to handle ipv4/ipv6
(with scope) and port strings conversion to sockaddr.
Add a helper that takes either AF_INET, AF_INET6 or
AF_UNSPEC (for wildcard) to centralize this handling.

Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/inet.h |   6 +++
 net/core/utils.c     | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/inet.h b/include/linux/inet.h
index 4cca05c9678e..636ebe87e6f8 100644
--- a/include/linux/inet.h
+++ b/include/linux/inet.h
@@ -43,6 +43,8 @@
 #define _LINUX_INET_H
 
 #include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/socket.h>
 
 /*
  * These mimic similar macros defined in user-space for inet_ntop(3).
@@ -54,4 +56,8 @@
 extern __be32 in_aton(const char *str);
 extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
 extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
+
+extern int inet_pton_with_scope(struct net *net, unsigned short af,
+		const char *src, const char *port, struct sockaddr_storage *addr);
+
 #endif	/* _LINUX_INET_H */
diff --git a/net/core/utils.c b/net/core/utils.c
index 6592d7bbed39..f96cf527bb8f 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -26,9 +26,11 @@
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/ratelimit.h>
+#include <linux/socket.h>
 
 #include <net/sock.h>
 #include <net/net_ratelimit.h>
+#include <net/ipv6.h>
 
 #include <asm/byteorder.h>
 #include <linux/uaccess.h>
@@ -300,6 +302,107 @@ out:
 }
 EXPORT_SYMBOL(in6_pton);
 
+static int inet4_pton(const char *src, u16 port_num,
+		struct sockaddr_storage *addr)
+{
+	struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+	int srclen = strlen(src);
+
+	if (srclen > INET_ADDRSTRLEN)
+		return -EINVAL;
+
+	if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr,
+		     '\n', NULL) == 0)
+		return -EINVAL;
+
+	addr4->sin_family = AF_INET;
+	addr4->sin_port = htons(port_num);
+
+	return 0;
+}
+
+static int inet6_pton(struct net *net, const char *src, u16 port_num,
+		struct sockaddr_storage *addr)
+{
+	struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+	const char *scope_delim;
+	int srclen = strlen(src);
+
+	if (srclen > INET6_ADDRSTRLEN)
+		return -EINVAL;
+
+	if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr,
+		     '%', &scope_delim) == 0)
+		return -EINVAL;
+
+	if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL &&
+	    src + srclen != scope_delim && *scope_delim == '%') {
+		struct net_device *dev;
+		char scope_id[16];
+		size_t scope_len = min_t(size_t, sizeof(scope_id),
+					 src + srclen - scope_delim - 1);
+
+		memcpy(scope_id, scope_delim + 1, scope_len);
+		scope_id[scope_len] = '\0';
+
+		dev = dev_get_by_name(net, scope_id);
+		if (dev) {
+			addr6->sin6_scope_id = dev->ifindex;
+			dev_put(dev);
+		} else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) {
+			return -EINVAL;
+		}
+	}
+
+	addr6->sin6_family = AF_INET6;
+	addr6->sin6_port = htons(port_num);
+
+	return 0;
+}
+
+/**
+ * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address
+ * @net: net namespace (used for scope handling)
+ * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either
+ * @src: the start of the address string
+ * @port: the start of the port string (or NULL for none)
+ * @addr: output socket address
+ *
+ * Return zero on success, return errno when any error occurs.
+ */
+int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
+		const char *src, const char *port, struct sockaddr_storage *addr)
+{
+	u16 port_num;
+	int ret = -EINVAL;
+
+	if (port) {
+		if (kstrtou16(port, 0, &port_num))
+			return -EINVAL;
+	} else {
+		port_num = 0;
+	}
+
+	switch (af) {
+	case AF_INET:
+		ret = inet4_pton(src, port_num, addr);
+		break;
+	case AF_INET6:
+		ret = inet6_pton(net, src, port_num, addr);
+		break;
+	case AF_UNSPEC:
+		ret = inet4_pton(src, port_num, addr);
+		if (ret)
+			ret = inet6_pton(net, src, port_num, addr);
+		break;
+	default:
+		pr_err("unexpected address family %d\n", af);
+	};
+
+	return ret;
+}
+EXPORT_SYMBOL(inet_pton_with_scope);
+
 void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
 			      __be32 from, __be32 to, bool pseudohdr)
 {
-- 
cgit v1.3-6-gb490


From 0f222ccce359d21f927d07df2069e7029497b790 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Thu, 23 Mar 2017 20:41:22 -0700
Subject: nvme_fc: Sync FC-NVME header with standard

Update FC-NVME definitions to match FC-NVME r1.14 (16-020vB) plus
change voted in by 2/22 FC-NVME Adhoc (see HOSTID below).

Includes the following:
- Addition of "status_code" field to ERSP IU
- Addition of FC-NVME LS RJT reason_codes and reason_explanations
- CreateAssociation payload, HostID field shortened to 16 bytes

Signed-off-by: James Smart <james.smart@broadcom.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/nvme-fc.h | 68 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
index 4b45226bd604..e997c4a49a88 100644
--- a/include/linux/nvme-fc.h
+++ b/include/linux/nvme-fc.h
@@ -16,8 +16,7 @@
  */
 
 /*
- * This file contains definitions relative to FC-NVME r1.11 and a few
- * newer items
+ * This file contains definitions relative to FC-NVME r1.14 (16-020vB).
  */
 
 #ifndef _NVME_FC_H
@@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu {
 
 #define NVME_FC_SIZEOF_ZEROS_RSP	12
 
+enum {
+	FCNVME_SC_SUCCESS		= 0,
+	FCNVME_SC_INVALID_FIELD		= 1,
+	FCNVME_SC_INVALID_CONNID	= 2,
+};
+
 struct nvme_fc_ersp_iu {
-	__u8			rsvd0[2];
+	__u8			status_code;
+	__u8			rsvd1;
 	__be16			iu_len;
 	__be32			rsn;
 	__be32			xfrd_len;
@@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu {
 };
 
 
-/* FC-NVME r1.03/16-119v0 NVME Link Services */
+/* FC-NVME Link Services */
 enum {
 	FCNVME_LS_RSVD			= 0,
 	FCNVME_LS_RJT			= 1,
@@ -68,7 +74,7 @@ enum {
 	FCNVME_LS_DISCONNECT		= 5,
 };
 
-/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */
+/* FC-NVME Link Service Descriptors */
 enum {
 	FCNVME_LSDESC_RSVD		= 0x0,
 	FCNVME_LSDESC_RQST		= 0x1,
@@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz)
 	return cpu_to_be32(sz - (2 * sizeof(u32)));
 }
 
-
 struct fcnvme_ls_rqst_w0 {
 	u8	ls_cmd;			/* FCNVME_LS_xxx */
 	u8	zeros[3];
@@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst {
 	__be32	rsvd12;
 };
 
+/* FC-NVME LS RJT reason_code values */
+enum fcnvme_ls_rjt_reason {
+	FCNVME_RJT_RC_NONE		= 0,
+	/* no reason - not to be sent */
+
+	FCNVME_RJT_RC_INVAL		= 0x01,
+	/* invalid NVMe_LS command code */
+
+	FCNVME_RJT_RC_LOGIC		= 0x03,
+	/* logical error */
+
+	FCNVME_RJT_RC_UNAB		= 0x09,
+	/* unable to perform command request */
+
+	FCNVME_RJT_RC_UNSUP		= 0x0b,
+	/* command not supported */
+
+	FCNVME_RJT_RC_INPROG		= 0x0e,
+	/* command already in progress */
 
+	FCNVME_RJT_RC_INV_ASSOC		= 0x40,
+	/* Invalid Association ID*/
 
+	FCNVME_RJT_RC_INV_CONN		= 0x41,
+	/* Invalid Connection ID*/
+
+	FCNVME_RJT_RC_VENDOR		= 0xff,
+	/* vendor specific error */
+};
+
+/* FC-NVME LS RJT reason_explanation values */
+enum fcnvme_ls_rjt_explan {
+	FCNVME_RJT_EXP_NONE		= 0x00,
+	/* No additional explanation */
+
+	FCNVME_RJT_EXP_OXID_RXID	= 0x17,
+	/* invalid OX_ID-RX_ID combination */
+
+	FCNVME_RJT_EXP_INSUF_RES	= 0x29,
+	/* insufficient resources */
+
+	FCNVME_RJT_EXP_UNAB_DATA	= 0x2a,
+	/* unable to supply requested data */
+
+	FCNVME_RJT_EXP_INV_LEN		= 0x2d,
+	/* Invalid payload length */
+};
 
 /* FCNVME_LSDESC_RJT */
 struct fcnvme_lsdesc_rjt {
@@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt {
 	 * Reject reason and explanaction codes are generic
 	 * to ELs's from LS-3.
 	 */
-	u8	reason_code;
-	u8	reason_explanation;
+	u8	reason_code;		/* fcnvme_ls_rjt_reason */
+	u8	reason_explanation;	/* fcnvme_ls_rjt_explan */
 
 	u8	vendor;
 	__be32	rsvd12;
 };
 
 
-#define FCNVME_ASSOC_HOSTID_LEN		64
+#define FCNVME_ASSOC_HOSTID_LEN		16
 #define FCNVME_ASSOC_HOSTNQN_LEN	256
 #define FCNVME_ASSOC_SUBNQN_LEN		256
 
-- 
cgit v1.3-6-gb490


From 62eeacb0e04f2aff7099a7765f386bb7ba53d5e2 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Thu, 23 Mar 2017 20:41:27 -0700
Subject: nvme_fc: Clean up host fcpio done status handling

As Dan Carpenter pointed out: mixing 16-bit nvme status with 32-bit
error status from driver. Corrected comment on fcp request struct
status field, and converted done routine to explicitly set nvme status
codes for nvme status.

Signed-off-by: James Smart <james.smart@broadcom.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/nvme/host/fc.c         | 14 +++++++-------
 include/linux/nvme-fc-driver.h |  6 +++---
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index a02eeb69f85c..7a1d7ea5366e 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1147,7 +1147,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 	struct nvme_fc_ctrl *ctrl = op->ctrl;
 	struct nvme_fc_queue *queue = op->queue;
 	struct nvme_completion *cqe = &op->rsp_iu.cqe;
-	u16 status;
+	u16 status = NVME_SC_SUCCESS;
 
 	/*
 	 * WARNING:
@@ -1183,8 +1183,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 
 	if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
 		status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
-	else
-		status = freq->status;
+	else if (freq->status)
+		status = NVME_SC_FC_TRANSPORT_ERROR;
 
 	/*
 	 * For the linux implementation, if we have an unsuccesful
@@ -1212,7 +1212,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 		 */
 		if (freq->transferred_length !=
 			be32_to_cpu(op->cmd_iu.data_len)) {
-			status = -EIO;
+			status = NVME_SC_FC_TRANSPORT_ERROR;
 			goto done;
 		}
 		op->nreq.result.u64 = 0;
@@ -1229,7 +1229,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 					freq->transferred_length ||
 			     op->rsp_iu.status_code ||
 			     op->rqno != le16_to_cpu(cqe->command_id))) {
-			status = -EIO;
+			status = NVME_SC_FC_TRANSPORT_ERROR;
 			goto done;
 		}
 		op->nreq.result = cqe->result;
@@ -1237,7 +1237,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
 		break;
 
 	default:
-		status = -EIO;
+		status = NVME_SC_FC_TRANSPORT_ERROR;
 		goto done;
 	}
 
@@ -1763,7 +1763,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 	op->fcp_req.io_dir = io_dir;
 	op->fcp_req.transferred_length = 0;
 	op->fcp_req.rcv_rsplen = 0;
-	op->fcp_req.status = 0;
+	op->fcp_req.status = NVME_SC_SUCCESS;
 	op->fcp_req.sqid = cpu_to_le16(queue->qnum);
 
 	/*
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index f21471f7ee40..16eb264980c2 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir {
  *             transferred. Should equal payload_length on success.
  * @rcv_rsplen: length, in bytes, of the FCP RSP IU received.
  * @status:    Completion status of the FCP operation. must be 0 upon success,
- *             NVME_SC_FC_xxx value upon failure. Note: this is NOT a
- *             reflection of the NVME CQE completion status. Only the status
- *             of the FCP operation at the NVME-FC level.
+ *             negative errno value upon failure (ex: -EIO). Note: this is
+ *             NOT a reflection of the NVME CQE completion status. Only the
+ *             status of the FCP operation at the NVME-FC level.
  */
 struct nvmefc_fcp_req {
 	void			*cmdaddr;
-- 
cgit v1.3-6-gb490


From 6e7300cff1c410dde7ac4354b6a0a8cb0a561e54 Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhsharma@redhat.com>
Date: Tue, 4 Apr 2017 17:02:41 +0100
Subject: efi/bgrt: Enable ACPI BGRT handling on arm64

Now that the ACPI BGRT handling code has been made generic, we can
enable it for arm64.

Signed-off-by: Bhupesh Sharma <bhsharma@redhat.com>
[ Updated commit log to reflect that BGRT is only enabled for arm64, and added
  missing 'return' statement to the dummy acpi_parse_bgrt() function. ]
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-efi@vger.kernel.org
Link: http://lkml.kernel.org/r/20170404160245.27812-8-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm64/kernel/acpi.c    | 3 +++
 arch/x86/kernel/acpi/boot.c | 6 ------
 drivers/acpi/Kconfig        | 2 +-
 drivers/acpi/bgrt.c         | 6 ++++++
 include/linux/efi-bgrt.h    | 5 +++++
 5 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 64d9cbd61678..e25c11e727fe 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -18,6 +18,7 @@
 #include <linux/acpi.h>
 #include <linux/bootmem.h>
 #include <linux/cpumask.h>
+#include <linux/efi-bgrt.h>
 #include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
@@ -233,6 +234,8 @@ done:
 			early_init_dt_scan_chosen_stdout();
 	} else {
 		parse_spcr(earlycon_init_is_deferred);
+		if (IS_ENABLED(CONFIG_ACPI_BGRT))
+			acpi_table_parse(ACPI_SIG_BGRT, acpi_parse_bgrt);
 	}
 }
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index b2879cc23db4..70854988a963 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1564,12 +1564,6 @@ int __init early_acpi_boot_init(void)
 	return 0;
 }
 
-static int __init acpi_parse_bgrt(struct acpi_table_header *table)
-{
-	efi_bgrt_init(table);
-	return 0;
-}
-
 int __init acpi_boot_init(void)
 {
 	/* those are executed after early-quirks are executed */
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 83e5f7e1a20d..dad02c0f21b9 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -440,7 +440,7 @@ config ACPI_CUSTOM_METHOD
 
 config ACPI_BGRT
 	bool "Boottime Graphics Resource Table support"
-	depends on EFI && X86
+	depends on EFI && (X86 || ARM64)
         help
 	  This driver adds support for exposing the ACPI Boottime Graphics
 	  Resource Table, which allows the operating system to obtain
diff --git a/drivers/acpi/bgrt.c b/drivers/acpi/bgrt.c
index ca28aa572aa9..df1c629205e7 100644
--- a/drivers/acpi/bgrt.c
+++ b/drivers/acpi/bgrt.c
@@ -81,6 +81,12 @@ static struct attribute_group bgrt_attribute_group = {
 	.bin_attrs = bgrt_bin_attributes,
 };
 
+int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+	efi_bgrt_init(table);
+	return 0;
+}
+
 static int __init bgrt_init(void)
 {
 	int ret;
diff --git a/include/linux/efi-bgrt.h b/include/linux/efi-bgrt.h
index 2fd3993c370b..e6f624b53c3d 100644
--- a/include/linux/efi-bgrt.h
+++ b/include/linux/efi-bgrt.h
@@ -6,6 +6,7 @@
 #ifdef CONFIG_ACPI_BGRT
 
 void efi_bgrt_init(struct acpi_table_header *table);
+int __init acpi_parse_bgrt(struct acpi_table_header *table);
 
 /* The BGRT data itself; only valid if bgrt_image != NULL. */
 extern size_t bgrt_image_size;
@@ -14,6 +15,10 @@ extern struct acpi_table_bgrt bgrt_tab;
 #else /* !CONFIG_ACPI_BGRT */
 
 static inline void efi_bgrt_init(struct acpi_table_header *table) {}
+static inline int __init acpi_parse_bgrt(struct acpi_table_header *table)
+{
+	return 0;
+}
 
 #endif /* !CONFIG_ACPI_BGRT */
 
-- 
cgit v1.3-6-gb490


From 60f38de7a8d4e816100ceafd1b382df52527bd50 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 4 Apr 2017 17:09:08 +0100
Subject: efi/libstub: Unify command line param parsing

Merge the parsing of the command line carried out in arm-stub.c with
the handling in efi_parse_options(). Note that this also fixes the
missing handling of CONFIG_CMDLINE_FORCE=y, in which case the builtin
command line should supersede the one passed by the firmware.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bhe@redhat.com
Cc: bhsharma@redhat.com
Cc: bp@alien8.de
Cc: eugene@hp.com
Cc: evgeny.kalugin@intel.com
Cc: jhugo@codeaurora.org
Cc: leif.lindholm@linaro.org
Cc: linux-efi@vger.kernel.org
Cc: mark.rutland@arm.com
Cc: roy.franz@cavium.com
Cc: rruigrok@codeaurora.org
Link: http://lkml.kernel.org/r/20170404160910.28115-1-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/libstub/arm-stub.c        | 24 +++++++-----------------
 drivers/firmware/efi/libstub/arm64-stub.c      |  4 +---
 drivers/firmware/efi/libstub/efi-stub-helper.c | 19 +++++++++++--------
 drivers/firmware/efi/libstub/efistub.h         |  2 ++
 include/linux/efi.h                            |  2 +-
 5 files changed, 22 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index 02049ff25c6b..ac3222f6f805 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -18,8 +18,6 @@
 
 #include "efistub.h"
 
-bool __nokaslr;
-
 efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg,
 			     void *__image, void **__fh)
 {
@@ -153,18 +151,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 		goto fail;
 	}
 
-	/* check whether 'nokaslr' was passed on the command line */
-	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
-		static const u8 default_cmdline[] = CONFIG_CMDLINE;
-		const u8 *str, *cmdline = cmdline_ptr;
-
-		if (IS_ENABLED(CONFIG_CMDLINE_FORCE))
-			cmdline = default_cmdline;
-		str = strstr(cmdline, "nokaslr");
-		if (str == cmdline || (str > cmdline && *(str - 1) == ' '))
-			__nokaslr = true;
-	}
-
 	si = setup_graphics(sys_table);
 
 	status = handle_kernel_image(sys_table, image_addr, &image_size,
@@ -176,9 +162,13 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 		goto fail_free_cmdline;
 	}
 
-	status = efi_parse_options(cmdline_ptr);
-	if (status != EFI_SUCCESS)
-		pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n");
+	if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) ||
+	    IS_ENABLED(CONFIG_CMDLINE_FORCE) ||
+	    cmdline_size == 0)
+		efi_parse_options(CONFIG_CMDLINE);
+
+	if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0)
+		efi_parse_options(cmdline_ptr);
 
 	secure_boot = efi_get_secureboot(sys_table);
 
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c
index eae693eb3e91..b4c2589d7c91 100644
--- a/drivers/firmware/efi/libstub/arm64-stub.c
+++ b/drivers/firmware/efi/libstub/arm64-stub.c
@@ -16,8 +16,6 @@
 
 #include "efistub.h"
 
-extern bool __nokaslr;
-
 efi_status_t check_platform_features(efi_system_table_t *sys_table_arg)
 {
 	u64 tg;
@@ -52,7 +50,7 @@ efi_status_t handle_kernel_image(efi_system_table_t *sys_table_arg,
 	u64 phys_seed = 0;
 
 	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
-		if (!__nokaslr) {
+		if (!nokaslr()) {
 			status = efi_get_random_bytes(sys_table_arg,
 						      sizeof(phys_seed),
 						      (u8 *)&phys_seed);
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 3290fae0b38f..2e17d2b8787c 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -32,6 +32,13 @@
 
 static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE;
 
+static int __section(.data) __nokaslr;
+
+int __pure nokaslr(void)
+{
+	return __nokaslr;
+}
+
 #define EFI_MMAP_NR_SLACK_SLOTS	8
 
 struct file_info {
@@ -409,17 +416,13 @@ static efi_status_t efi_file_close(void *handle)
  * environments, first in the early boot environment of the EFI boot
  * stub, and subsequently during the kernel boot.
  */
-efi_status_t efi_parse_options(char *cmdline)
+efi_status_t efi_parse_options(char const *cmdline)
 {
 	char *str;
 
-	/*
-	 * Currently, the only efi= option we look for is 'nochunk', which
-	 * is intended to work around known issues on certain x86 UEFI
-	 * versions. So ignore for now on other architectures.
-	 */
-	if (!IS_ENABLED(CONFIG_X86))
-		return EFI_SUCCESS;
+	str = strstr(cmdline, "nokaslr");
+	if (str == cmdline || (str && str > cmdline && *(str - 1) == ' '))
+		__nokaslr = 1;
 
 	/*
 	 * If no EFI parameters were specified on the cmdline we've got
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index 71c4d0e3c4ed..a7a2a2c3f199 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -24,6 +24,8 @@
 #define EFI_ALLOC_ALIGN		EFI_PAGE_SIZE
 #endif
 
+extern int __pure nokaslr(void);
+
 void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
 
 efi_status_t efi_open_volume(efi_system_table_t *sys_table_arg, void *__image,
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 94d34e0be24f..e485e87615d1 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1471,7 +1471,7 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg,
 				  unsigned long *load_addr,
 				  unsigned long *load_size);
 
-efi_status_t efi_parse_options(char *cmdline);
+efi_status_t efi_parse_options(char const *cmdline);
 
 efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg,
 			   struct screen_info *si, efi_guid_t *proto,
-- 
cgit v1.3-6-gb490


From eeff7d634f4750306785be709ca444140c29b043 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Tue, 4 Apr 2017 17:09:09 +0100
Subject: efi/libstub/arm/arm64: Disable debug prints on 'quiet' cmdline arg

The EFI stub currently prints a number of diagnostic messages that do
not carry a lot of information. Since these prints are not controlled
by 'loglevel' or other command line parameters, and since they appear on
the EFI framebuffer as well (if enabled), it would be nice if we could
turn them off.

So let's add support for the 'quiet' command line parameter in the stub,
and disable the non-error prints if it is passed.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bhe@redhat.com
Cc: bhsharma@redhat.com
Cc: bp@alien8.de
Cc: eugene@hp.com
Cc: evgeny.kalugin@intel.com
Cc: jhugo@codeaurora.org
Cc: leif.lindholm@linaro.org
Cc: linux-efi@vger.kernel.org
Cc: roy.franz@cavium.com
Cc: rruigrok@codeaurora.org
Link: http://lkml.kernel.org/r/20170404160910.28115-2-ard.biesheuvel@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/firmware/efi/libstub/arm-stub.c        | 20 ++++++++++----------
 drivers/firmware/efi/libstub/arm32-stub.c      |  2 ++
 drivers/firmware/efi/libstub/efi-stub-helper.c |  9 +++++++++
 drivers/firmware/efi/libstub/efistub.h         |  7 +++++++
 drivers/firmware/efi/libstub/secureboot.c      |  2 ++
 include/linux/efi.h                            |  3 ---
 6 files changed, 30 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c
index ac3222f6f805..657bb72c9e0b 100644
--- a/drivers/firmware/efi/libstub/arm-stub.c
+++ b/drivers/firmware/efi/libstub/arm-stub.c
@@ -116,8 +116,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 	if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
 		goto fail;
 
-	pr_efi(sys_table, "Booting Linux Kernel...\n");
-
 	status = check_platform_features(sys_table);
 	if (status != EFI_SUCCESS)
 		goto fail;
@@ -151,6 +149,16 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 		goto fail;
 	}
 
+	if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) ||
+	    IS_ENABLED(CONFIG_CMDLINE_FORCE) ||
+	    cmdline_size == 0)
+		efi_parse_options(CONFIG_CMDLINE);
+
+	if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0)
+		efi_parse_options(cmdline_ptr);
+
+	pr_efi(sys_table, "Booting Linux Kernel...\n");
+
 	si = setup_graphics(sys_table);
 
 	status = handle_kernel_image(sys_table, image_addr, &image_size,
@@ -162,14 +170,6 @@ unsigned long efi_entry(void *handle, efi_system_table_t *sys_table,
 		goto fail_free_cmdline;
 	}
 
-	if (IS_ENABLED(CONFIG_CMDLINE_EXTEND) ||
-	    IS_ENABLED(CONFIG_CMDLINE_FORCE) ||
-	    cmdline_size == 0)
-		efi_parse_options(CONFIG_CMDLINE);
-
-	if (!IS_ENABLED(CONFIG_CMDLINE_FORCE) && cmdline_size > 0)
-		efi_parse_options(cmdline_ptr);
-
 	secure_boot = efi_get_secureboot(sys_table);
 
 	/*
diff --git a/drivers/firmware/efi/libstub/arm32-stub.c b/drivers/firmware/efi/libstub/arm32-stub.c
index 18a8b5eb55e7..becbda445913 100644
--- a/drivers/firmware/efi/libstub/arm32-stub.c
+++ b/drivers/firmware/efi/libstub/arm32-stub.c
@@ -9,6 +9,8 @@
 #include <linux/efi.h>
 #include <asm/efi.h>
 
+#include "efistub.h"
+
 efi_status_t check_platform_features(efi_system_table_t *sys_table_arg)
 {
 	int block;
diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c
index 2e17d2b8787c..b0184360efc6 100644
--- a/drivers/firmware/efi/libstub/efi-stub-helper.c
+++ b/drivers/firmware/efi/libstub/efi-stub-helper.c
@@ -33,11 +33,16 @@
 static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE;
 
 static int __section(.data) __nokaslr;
+static int __section(.data) __quiet;
 
 int __pure nokaslr(void)
 {
 	return __nokaslr;
 }
+int __pure is_quiet(void)
+{
+	return __quiet;
+}
 
 #define EFI_MMAP_NR_SLACK_SLOTS	8
 
@@ -424,6 +429,10 @@ efi_status_t efi_parse_options(char const *cmdline)
 	if (str == cmdline || (str && str > cmdline && *(str - 1) == ' '))
 		__nokaslr = 1;
 
+	str = strstr(cmdline, "quiet");
+	if (str == cmdline || (str && str > cmdline && *(str - 1) == ' '))
+		__quiet = 1;
+
 	/*
 	 * If no EFI parameters were specified on the cmdline we've got
 	 * nothing to do.
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index a7a2a2c3f199..83f268c05007 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -25,6 +25,13 @@
 #endif
 
 extern int __pure nokaslr(void);
+extern int __pure is_quiet(void);
+
+#define pr_efi(sys_table, msg)		do {				\
+	if (!is_quiet()) efi_printk(sys_table, "EFI stub: "msg);	\
+} while (0)
+
+#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg)
 
 void efi_char16_printk(efi_system_table_t *, efi_char16_t *);
 
diff --git a/drivers/firmware/efi/libstub/secureboot.c b/drivers/firmware/efi/libstub/secureboot.c
index 5da36e56b36a..8c34d50a4d80 100644
--- a/drivers/firmware/efi/libstub/secureboot.c
+++ b/drivers/firmware/efi/libstub/secureboot.c
@@ -12,6 +12,8 @@
 #include <linux/efi.h>
 #include <asm/efi.h>
 
+#include "efistub.h"
+
 /* BIOS variables */
 static const efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
 static const efi_char16_t const efi_SecureBoot_name[] = {
diff --git a/include/linux/efi.h b/include/linux/efi.h
index e485e87615d1..ec36f42a2add 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1435,9 +1435,6 @@ static inline int efi_runtime_map_copy(void *buf, size_t bufsz)
 
 /* prototypes shared between arch specific and generic stub code */
 
-#define pr_efi(sys_table, msg)     efi_printk(sys_table, "EFI stub: "msg)
-#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg)
-
 void efi_printk(efi_system_table_t *sys_table_arg, char *str);
 
 void efi_free(efi_system_table_t *sys_table_arg, unsigned long size,
-- 
cgit v1.3-6-gb490


From f2fbc9dd78970accd7649e8b87c7f00a0da0cdbc Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 5 Apr 2017 08:39:18 -0700
Subject: blk-mq: Remove blk_mq_queue_data.list

The block layer core sets blk_mq_queue_data.list but no block
drivers read that member. Hence remove it and also the code that
is used to set this member.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 17 -----------------
 include/linux/blk-mq.h |  1 -
 2 files changed, 18 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 061fc2cc88d3..f7cd3208bcdf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -984,16 +984,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
 	struct request *rq;
-	LIST_HEAD(driver_list);
-	struct list_head *dptr;
 	int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
 
-	/*
-	 * Start off with dptr being NULL, so we start the first request
-	 * immediately, even if we have more pending.
-	 */
-	dptr = NULL;
-
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
@@ -1026,7 +1018,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
-		bd.list = dptr;
 
 		/*
 		 * Flag last if we have no more requests, or if we have more
@@ -1062,13 +1053,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 
 		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
 			break;
-
-		/*
-		 * We've done the first request. If we have more than 1
-		 * left in the list, set dptr to defer issue.
-		 */
-		if (!dptr && list->next != list->prev)
-			dptr = &driver_list;
 	}
 
 	hctx->dispatched[queued_to_index(queued)]++;
@@ -1451,7 +1435,6 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
 	struct request_queue *q = rq->q;
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
-		.list = NULL,
 		.last = 1
 	};
 	struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ea2e9dcd3aef..bdea90d75274 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -81,7 +81,6 @@ struct blk_mq_tag_set {
 
 struct blk_mq_queue_data {
 	struct request *rq;
-	struct list_head *list;
 	bool last;
 };
 
-- 
cgit v1.3-6-gb490


From 64c7f1d1572cacadfc0a4ca5a937aeffa486de58 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Apr 2017 19:18:12 +0200
Subject: block, scsi: move the retries field to struct scsi_request

Instead of bloating the generic struct request with it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/scsi_ioctl.c                 | 8 ++++----
 drivers/scsi/osd/osd_initiator.c   | 2 +-
 drivers/scsi/osst.c                | 2 +-
 drivers/scsi/scsi_error.c          | 2 +-
 drivers/scsi/scsi_lib.c            | 4 ++--
 drivers/scsi/sg.c                  | 2 +-
 drivers/scsi/st.c                  | 2 +-
 drivers/target/target_core_pscsi.c | 2 +-
 include/linux/blkdev.h             | 1 -
 include/scsi/scsi_request.h        | 1 +
 10 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 2a2fc768b27a..82a43bb19967 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 		goto out_free_cdb;
 
 	bio = rq->bio;
-	rq->retries = 0;
+	req->retries = 0;
 
 	start_time = jiffies;
 
@@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		goto error;
 
 	/* default.  possible overriden later */
-	rq->retries = 5;
+	req->retries = 5;
 
 	switch (opcode) {
 	case SEND_DIAGNOSTIC:
 	case FORMAT_UNIT:
 		rq->timeout = FORMAT_UNIT_TIMEOUT;
-		rq->retries = 1;
+		req->retries = 1;
 		break;
 	case START_STOP:
 		rq->timeout = START_STOP_TIMEOUT;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 		break;
 	case READ_DEFECT_DATA:
 		rq->timeout = READ_DEFECT_DATA_TIMEOUT;
-		rq->retries = 1;
+		req->retries = 1;
 		break;
 	default:
 		rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 6903f03c88af..9d0727b2bdec 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1602,7 +1602,7 @@ static int _init_blk_request(struct osd_request *or,
 	req->rq_flags |= RQF_QUIET;
 
 	req->timeout = or->timeout;
-	req->retries = or->retries;
+	scsi_req(req)->retries = or->retries;
 
 	if (has_out) {
 		or->out.req = req;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index c47f4b349bac..41bc1d64bf86 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -414,7 +414,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
 	memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
 	memcpy(rq->cmd, cmd, rq->cmd_len);
 	req->timeout = timeout;
-	req->retries = retries;
+	rq->retries = retries;
 	req->end_io_data = SRpnt;
 
 	blk_execute_rq_nowait(req->q, NULL, req, 1, osst_end_async);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index f2cafae150bc..2db412dd4b44 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
 
 	req->rq_flags |= RQF_QUIET;
 	req->timeout = 10 * HZ;
-	req->retries = 5;
+	rq->retries = 5;
 
 	blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done);
 }
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c1519660824b..11972d1075f1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -256,7 +256,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 
 	rq->cmd_len = COMMAND_SIZE(cmd[0]);
 	memcpy(rq->cmd, cmd, rq->cmd_len);
-	req->retries = retries;
+	rq->retries = retries;
 	req->timeout = timeout;
 	req->cmd_flags |= flags;
 	req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT;
@@ -1177,7 +1177,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
 	cmd->cmd_len = scsi_req(req)->cmd_len;
 	cmd->cmnd = scsi_req(req)->cmd;
 	cmd->transfersize = blk_rq_bytes(req);
-	cmd->allowed = req->retries;
+	cmd->allowed = scsi_req(req)->retries;
 	return BLKPREP_OK;
 }
 
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 29b86505f796..b61cc3c512d3 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1716,7 +1716,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
 
 	srp->rq = rq;
 	rq->end_io_data = srp;
-	rq->retries = SG_DEFAULT_RETRIES;
+	req->retries = SG_DEFAULT_RETRIES;
 
 	if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
 		return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index e5ef78a6848e..5408643431bb 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -579,7 +579,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
 	memset(rq->cmd, 0, BLK_MAX_CDB);
 	memcpy(rq->cmd, cmd, rq->cmd_len);
 	req->timeout = timeout;
-	req->retries = retries;
+	rq->retries = retries;
 	req->end_io_data = SRpnt;
 
 	blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 94cda7991e80..c7fa372c527a 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1008,7 +1008,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
 		req->timeout = PS_TIMEOUT_DISK;
 	else
 		req->timeout = PS_TIMEOUT_OTHER;
-	req->retries = PS_RETRY;
+	scsi_req(req)->retries = PS_RETRY;
 
 	blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req,
 			(cmd->sam_task_attr == TCM_HEAD_TAG),
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a2dc6b390d48..ce6f9a6534c9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -224,7 +224,6 @@ struct request {
 	unsigned long deadline;
 	struct list_head timeout_list;
 	unsigned int timeout;
-	int retries;
 
 	/*
 	 * completion callback.
diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h
index ba0aeb980f7e..7c583a0f363a 100644
--- a/include/scsi/scsi_request.h
+++ b/include/scsi/scsi_request.h
@@ -11,6 +11,7 @@ struct scsi_request {
 	unsigned short	cmd_len;
 	unsigned int	sense_len;
 	unsigned int	resid_len;	/* residual count */
+	int		retries;
 	void		*sense;
 };
 
-- 
cgit v1.3-6-gb490


From 1dd5198b2df913aec9b77c14529f9ff1b6d33e30 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 5 Apr 2017 12:16:38 -0600
Subject: block: move timeout field in struct request to pack better

After commit 64c7f1d1572c, we went from 1 to 2 holes in my
test setup. If we move the timeout field a bit, we remove
both of those holes and shrink struct request by 8 bytes.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ce6f9a6534c9..3cf241b0814d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -215,6 +215,8 @@ struct request {
 
 	unsigned short ioprio;
 
+	unsigned int timeout;
+
 	void *special;		/* opaque pointer available for LLD use */
 
 	int errors;
@@ -223,7 +225,6 @@ struct request {
 
 	unsigned long deadline;
 	struct list_head timeout_list;
-	unsigned int timeout;
 
 	/*
 	 * completion callback.
-- 
cgit v1.3-6-gb490


From dbde775cdbf5e401b8739f30c87d1af12c0028db Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 7 Apr 2017 11:10:44 +1000
Subject: block: simple improvements for bio->flags

The comment for the 'flags' field of 'bio' mentions
"command" which is no longer stored there, and doesn't
mention the bvec pool number, which is.

BIO_RESET_BITS is set in such a way that it would need to be
updated if new bits were added, which is easy to miss.

BVEC_POOL_BITS is larger than needed.  The BVEC_POOL_IDX()
ranges from 0 to 6, so 3 bits are sufficient.

This patch make improvements in each of these areas.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 67bcf8a5326e..1ebbc289b642 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -33,7 +33,7 @@ struct bio {
 						 * top bits REQ_OP. Use
 						 * accessors.
 						 */
-	unsigned short		bi_flags;	/* status, command, etc */
+	unsigned short		bi_flags;	/* status, etc and bvec pool number */
 	unsigned short		bi_ioprio;
 
 	struct bvec_iter	bi_iter;
@@ -110,12 +110,7 @@ struct bio {
 #define BIO_REFFED	8	/* bio has elevated ->bi_cnt */
 #define BIO_THROTTLED	9	/* This bio has already been subjected to
 				 * throttling rules. Don't do it again. */
-
-/*
- * Flags starting here get preserved by bio_reset() - this includes
- * BVEC_POOL_IDX()
- */
-#define BIO_RESET_BITS	10
+/* See BVEC_POOL_OFFSET below before adding new flags */
 
 /*
  * We support 6 different bvec pools, the last one is magic in that it
@@ -125,13 +120,22 @@ struct bio {
 #define BVEC_POOL_MAX		(BVEC_POOL_NR - 1)
 
 /*
- * Top 4 bits of bio flags indicate the pool the bvecs came from.  We add
+ * Top 3 bits of bio flags indicate the pool the bvecs came from.  We add
  * 1 to the actual index so that 0 indicates that there are no bvecs to be
  * freed.
  */
-#define BVEC_POOL_BITS		(4)
+#define BVEC_POOL_BITS		(3)
 #define BVEC_POOL_OFFSET	(16 - BVEC_POOL_BITS)
 #define BVEC_POOL_IDX(bio)	((bio)->bi_flags >> BVEC_POOL_OFFSET)
+#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
+# error "BVEC_POOL_BITS is too small"
+#endif
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * only BVEC_POOL_IDX()
+ */
+#define BIO_RESET_BITS	BVEC_POOL_OFFSET
 
 /*
  * Operations and flags common to the bio and request structures.
-- 
cgit v1.3-6-gb490


From fbbaf700e7b163a0f1704b2d542ee28be11fce21 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 7 Apr 2017 09:40:52 -0600
Subject: block: trace completion of all bios.

Currently only dm and md/raid5 bios trigger
trace_block_bio_complete().  Now that we have bio_chain() and
bio_inc_remaining(), it is not possible, in general, for a driver to
know when the bio is really complete.  Only bio_endio() knows that.

So move the trace_block_bio_complete() call to bio_endio().

Now trace_block_bio_complete() pairs with trace_block_bio_queue().
Any bio for which a 'queue' event is traced, will subsequently
generate a 'complete' event.

There are a few cases where completion tracing is not wanted.
1/ If blk_update_request() has already generated a completion
   trace event at the 'request' level, there is no point generating
   one at the bio level too.  In this case the bi_sector and bi_size
   will have changed, so the bio level event would be wrong

2/ If the bio hasn't actually been queued yet, but is being aborted
   early, then a trace event could be confusing.  Some filesystems
   call bio_endio() but do not want tracing.

3/ The bio_integrity code interposes itself by replacing bi_end_io,
   then restoring it and calling bio_endio() again.  This would produce
   two identical trace events if left like that.

To handle these, we introduce a flag BIO_TRACE_COMPLETION and only
produce the trace event when this is set.
We address point 1 above by clearing the flag in blk_update_request().
We address point 2 above by only setting the flag when
generic_make_request() is called.
We address point 3 above by clearing the flag after generating a
completion event.

When bio_split() is used on a bio, particularly in blk_queue_split(),
there is an extra complication.  A new bio is split off the front, and
may be handle directly without going through generic_make_request().
The old bio, which has been advanced, is passed to
generic_make_request(), so it will trigger a trace event a second
time.
Probably the best result when a split happens is to see a single
'queue' event for the whole bio, then multiple 'complete' events - one
for each component.  To achieve this was can:
- copy the BIO_TRACE_COMPLETION flag to the new bio in bio_split()
- avoid generating a 'queue' event if BIO_TRACE_COMPLETION is already set.
This way, the split-off bio won't create a queue event, the original
won't either even if it re-submitted to generic_make_request(),
but both will produce completion events, each for their own range.

So if generic_make_request() is called (which generates a QUEUED
event), then bi_endio() will create a single COMPLETE event for each
range that the bio is split into, unless the driver has explicitly
requested it not to.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c               | 14 ++++++++++++++
 block/blk-core.c          | 10 +++++++++-
 drivers/md/dm.c           |  1 -
 drivers/md/raid5.c        |  2 --
 include/linux/blk_types.h |  2 ++
 5 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/bio.c b/block/bio.c
index f1857c0f0826..f4d207180266 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1826,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio)
  *   bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
  *   way to end I/O on a bio. No one should call bi_end_io() directly on a
  *   bio unless they own it and thus know that it has an end_io function.
+ *
+ *   bio_endio() can be called several times on a bio that has been chained
+ *   using bio_chain().  The ->bi_end_io() function will only be called the
+ *   last time.  At this point the BLK_TA_COMPLETE tracing event will be
+ *   generated if BIO_TRACE_COMPLETION is set.
  **/
 void bio_endio(struct bio *bio)
 {
@@ -1846,6 +1851,12 @@ again:
 		goto again;
 	}
 
+	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+		trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
+					 bio, bio->bi_error);
+		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+	}
+
 	blk_throtl_bio_endio(bio);
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio);
@@ -1885,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors,
 
 	bio_advance(bio, split->bi_iter.bi_size);
 
+	if (bio_flagged(bio, BIO_TRACE_COMPLETION))
+		bio_set_flag(bio, BIO_TRACE_COMPLETION);
+
 	return split;
 }
 EXPORT_SYMBOL(bio_split);
diff --git a/block/blk-core.c b/block/blk-core.c
index 316a5399fb15..8654aa0cef6d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1957,7 +1957,13 @@ generic_make_request_checks(struct bio *bio)
 	if (!blkcg_bio_issue_check(q, bio))
 		return false;
 
-	trace_block_bio_queue(q, bio);
+	if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+		trace_block_bio_queue(q, bio);
+		/* Now that enqueuing has been traced, we need to trace
+		 * completion as well.
+		 */
+		bio_set_flag(bio, BIO_TRACE_COMPLETION);
+	}
 	return true;
 
 not_supported:
@@ -2622,6 +2628,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		if (bio_bytes == bio->bi_iter.bi_size)
 			req->bio = bio->bi_next;
 
+		/* Completion has already been traced */
+		bio_clear_flag(bio, BIO_TRACE_COMPLETION);
 		req_bio_endio(req, bio, bio_bytes, error);
 
 		total_bytes += bio_bytes;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb75979e455..cd93a3b9ceca 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error)
 			queue_io(md, bio);
 		} else {
 			/* done with normal IO or empty flush */
-			trace_block_bio_complete(md->queue, bio, io_error);
 			bio->bi_error = io_error;
 			bio_endio(bio);
 		}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed5cd705b985..7aeb9691c2e1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5031,8 +5031,6 @@ static void raid5_align_endio(struct bio *bi)
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error) {
-		trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
-					 raid_bi, 0);
 		bio_endio(raid_bi);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_quiescent);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1ebbc289b642..72aa9519167e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -110,6 +110,8 @@ struct bio {
 #define BIO_REFFED	8	/* bio has elevated ->bi_cnt */
 #define BIO_THROTTLED	9	/* This bio has already been subjected to
 				 * throttling rules. Don't do it again. */
+#define BIO_TRACE_COMPLETION 10	/* bio_endio() should trace the final completion
+				 * of this bio. */
 /* See BVEC_POOL_OFFSET below before adding new flags */
 
 /*
-- 
cgit v1.3-6-gb490


From ee056f98126170ca8b16b9a4a6e20aae7c5c184e Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Wed, 5 Apr 2017 12:01:34 -0700
Subject: blk-mq-sched: provide hooks for initializing hardware queue data

Schedulers need to be informed when a hardware queue is added or removed
at runtime so they can allocate/free per-hardware queue data. So,
replace the blk_mq_sched_init_hctx_data() helper, which only makes sense
at init time, with .init_hctx() and .exit_hctx() hooks.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sched.c     | 81 +++++++++++++++++++++++++-----------------------
 block/blk-mq-sched.h     |  4 ---
 include/linux/elevator.h |  2 ++
 3 files changed, 45 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c974a1bbf4cb..9e3c0f92851b 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -30,43 +30,6 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
 
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
-				int (*init)(struct blk_mq_hw_ctx *),
-				void (*exit)(struct blk_mq_hw_ctx *))
-{
-	struct blk_mq_hw_ctx *hctx;
-	int ret;
-	int i;
-
-	queue_for_each_hw_ctx(q, hctx, i) {
-		hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
-		if (!hctx->sched_data) {
-			ret = -ENOMEM;
-			goto error;
-		}
-
-		if (init) {
-			ret = init(hctx);
-			if (ret) {
-				/*
-				 * We don't want to give exit() a partially
-				 * initialized sched_data. init() must clean up
-				 * if it fails.
-				 */
-				kfree(hctx->sched_data);
-				hctx->sched_data = NULL;
-				goto error;
-			}
-		}
-	}
-
-	return 0;
-error:
-	blk_mq_sched_free_hctx_data(q, exit);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
-
 static void __blk_mq_sched_assign_ioc(struct request_queue *q,
 				      struct request *rq,
 				      struct bio *bio,
@@ -508,11 +471,24 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 			   unsigned int hctx_idx)
 {
 	struct elevator_queue *e = q->elevator;
+	int ret;
 
 	if (!e)
 		return 0;
 
-	return blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
+	ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
+	if (ret)
+		return ret;
+
+	if (e->type->ops.mq.init_hctx) {
+		ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
+		if (ret) {
+			blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
+			return ret;
+		}
+	}
+
+	return 0;
 }
 
 void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
@@ -523,12 +499,18 @@ void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
 	if (!e)
 		return;
 
+	if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
+		e->type->ops.mq.exit_hctx(hctx, hctx_idx);
+		hctx->sched_data = NULL;
+	}
+
 	blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
 }
 
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 {
 	struct blk_mq_hw_ctx *hctx;
+	struct elevator_queue *eq;
 	unsigned int i;
 	int ret;
 
@@ -553,6 +535,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	if (ret)
 		goto err;
 
+	if (e->ops.mq.init_hctx) {
+		queue_for_each_hw_ctx(q, hctx, i) {
+			ret = e->ops.mq.init_hctx(hctx, i);
+			if (ret) {
+				eq = q->elevator;
+				blk_mq_exit_sched(q, eq);
+				kobject_put(&eq->kobj);
+				return ret;
+			}
+		}
+	}
+
 	return 0;
 
 err:
@@ -563,6 +557,17 @@ err:
 
 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 {
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	if (e->type->ops.mq.exit_hctx) {
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (hctx->sched_data) {
+				e->type->ops.mq.exit_hctx(hctx, i);
+				hctx->sched_data = NULL;
+			}
+		}
+	}
 	if (e->type->ops.mq.exit_sched)
 		e->type->ops.mq.exit_sched(e);
 	blk_mq_sched_tags_teardown(q);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 3a9e6e40558b..f4bc186c3440 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -4,10 +4,6 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
-				int (*init)(struct blk_mq_hw_ctx *),
-				void (*exit)(struct blk_mq_hw_ctx *));
-
 void blk_mq_sched_free_hctx_data(struct request_queue *q,
 				 void (*exit)(struct blk_mq_hw_ctx *));
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 22d39e8d4de1..b7ec315ee7e7 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -93,6 +93,8 @@ struct blk_mq_hw_ctx;
 struct elevator_mq_ops {
 	int (*init_sched)(struct request_queue *, struct elevator_type *);
 	void (*exit_sched)(struct elevator_queue *);
+	int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
+	void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
 
 	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
 	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
-- 
cgit v1.3-6-gb490


From 1d62ac13634840e02f9b20df9d8e21204f9ab8b8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Apr 2017 19:21:00 +0200
Subject: block: renumber REQ_OP_WRITE_ZEROES

Make life easy for implementations that needs to send a data buffer
to the device (e.g. SCSI) by numbering it as a data out command.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 72aa9519167e..c5bae0a669d1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -174,7 +174,7 @@ enum req_opf {
 	/* write the same sector many times */
 	REQ_OP_WRITE_SAME	= 7,
 	/* write the zero filled sector many times */
-	REQ_OP_WRITE_ZEROES	= 8,
+	REQ_OP_WRITE_ZEROES	= 9,
 
 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
-- 
cgit v1.3-6-gb490


From ac62d6208a7977107a47be4eb8566d6e5034b5f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Apr 2017 19:21:05 +0200
Subject: dm: support REQ_OP_WRITE_ZEROES

Copy & paste from the REQ_OP_WRITE_SAME code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm-core.h          |  1 +
 drivers/md/dm-io.c            |  8 ++++++--
 drivers/md/dm-linear.c        |  1 +
 drivers/md/dm-mpath.c         |  1 +
 drivers/md/dm-rq.c            | 11 ++++++++---
 drivers/md/dm-stripe.c        |  2 ++
 drivers/md/dm-table.c         | 30 ++++++++++++++++++++++++++++++
 drivers/md/dm.c               | 31 ++++++++++++++++++++++++++++---
 include/linux/device-mapper.h |  6 ++++++
 9 files changed, 83 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 136fda3ff9e5..fea5bd52ada8 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -132,6 +132,7 @@ void dm_init_md_queue(struct mapped_device *md);
 void dm_init_normal_md_queue(struct mapped_device *md);
 int md_in_flight(struct mapped_device *md);
 void disable_write_same(struct mapped_device *md);
+void disable_write_zeroes(struct mapped_device *md);
 
 static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
 {
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index b808cbe22678..3702e502466d 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -312,9 +312,12 @@ static void do_region(int op, int op_flags, unsigned region,
 	 */
 	if (op == REQ_OP_DISCARD)
 		special_cmd_max_sectors = q->limits.max_discard_sectors;
+	else if (op == REQ_OP_WRITE_ZEROES)
+		special_cmd_max_sectors = q->limits.max_write_zeroes_sectors;
 	else if (op == REQ_OP_WRITE_SAME)
 		special_cmd_max_sectors = q->limits.max_write_same_sectors;
-	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) &&
+	if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
+	     op == REQ_OP_WRITE_SAME)  &&
 	    special_cmd_max_sectors == 0) {
 		dec_count(io, region, -EOPNOTSUPP);
 		return;
@@ -330,6 +333,7 @@ static void do_region(int op, int op_flags, unsigned region,
 		 */
 		switch (op) {
 		case REQ_OP_DISCARD:
+		case REQ_OP_WRITE_ZEROES:
 			num_bvecs = 0;
 			break;
 		case REQ_OP_WRITE_SAME:
@@ -347,7 +351,7 @@ static void do_region(int op, int op_flags, unsigned region,
 		bio_set_op_attrs(bio, op, op_flags);
 		store_io_and_region_in_bio(bio, io, region);
 
-		if (op == REQ_OP_DISCARD) {
+		if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
 			num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
 			bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
 			remaining -= num_sectors;
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4788b0b989a9..e17fd44ceef5 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
+	ti->num_write_zeroes_bios = 1;
 	ti->private = lc;
 	return 0;
 
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 7f223dbed49f..ab55955ed704 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1103,6 +1103,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->num_write_same_bios = 1;
+	ti->num_write_zeroes_bios = 1;
 	if (m->queue_mode == DM_TYPE_BIO_BASED)
 		ti->per_io_data_size = multipath_per_bio_data_size();
 	else
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index d19af1d21f4c..e60f1b6845be 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -298,9 +298,14 @@ static void dm_done(struct request *clone, int error, bool mapped)
 			r = rq_end_io(tio->ti, clone, error, &tio->info);
 	}
 
-	if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
-		     !clone->q->limits.max_write_same_sectors))
-		disable_write_same(tio->md);
+	if (unlikely(r == -EREMOTEIO)) {
+		if (req_op(clone) == REQ_OP_WRITE_SAME &&
+		    !clone->q->limits.max_write_same_sectors)
+			disable_write_same(tio->md);
+		if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
+		    !clone->q->limits.max_write_zeroes_sectors)
+			disable_write_zeroes(tio->md);
+	}
 
 	if (r <= 0)
 		/* The target wants to complete the I/O */
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 28193a57bf47..5ef49c121d99 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_flush_bios = stripes;
 	ti->num_discard_bios = stripes;
 	ti->num_write_same_bios = stripes;
+	ti->num_write_zeroes_bios = stripes;
 
 	sc->chunk_size = chunk_size;
 	if (chunk_size & (chunk_size - 1))
@@ -293,6 +294,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_REMAPPED;
 	}
 	if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
+	    unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
 	    unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
 		target_bio_nr = dm_bio_get_target_bio_nr(bio);
 		BUG_ON(target_bio_nr >= sc->stripes);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ad16d9c9d5a..5cd665c91ead 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1533,6 +1533,34 @@ static bool dm_table_supports_write_same(struct dm_table *t)
 	return true;
 }
 
+static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
+					   sector_t start, sector_t len, void *data)
+{
+	struct request_queue *q = bdev_get_queue(dev->bdev);
+
+	return q && !q->limits.max_write_zeroes_sectors;
+}
+
+static bool dm_table_supports_write_zeroes(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (!ti->num_write_zeroes_bios)
+			return false;
+
+		if (!ti->type->iterate_devices ||
+		    ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
+			return false;
+	}
+
+	return true;
+}
+
+
 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
 				  sector_t start, sector_t len, void *data)
 {
@@ -1603,6 +1631,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 
 	if (!dm_table_supports_write_same(t))
 		q->limits.max_write_same_sectors = 0;
+	if (!dm_table_supports_write_zeroes(t))
+		q->limits.max_write_zeroes_sectors = 0;
 
 	if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
 		queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index cd93a3b9ceca..8bf397729bbd 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -824,6 +824,14 @@ void disable_write_same(struct mapped_device *md)
 	limits->max_write_same_sectors = 0;
 }
 
+void disable_write_zeroes(struct mapped_device *md)
+{
+	struct queue_limits *limits = dm_get_queue_limits(md);
+
+	/* device doesn't really support WRITE ZEROES, disable it */
+	limits->max_write_zeroes_sectors = 0;
+}
+
 static void clone_endio(struct bio *bio)
 {
 	int error = bio->bi_error;
@@ -850,9 +858,14 @@ static void clone_endio(struct bio *bio)
 		}
 	}
 
-	if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
-		     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
-		disable_write_same(md);
+	if (unlikely(r == -EREMOTEIO)) {
+		if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+			disable_write_same(md);
+		if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+		    !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+			disable_write_zeroes(md);
+	}
 
 	free_tio(tio);
 	dec_pending(io, error);
@@ -1201,6 +1214,11 @@ static unsigned get_num_write_same_bios(struct dm_target *ti)
 	return ti->num_write_same_bios;
 }
 
+static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
+{
+	return ti->num_write_zeroes_bios;
+}
+
 typedef bool (*is_split_required_fn)(struct dm_target *ti);
 
 static bool is_split_required_for_discard(struct dm_target *ti)
@@ -1255,6 +1273,11 @@ static int __send_write_same(struct clone_info *ci)
 	return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
 }
 
+static int __send_write_zeroes(struct clone_info *ci)
+{
+	return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
+}
+
 /*
  * Select the correct strategy for processing a non-flush bio.
  */
@@ -1269,6 +1292,8 @@ static int __split_and_process_non_flush(struct clone_info *ci)
 		return __send_discard(ci);
 	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
 		return __send_write_same(ci);
+	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
+		return __send_write_zeroes(ci);
 
 	ti = dm_table_find_target(ci->map, ci->sector);
 	if (!dm_target_is_valid(ti))
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index a7e6903866fd..3829bee2302a 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -254,6 +254,12 @@ struct dm_target {
 	 */
 	unsigned num_write_same_bios;
 
+	/*
+	 * The number of WRITE ZEROES bios that will be submitted to the target.
+	 * The bio number can be accessed with dm_bio_get_target_bio_nr.
+	 */
+	unsigned num_write_zeroes_bios;
+
 	/*
 	 * The minimum number of extra bytes allocated in each io for the
 	 * target to use.
-- 
cgit v1.3-6-gb490


From ee472d835c264a4cb77f8cf878603e1e40f3559e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Apr 2017 19:21:08 +0200
Subject: block: add a flags argument to (__)blkdev_issue_zeroout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Turn the existing discard flag into a new BLKDEV_ZERO_UNMAP flag with
similar semantics, but without referring to diѕcard.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-lib.c                    | 31 ++++++++++++++-----------------
 block/ioctl.c                      |  2 +-
 drivers/block/drbd/drbd_receiver.c |  9 ++++++---
 drivers/nvme/target/io-cmd.c       |  2 +-
 fs/block_dev.c                     |  2 +-
 fs/dax.c                           |  2 +-
 fs/xfs/xfs_bmap_util.c             |  2 +-
 include/linux/blkdev.h             | 16 ++++++++++------
 8 files changed, 35 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2a8d638544a7..f9f24ec69c27 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -282,14 +282,18 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
  * @biop:	pointer to anchor bio
- * @discard:	discard flag
+ * @flags:	controls detailed behavior
  *
  * Description:
- *  Generate and issue number of bios with zerofiled pages.
+ *  Zero-fill a block range, either using hardware offload or by explicitly
+ *  writing zeroes to the device.
+ *
+ *  If a device is using logical block provisioning, the underlying space will
+ *  not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
  */
 int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
-		bool discard)
+		unsigned flags)
 {
 	int ret;
 	int bi_size = 0;
@@ -337,28 +341,21 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @discard:	whether to discard the block range
+ * @flags:	controls detailed behavior
  *
  * Description:
- *  Zero-fill a block range.  If the discard flag is set and the block
- *  device guarantees that subsequent READ operations to the block range
- *  in question will return zeroes, the blocks will be discarded. Should
- *  the discard request fail, if the discard flag is not set, or if
- *  discard_zeroes_data is not supported, this function will resort to
- *  zeroing the blocks manually, thus provisioning (allocating,
- *  anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
- *  command(s), blkdev_issue_zeroout() will use it to optimize the process of
- *  clearing the block range. Otherwise the zeroing will be performed
- *  using regular WRITE calls.
+ *  Zero-fill a block range, either using hardware offload or by explicitly
+ *  writing zeroes to the device.  See __blkdev_issue_zeroout() for the
+ *  valid values for %flags.
  */
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			 sector_t nr_sects, gfp_t gfp_mask, bool discard)
+		sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
 {
 	int ret;
 	struct bio *bio = NULL;
 	struct blk_plug plug;
 
-	if (discard) {
+	if (!(flags & BLKDEV_ZERO_NOUNMAP)) {
 		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
 				BLKDEV_DISCARD_ZERO))
 			return 0;
@@ -366,7 +363,7 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 
 	blk_start_plug(&plug);
 	ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
-			&bio, discard);
+			&bio, flags);
 	if (ret == 0 && bio) {
 		ret = submit_bio_wait(bio);
 		bio_put(bio);
diff --git a/block/ioctl.c b/block/ioctl.c
index 7b88820b93d9..8ea00a41be01 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -255,7 +255,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
 	truncate_inode_pages_range(mapping, start, end);
 
 	return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
-				    false);
+			BLKDEV_ZERO_NOUNMAP);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index aa6bf9692eff..dc9a6dcd431c 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1499,19 +1499,22 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u
 		tmp = start + granularity - sector_div(tmp, granularity);
 
 		nr = tmp - start;
-		err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
+		err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO,
+				BLKDEV_ZERO_NOUNMAP);
 		nr_sectors -= nr;
 		start = tmp;
 	}
 	while (nr_sectors >= granularity) {
 		nr = min_t(sector_t, nr_sectors, max_discard_sectors);
-		err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
+		err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO,
+				BLKDEV_ZERO_NOUNMAP);
 		nr_sectors -= nr;
 		start += nr;
 	}
  zero_out:
 	if (nr_sectors) {
-		err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
+		err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
+				BLKDEV_ZERO_NOUNMAP);
 	}
 	return err != 0;
 }
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index a4b455156bb5..c77940d80fc8 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -184,7 +184,7 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
 		(req->ns->blksize_shift - 9)) + 1;
 
 	if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
-				GFP_KERNEL, &bio, true))
+				GFP_KERNEL, &bio, 0))
 		status = NVME_SC_INTERNAL | NVME_SC_DNR;
 
 	if (bio) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f2d59f143ef4..2f704c3a816f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -2105,7 +2105,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
 	case FALLOC_FL_ZERO_RANGE:
 	case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
 		error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
-					    GFP_KERNEL, false);
+					    GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
 		break;
 	case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
 		/* Only punch if the device can do zeroing discard. */
diff --git a/fs/dax.c b/fs/dax.c
index de622d4282a6..2bfbcd726047 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -982,7 +982,7 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 		sector_t start_sector = dax.sector + (offset >> 9);
 
 		return blkdev_issue_zeroout(bdev, start_sector,
-				length >> 9, GFP_NOFS, true);
+				length >> 9, GFP_NOFS, 0);
 	} else {
 		if (dax_map_atomic(bdev, &dax) < 0)
 			return PTR_ERR(dax.addr);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 8b75dcea5966..142bbbe06114 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -81,7 +81,7 @@ xfs_zero_extent(
 	return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
 		block << (mp->m_super->s_blocksize_bits - 9),
 		count_fsb << (mp->m_super->s_blocksize_bits - 9),
-		GFP_NOFS, true);
+		GFP_NOFS, 0);
 }
 
 int
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d76bebbc632e..bd60f4401c9d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1336,23 +1336,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 	return bqt->tag_index[tag];
 }
 
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
+extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 
 #define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
 #define BLKDEV_DISCARD_ZERO	(1 << 1)	/* must reliably zero data */
 
-extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, int flags,
 		struct bio **biop);
-extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
+
+#define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
+
 extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
-		bool discard);
+		unsigned flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, bool discard);
+		sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
+
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
@@ -1366,7 +1370,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
 	return blkdev_issue_zeroout(sb->s_bdev,
 				    block << (sb->s_blocksize_bits - 9),
 				    nr_blocks << (sb->s_blocksize_bits - 9),
-				    gfp_mask, true);
+				    gfp_mask, 0);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
-- 
cgit v1.3-6-gb490


From d928be9f853b9755692d7e9aed402c1809a88e56 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Apr 2017 19:21:09 +0200
Subject: block: add a REQ_NOUNMAP flag for REQ_OP_WRITE_ZEROES

If this flag is set logical provisioning capable device should
release space for the zeroed blocks if possible, if it is not set
devices should keep the blocks anchored.

Also remove an out of sync kerneldoc comment for a static function
that would have become even more out of data with this change.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-lib.c           | 19 +++++--------------
 include/linux/blk_types.h |  6 ++++++
 2 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index f9f24ec69c27..2f6d2cb2e1a2 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -226,20 +226,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL(blkdev_issue_write_same);
 
-/**
- * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
- * @bdev:	blockdev to issue
- * @sector:	start sector
- * @nr_sects:	number of sectors to write
- * @gfp_mask:	memory allocation flags (for bio_alloc)
- * @biop:	pointer to anchor bio
- *
- * Description:
- *  Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages.
- */
 static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
-		struct bio **biop)
+		struct bio **biop, unsigned flags)
 {
 	struct bio *bio = *biop;
 	unsigned int max_write_zeroes_sectors;
@@ -258,7 +247,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		bio = next_bio(bio, 0, gfp_mask);
 		bio->bi_iter.bi_sector = sector;
 		bio->bi_bdev = bdev;
-		bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+		bio->bi_opf = REQ_OP_WRITE_ZEROES;
+		if (flags & BLKDEV_ZERO_NOUNMAP)
+			bio->bi_opf |= REQ_NOUNMAP;
 
 		if (nr_sects > max_write_zeroes_sectors) {
 			bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
@@ -306,7 +297,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		return -EINVAL;
 
 	ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
-			biop);
+			biop, flags);
 	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
 		goto out;
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c5bae0a669d1..61339bc44400 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -201,6 +201,10 @@ enum req_flag_bits {
 	__REQ_PREFLUSH,		/* request for cache flush */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
 	__REQ_BACKGROUND,	/* background IO */
+
+	/* command specific flags for REQ_OP_WRITE_ZEROES: */
+	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
+
 	__REQ_NR_BITS,		/* stops here */
 };
 
@@ -218,6 +222,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 
+#define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
+
 #define REQ_FAILFAST_MASK \
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 
-- 
cgit v1.3-6-gb490


From cb365b9675fda026caba4cb5df83292cb7c0811a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Apr 2017 19:21:10 +0200
Subject: block: add a new BLKDEV_ZERO_NOFALLBACK flag

This avoids fallbacks to explicit zeroing in (__)blkdev_issue_zeroout if
the caller doesn't want them.

Also clean up the convoluted check for the return condition that this
new flag is added to.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-lib.c        | 5 ++++-
 include/linux/blkdev.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 2f6d2cb2e1a2..2f882e22890b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -281,6 +281,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
  *
  *  If a device is using logical block provisioning, the underlying space will
  *  not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
+ *
+ *  If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
+ *  -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
  */
 int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
@@ -298,7 +301,7 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 
 	ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
 			biop, flags);
-	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+	if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
 		goto out;
 
 	ret = 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bd60f4401c9d..21a30f011674 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1350,6 +1350,7 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		struct bio **biop);
 
 #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
+#define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
 
 extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
-- 
cgit v1.3-6-gb490


From 48920ff2a5a940cd07d12cc79e4a2c75f1185aee Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 5 Apr 2017 19:21:23 +0200
Subject: block: remove the discard_zeroes_data flag

Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere we can
kill this hack.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/ABI/testing/sysfs-block | 10 ++-----
 Documentation/block/queue-sysfs.txt   |  5 ----
 block/blk-lib.c                       |  7 +----
 block/blk-settings.c                  |  3 ---
 block/blk-sysfs.c                     |  2 +-
 block/compat_ioctl.c                  |  2 +-
 block/ioctl.c                         |  2 +-
 drivers/block/drbd/drbd_main.c        |  2 --
 drivers/block/drbd/drbd_nl.c          |  7 +----
 drivers/block/loop.c                  |  2 --
 drivers/block/mtip32xx/mtip32xx.c     |  1 -
 drivers/block/nbd.c                   |  1 -
 drivers/md/dm-cache-target.c          |  1 -
 drivers/md/dm-crypt.c                 |  1 -
 drivers/md/dm-raid.c                  |  6 ++---
 drivers/md/dm-raid1.c                 |  1 -
 drivers/md/dm-table.c                 | 19 -------------
 drivers/md/dm-thin.c                  |  2 --
 drivers/md/raid5.c                    | 50 +++++++++++------------------------
 drivers/scsi/sd.c                     |  5 ----
 drivers/target/target_core_device.c   |  2 +-
 include/linux/blkdev.h                | 15 -----------
 include/linux/device-mapper.h         |  5 ----
 23 files changed, 27 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 2da04ce6aeef..dea212db9df3 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -213,14 +213,8 @@ What:		/sys/block/<disk>/queue/discard_zeroes_data
 Date:		May 2011
 Contact:	Martin K. Petersen <martin.petersen@oracle.com>
 Description:
-		Devices that support discard functionality may return
-		stale or random data when a previously discarded block
-		is read back. This can cause problems if the filesystem
-		expects discarded blocks to be explicitly cleared. If a
-		device reports that it deterministically returns zeroes
-		when a discarded area is read the discard_zeroes_data
-		parameter will be set to one. Otherwise it will be 0 and
-		the result of reading a discarded area is undefined.
+		Will always return 0.  Don't rely on any specific behavior
+		for discards, and don't read this file.
 
 What:		/sys/block/<disk>/queue/write_same_max_bytes
 Date:		January 2012
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index b7f6bdc96d73..2c1e67058fd3 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue
 smaller discards and potentially help reduce latencies induced by large
 discard operations.
 
-discard_zeroes_data (RO)
-------------------------
-When read, this file will show if the discarded block are zeroed by the
-device or not. If its value is '1' the blocks are zeroed otherwise not.
-
 hw_sector_size (RO)
 -------------------
 This is the hardware sector size of the device, in bytes.
diff --git a/block/blk-lib.c b/block/blk-lib.c
index b0c6c4bcf441..e8caecd71688 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		return -ENXIO;
 
 	if (flags & BLKDEV_DISCARD_SECURE) {
-		if (flags & BLKDEV_DISCARD_ZERO)
-			return -EOPNOTSUPP;
 		if (!blk_queue_secure_erase(q))
 			return -EOPNOTSUPP;
 		op = REQ_OP_SECURE_ERASE;
 	} else {
 		if (!blk_queue_discard(q))
 			return -EOPNOTSUPP;
-		if ((flags & BLKDEV_DISCARD_ZERO) &&
-		    !q->limits.discard_zeroes_data)
-			return -EOPNOTSUPP;
 		op = REQ_OP_DISCARD;
 	}
 
@@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			&bio);
 	if (!ret && bio) {
 		ret = submit_bio_wait(bio);
-		if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
+		if (ret == -EOPNOTSUPP)
 			ret = 0;
 		bio_put(bio);
 	}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1e7174ffc9d4..4fa81ed383ca 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->discard_granularity = 0;
 	lim->discard_alignment = 0;
 	lim->discard_misaligned = 0;
-	lim->discard_zeroes_data = 0;
 	lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
 	lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
 	lim->alignment_offset = 0;
@@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
 	blk_set_default_limits(lim);
 
 	/* Inherit limits from component devices */
-	lim->discard_zeroes_data = 1;
 	lim->max_segments = USHRT_MAX;
 	lim->max_discard_segments = 1;
 	lim->max_hw_sectors = UINT_MAX;
@@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
 
 	t->cluster &= b->cluster;
-	t->discard_zeroes_data &= b->discard_zeroes_data;
 
 	/* Physical block size a multiple of the logical block size? */
 	if (t->physical_block_size & (t->logical_block_size - 1)) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index c47db43a40cc..fc20489f0d2b 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q,
 
 static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(queue_discard_zeroes_data(q), page);
+	return queue_var_show(0, page);
 }
 
 static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 570021a0dc1c..04325b81c2b4 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKALIGNOFF:
 		return compat_put_int(arg, bdev_alignment_offset(bdev));
 	case BLKDISCARDZEROES:
-		return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
+		return compat_put_uint(arg, 0);
 	case BLKFLSBUF:
 	case BLKROSET:
 	case BLKDISCARD:
diff --git a/block/ioctl.c b/block/ioctl.c
index 8ea00a41be01..0de02ee67eed 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKALIGNOFF:
 		return put_int(arg, bdev_alignment_offset(bdev));
 	case BLKDISCARDZEROES:
-		return put_uint(arg, bdev_discard_zeroes_data(bdev));
+		return put_uint(arg, 0);
 	case BLKSECTGET:
 		max_sectors = min_t(unsigned int, USHRT_MAX,
 				    queue_max_sectors(bdev_get_queue(bdev)));
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 8e62d9f65510..84455c365f57 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -931,7 +931,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
 		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 		p->qlim->discard_enabled = blk_queue_discard(q);
-		p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
 		p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
 	} else {
 		q = device->rq_queue;
@@ -941,7 +940,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
 		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
 		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
 		p->qlim->discard_enabled = 0;
-		p->qlim->discard_zeroes_data = 0;
 		p->qlim->write_same_capable = 0;
 	}
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e4516d3b971d..02255a0d68b9 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1199,10 +1199,6 @@ static void decide_on_discard_support(struct drbd_device *device,
 	struct drbd_connection *connection = first_peer_device(device)->connection;
 	bool can_do = b ? blk_queue_discard(b) : true;
 
-	if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
-		can_do = false;
-		drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
-	}
 	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
 		can_do = false;
 		drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
@@ -1484,8 +1480,7 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis
 	if (disk_conf->al_extents > drbd_al_extents_max(nbc))
 		disk_conf->al_extents = drbd_al_extents_max(nbc);
 
-	if (!blk_queue_discard(q)
-	    || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
+	if (!blk_queue_discard(q)) {
 		if (disk_conf->rs_discard_granularity) {
 			disk_conf->rs_discard_granularity = 0; /* disable feature */
 			drbd_info(device, "rs_discard_granularity feature disabled\n");
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3bb04c1a4ba1..3081d83d2ea3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -828,7 +828,6 @@ static void loop_config_discard(struct loop_device *lo)
 		q->limits.discard_alignment = 0;
 		blk_queue_max_discard_sectors(q, 0);
 		blk_queue_max_write_zeroes_sectors(q, 0);
-		q->limits.discard_zeroes_data = 0;
 		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
 		return;
 	}
@@ -837,7 +836,6 @@ static void loop_config_discard(struct loop_device *lo)
 	q->limits.discard_alignment = 0;
 	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
 	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
-	q->limits.discard_zeroes_data = 1;
 	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
 }
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 30076e7753bc..05e3e664ea1b 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -4025,7 +4025,6 @@ skip_create_disk:
 		dd->queue->limits.discard_granularity = 4096;
 		blk_queue_max_discard_sectors(dd->queue,
 			MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
-		dd->queue->limits.discard_zeroes_data = 0;
 	}
 
 	/* Set the capacity of the device in 512 byte sectors. */
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 03ae72985c79..b02f2362fdf7 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1110,7 +1110,6 @@ static int nbd_dev_add(int index)
 	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
 	disk->queue->limits.discard_granularity = 512;
 	blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
-	disk->queue->limits.discard_zeroes_data = 0;
 	blk_queue_max_hw_sectors(disk->queue, 65536);
 	disk->queue->limits.max_sectors = 256;
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9c689b34e6e7..975922c8f231 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 
 	ti->num_discard_bios = 1;
 	ti->discards_supported = true;
-	ti->discard_zeroes_data_unsupported = true;
 	ti->split_discard_bios = false;
 
 	cache->features = ca->features;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 389a3637ffcc..ef1d836bd81b 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	wake_up_process(cc->write_thread);
 
 	ti->num_flush_bios = 1;
-	ti->discard_zeroes_data_unsupported = true;
 
 	return 0;
 
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index f8564d63982f..468f1380de1d 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs)
 	/* Assume discards not supported until after checks below. */
 	ti->discards_supported = false;
 
-	/* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+	/*
+	 * XXX: RAID level 4,5,6 require zeroing for safety.
+	 */
 	raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
 
 	for (i = 0; i < rs->raid_disks; i++) {
@@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs)
 			return;
 
 		if (raid456) {
-			if (!q->limits.discard_zeroes_data)
-				return;
 			if (!devices_handle_discard_safely) {
 				DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
 				DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 2ddc2d20e62d..a95cbb80fb34 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	ti->num_flush_bios = 1;
 	ti->num_discard_bios = 1;
 	ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
-	ti->discard_zeroes_data_unsupported = true;
 
 	ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
 	if (!ms->kmirrord_wq) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5cd665c91ead..958275aca008 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
 	return false;
 }
 
-static bool dm_table_discard_zeroes_data(struct dm_table *t)
-{
-	struct dm_target *ti;
-	unsigned i = 0;
-
-	/* Ensure that all targets supports discard_zeroes_data. */
-	while (i < dm_table_get_num_targets(t)) {
-		ti = dm_table_get_target(t, i++);
-
-		if (ti->discard_zeroes_data_unsupported)
-			return false;
-	}
-
-	return true;
-}
-
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
 			    sector_t start, sector_t len, void *data)
 {
@@ -1620,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	}
 	blk_queue_write_cache(q, wc, fua);
 
-	if (!dm_table_discard_zeroes_data(t))
-		q->limits.discard_zeroes_data = 0;
-
 	/* Ensure that all underlying devices are non-rotational. */
 	if (dm_table_all_devices_attribute(t, device_is_nonrot))
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2b266a2b5035..a5f1916f621a 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 * them down to the data device.  The thin device's discard
 	 * processing will cause mappings to be removed from the btree.
 	 */
-	ti->discard_zeroes_data_unsupported = true;
 	if (pf.discard_enabled && pf.discard_passdown) {
 		ti->num_discard_bios = 1;
 
@@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
 
 	/* In case the pool supports discards, pass them on. */
-	ti->discard_zeroes_data_unsupported = true;
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1725a54042bb..2efdb0d67460 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7227,7 +7227,6 @@ static int raid5_run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		int chunk_size;
-		bool discard_supported = true;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -7263,12 +7262,6 @@ static int raid5_run(struct mddev *mddev)
 		blk_queue_max_discard_sectors(mddev->queue,
 					      0xfffe * STRIPE_SECTORS);
 
-		/*
-		 * unaligned part of discard request will be ignored, so can't
-		 * guarantee discard_zeroes_data
-		 */
-		mddev->queue->limits.discard_zeroes_data = 0;
-
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 
@@ -7277,35 +7270,24 @@ static int raid5_run(struct mddev *mddev)
 					  rdev->data_offset << 9);
 			disk_stack_limits(mddev->gendisk, rdev->bdev,
 					  rdev->new_data_offset << 9);
-			/*
-			 * discard_zeroes_data is required, otherwise data
-			 * could be lost. Consider a scenario: discard a stripe
-			 * (the stripe could be inconsistent if
-			 * discard_zeroes_data is 0); write one disk of the
-			 * stripe (the stripe could be inconsistent again
-			 * depending on which disks are used to calculate
-			 * parity); the disk is broken; The stripe data of this
-			 * disk is lost.
-			 */
-			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
-			    !bdev_get_queue(rdev->bdev)->
-						limits.discard_zeroes_data)
-				discard_supported = false;
-			/* Unfortunately, discard_zeroes_data is not currently
-			 * a guarantee - just a hint.  So we only allow DISCARD
-			 * if the sysadmin has confirmed that only safe devices
-			 * are in use by setting a module parameter.
-			 */
-			if (!devices_handle_discard_safely) {
-				if (discard_supported) {
-					pr_info("md/raid456: discard support disabled due to uncertainty.\n");
-					pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
-				}
-				discard_supported = false;
-			}
 		}
 
-		if (discard_supported &&
+		/*
+		 * zeroing is required, otherwise data
+		 * could be lost. Consider a scenario: discard a stripe
+		 * (the stripe could be inconsistent if
+		 * discard_zeroes_data is 0); write one disk of the
+		 * stripe (the stripe could be inconsistent again
+		 * depending on which disks are used to calculate
+		 * parity); the disk is broken; The stripe data of this
+		 * disk is lost.
+		 *
+		 * We only allow DISCARD if the sysadmin has confirmed that
+		 * only safe devices are in use by setting a module parameter.
+		 * A better idea might be to turn DISCARD into WRITE_ZEROES
+		 * requests, as that is required to be safe.
+		 */
+		if (devices_handle_discard_safely &&
 		    mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
 		    mddev->queue->limits.discard_granularity >= stripe)
 			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 001593ed0444..bcb0cb020fd2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -644,8 +644,6 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	unsigned int logical_block_size = sdkp->device->sector_size;
 	unsigned int max_blocks = 0;
 
-	q->limits.discard_zeroes_data = 0;
-
 	/*
 	 * When LBPRZ is reported, discard alignment and granularity
 	 * must be fixed to the logical block size. Otherwise the block
@@ -681,19 +679,16 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	case SD_LBP_WS16:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS16_BLOCKS);
-		q->limits.discard_zeroes_data = sdkp->lbprz;
 		break;
 
 	case SD_LBP_WS10:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS10_BLOCKS);
-		q->limits.discard_zeroes_data = sdkp->lbprz;
 		break;
 
 	case SD_LBP_ZERO:
 		max_blocks = min_not_zero(sdkp->max_ws_blocks,
 					  (u32)SD_MAX_WS10_BLOCKS);
-		q->limits.discard_zeroes_data = 1;
 		break;
 	}
 
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index c754ae33bf7b..d2f089cfa9ae 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -851,7 +851,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
 	attrib->unmap_granularity = q->limits.discard_granularity / block_size;
 	attrib->unmap_granularity_alignment = q->limits.discard_alignment /
 								block_size;
-	attrib->unmap_zeroes_data = q->limits.discard_zeroes_data;
+	attrib->unmap_zeroes_data = 0;
 	return true;
 }
 EXPORT_SYMBOL(target_configure_unmap_from_queue);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 21a30f011674..ec993573e0a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -339,7 +339,6 @@ struct queue_limits {
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		cluster;
-	unsigned char		discard_zeroes_data;
 	unsigned char		raid_partial_stripes_expensive;
 	enum blk_zoned_model	zoned;
 };
@@ -1341,7 +1340,6 @@ extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 
 #define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
-#define BLKDEV_DISCARD_ZERO	(1 << 1)	/* must reliably zero data */
 
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
@@ -1541,19 +1539,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
 	return q->limits.discard_alignment;
 }
 
-static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
-{
-	if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
-		return 1;
-
-	return 0;
-}
-
-static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
-{
-	return queue_discard_zeroes_data(bdev_get_queue(bdev));
-}
-
 static inline unsigned int bdev_write_same(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 3829bee2302a..c7ea33e38fb9 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -296,11 +296,6 @@ struct dm_target {
 	 * on max_io_len boundary.
 	 */
 	bool split_discard_bios:1;
-
-	/*
-	 * Set if this target does not return zeroes on discarded blocks.
-	 */
-	bool discard_zeroes_data_unsupported:1;
 };
 
 /* Each target can link one of these into the table */
-- 
cgit v1.3-6-gb490


From db47d5f856467ce0dd3af7e20a33df3d901266df Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 25 Jan 2017 20:30:29 +0100
Subject: x86/nmi, EDAC: Get rid of DRAM error reporting thru PCI SERR NMI

Apparently, some machines used to report DRAM errors through a PCI SERR
NMI. This is why we have a call into EDAC in the NMI handler. See

  c0d121720220 ("drivers/edac: add new nmi rescan").

From looking at the patch above, that's two drivers: e752x_edac.c and
e7xxx_edac.c. Now, I wanna say those are old machines which are probably
decommissioned already.

Tony says that "[t]the newest CPU supported by either of those drivers
is the Xeon E7520 (a.k.a. "Nehalem") released in Q1'2010. Possibly some
folks are still using these ... but people that hold onto h/w for 7
years generally cling to old s/w too ... so I'd guess it unlikely that
we will get complaints for breaking these in upstream."

So even if there is a small number still in use, we did load EDAC with
edac_op_state == EDAC_OPSTATE_POLL by default (we still do, in fact)
which means a default EDAC setup without any parameters supplied on the
command line or otherwise would never even log the error in the NMI
handler because we're polling by default:

  inline int edac_handler_set(void)
  {
         if (edac_op_state == EDAC_OPSTATE_POLL)
                 return 0;

         return atomic_read(&edac_handlers);
  }

So, long story short, I'd like to get rid of that nastiness called
edac_stub.c and confine all the EDAC drivers solely to drivers/edac/. If
we ever have to do stuff like that again, it should be notifiers we're
using and not some insanity like this one.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/nmi.c    | 11 -----------
 drivers/edac/edac_stub.c | 22 ----------------------
 include/linux/edac.h     |  2 --
 3 files changed, 35 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index f088ea4c66e7..f0c4c890f71b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -224,17 +224,6 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
 	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
 		 reason, smp_processor_id());
 
-	/*
-	 * On some machines, PCI SERR line is used to report memory
-	 * errors. EDAC makes use of it.
-	 */
-#if defined(CONFIG_EDAC)
-	if (edac_handler_set()) {
-		edac_atomic_assert_error();
-		return;
-	}
-#endif
-
 	if (panic_on_unrecovered_nmi)
 		nmi_panic(regs, "NMI: Not continuing");
 
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
index 952e411f01f2..d1389e4b7989 100644
--- a/drivers/edac/edac_stub.c
+++ b/drivers/edac/edac_stub.c
@@ -44,25 +44,3 @@ static int __init edac_report_setup(char *str)
 	return 0;
 }
 __setup("edac_report=", edac_report_setup);
-
-/*
- * called to determine if there is an EDAC driver interested in
- * knowing an event (such as NMI) occurred
- */
-int edac_handler_set(void)
-{
-	if (edac_op_state == EDAC_OPSTATE_POLL)
-		return 0;
-
-	return atomic_read(&edac_handlers);
-}
-EXPORT_SYMBOL_GPL(edac_handler_set);
-
-/*
- * handler for NMI type of interrupts to assert error
- */
-void edac_atomic_assert_error(void)
-{
-	edac_err_assert++;
-}
-EXPORT_SYMBOL_GPL(edac_atomic_assert_error);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 5b6adf964248..bf8daabf3d51 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -31,8 +31,6 @@ extern int edac_op_state;
 extern int edac_err_assert;
 extern atomic_t edac_handlers;
 
-extern int edac_handler_set(void);
-extern void edac_atomic_assert_error(void);
 extern struct bus_type *edac_get_sysfs_subsys(void);
 
 enum {
-- 
cgit v1.3-6-gb490


From 97bb6c17ad5a0892beb45070dfe8c7d6d0e5326e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 26 Jan 2017 16:49:59 +0100
Subject: EDAC: Get rid of edac_handlers

Use mc_devices list instead to check whether we have EDAC driver
instances successfully registered with EDAC core.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 drivers/edac/edac_mc.c   | 6 ++----
 drivers/edac/edac_stub.c | 3 ---
 include/linux/edac.h     | 1 -
 3 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index e5573c56b15e..824d31193b69 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -601,7 +601,6 @@ static int add_mc_to_global_list(struct mem_ctl_info *mci)
 	}
 
 	list_add_tail_rcu(&mci->link, insert_before);
-	atomic_inc(&edac_handlers);
 	return 0;
 
 fail0:
@@ -619,7 +618,6 @@ fail1:
 
 static int del_mc_from_global_list(struct mem_ctl_info *mci)
 {
-	int handlers = atomic_dec_return(&edac_handlers);
 	list_del_rcu(&mci->link);
 
 	/* these are for safe removal of devices from global list while
@@ -628,7 +626,7 @@ static int del_mc_from_global_list(struct mem_ctl_info *mci)
 	synchronize_rcu();
 	INIT_LIST_HEAD(&mci->link);
 
-	return handlers;
+	return list_empty(&mc_devices);
 }
 
 struct mem_ctl_info *edac_mc_find(int idx)
@@ -763,7 +761,7 @@ struct mem_ctl_info *edac_mc_del_mc(struct device *dev)
 	/* mark MCI offline: */
 	mci->op_state = OP_OFFLINE;
 
-	if (!del_mc_from_global_list(mci))
+	if (del_mc_from_global_list(mci))
 		edac_mc_owner = NULL;
 
 	mutex_unlock(&mem_ctls_mutex);
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
index d1389e4b7989..cfb250fa38ce 100644
--- a/drivers/edac/edac_stub.c
+++ b/drivers/edac/edac_stub.c
@@ -20,9 +20,6 @@
 int edac_op_state = EDAC_OPSTATE_INVAL;
 EXPORT_SYMBOL_GPL(edac_op_state);
 
-atomic_t edac_handlers = ATOMIC_INIT(0);
-EXPORT_SYMBOL_GPL(edac_handlers);
-
 int edac_err_assert = 0;
 EXPORT_SYMBOL_GPL(edac_err_assert);
 
diff --git a/include/linux/edac.h b/include/linux/edac.h
index bf8daabf3d51..9fd6fe53ab2a 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -29,7 +29,6 @@ struct device;
 
 extern int edac_op_state;
 extern int edac_err_assert;
-extern atomic_t edac_handlers;
 
 extern struct bus_type *edac_get_sysfs_subsys(void);
 
-- 
cgit v1.3-6-gb490


From d3116a0837261405e0febb8043fe7040c8ebccb4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Thu, 26 Jan 2017 18:25:11 +0100
Subject: EDAC: Remove edac_err_assert

... and the glue around it. It is not needed anymore.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 drivers/edac/edac_mc.c   | 18 +-----------------
 drivers/edac/edac_stub.c |  3 ---
 include/linux/edac.h     |  1 -
 3 files changed, 1 insertion(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 824d31193b69..482b6aea1ce7 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -504,22 +504,6 @@ struct mem_ctl_info *find_mci_by_dev(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(find_mci_by_dev);
 
-/*
- * handler for EDAC to check if NMI type handler has asserted interrupt
- */
-static int edac_mc_assert_error_check_and_clear(void)
-{
-	int old_state;
-
-	if (edac_op_state == EDAC_OPSTATE_POLL)
-		return 1;
-
-	old_state = edac_err_assert;
-	edac_err_assert = 0;
-
-	return old_state;
-}
-
 /*
  * edac_mc_workq_function
  *	performs the operation scheduled by a workq request
@@ -536,7 +520,7 @@ static void edac_mc_workq_function(struct work_struct *work_req)
 		return;
 	}
 
-	if (edac_mc_assert_error_check_and_clear())
+	if (edac_op_state == EDAC_OPSTATE_POLL)
 		mci->edac_check(mci);
 
 	mutex_unlock(&mem_ctls_mutex);
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
index cfb250fa38ce..f02d21d8130f 100644
--- a/drivers/edac/edac_stub.c
+++ b/drivers/edac/edac_stub.c
@@ -20,9 +20,6 @@
 int edac_op_state = EDAC_OPSTATE_INVAL;
 EXPORT_SYMBOL_GPL(edac_op_state);
 
-int edac_err_assert = 0;
-EXPORT_SYMBOL_GPL(edac_err_assert);
-
 int edac_report_status = EDAC_REPORTING_ENABLED;
 EXPORT_SYMBOL_GPL(edac_report_status);
 
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 9fd6fe53ab2a..c55e93975079 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -28,7 +28,6 @@ struct device;
 #define EDAC_OPSTATE_INT	2
 
 extern int edac_op_state;
-extern int edac_err_assert;
 
 extern struct bus_type *edac_get_sysfs_subsys(void);
 
-- 
cgit v1.3-6-gb490


From fee27d7d97886515a60cce38b4152b7f5b5a21fc Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 4 Feb 2017 17:42:03 +0100
Subject: EDAC: Delete edac_stub.c

Move the remaining functionality to edac_mc.c. Convert "edac_report=" to
a module parameter.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 drivers/edac/Makefile    |  2 +-
 drivers/edac/edac_mc.c   | 61 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/edac/edac_stub.c | 37 -----------------------------
 include/linux/edac.h     | 26 +++------------------
 4 files changed, 65 insertions(+), 61 deletions(-)
 delete mode 100644 drivers/edac/edac_stub.c

(limited to 'include/linux')

diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index a8fb734cb28d..0fd9ffa63299 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -6,7 +6,7 @@
 # GNU General Public License.
 #
 
-obj-$(CONFIG_EDAC)			:= edac_stub.o edac_core.o
+obj-$(CONFIG_EDAC)			:= edac_core.o
 
 edac_core-y	:= edac_mc.o edac_device.o edac_mc_sysfs.o
 edac_core-y	+= edac_module.o edac_device_sysfs.o wq.o
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 735546ea6ebe..536b65aa6fac 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -43,6 +43,8 @@
 int edac_op_state = EDAC_OPSTATE_INVAL;
 EXPORT_SYMBOL_GPL(edac_op_state);
 
+static int edac_report = EDAC_REPORTING_ENABLED;
+
 /* lock to memory controller's control array */
 static DEFINE_MUTEX(mem_ctls_mutex);
 static LIST_HEAD(mc_devices);
@@ -55,6 +57,65 @@ static void const *edac_mc_owner;
 
 static struct bus_type mc_bus[EDAC_MAX_MCS];
 
+int get_edac_report_status(void)
+{
+	return edac_report;
+}
+EXPORT_SYMBOL_GPL(get_edac_report_status);
+
+void set_edac_report_status(int new)
+{
+	if (new == EDAC_REPORTING_ENABLED ||
+	    new == EDAC_REPORTING_DISABLED ||
+	    new == EDAC_REPORTING_FORCE)
+		edac_report = new;
+}
+EXPORT_SYMBOL_GPL(set_edac_report_status);
+
+static int edac_report_set(const char *str, const struct kernel_param *kp)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!strncmp(str, "on", 2))
+		edac_report = EDAC_REPORTING_ENABLED;
+	else if (!strncmp(str, "off", 3))
+		edac_report = EDAC_REPORTING_DISABLED;
+	else if (!strncmp(str, "force", 5))
+		edac_report = EDAC_REPORTING_FORCE;
+
+	return 0;
+}
+
+static int edac_report_get(char *buffer, const struct kernel_param *kp)
+{
+	int ret = 0;
+
+	switch (edac_report) {
+	case EDAC_REPORTING_ENABLED:
+		ret = sprintf(buffer, "on");
+		break;
+	case EDAC_REPORTING_DISABLED:
+		ret = sprintf(buffer, "off");
+		break;
+	case EDAC_REPORTING_FORCE:
+		ret = sprintf(buffer, "force");
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static const struct kernel_param_ops edac_report_ops = {
+	.set = edac_report_set,
+	.get = edac_report_get,
+};
+
+module_param_cb(edac_report, &edac_report_ops, &edac_report, 0644);
+
 unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
 			         unsigned len)
 {
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
deleted file mode 100644
index 6aacc569401e..000000000000
--- a/drivers/edac/edac_stub.c
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * common EDAC components that must be in kernel
- *
- * Author: Dave Jiang <djiang@mvista.com>
- *
- * 2007 (c) MontaVista Software, Inc.
- * 2010 (c) Advanced Micro Devices Inc.
- *	    Borislav Petkov <bp@alien8.de>
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2. This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- *
- */
-#include <linux/module.h>
-#include <linux/edac.h>
-#include <linux/atomic.h>
-#include <linux/device.h>
-
-int edac_report_status = EDAC_REPORTING_ENABLED;
-EXPORT_SYMBOL_GPL(edac_report_status);
-
-static int __init __maybe_unused edac_report_setup(char *str)
-{
-	if (!str)
-		return -EINVAL;
-
-	if (!strncmp(str, "on", 2))
-		set_edac_report_status(EDAC_REPORTING_ENABLED);
-	else if (!strncmp(str, "off", 3))
-		set_edac_report_status(EDAC_REPORTING_DISABLED);
-	else if (!strncmp(str, "force", 5))
-		set_edac_report_status(EDAC_REPORTING_FORCE);
-
-	return 0;
-}
-__setup("edac_report=", edac_report_setup);
diff --git a/include/linux/edac.h b/include/linux/edac.h
index c55e93975079..faf87e1eca21 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -29,7 +29,9 @@ struct device;
 
 extern int edac_op_state;
 
-extern struct bus_type *edac_get_sysfs_subsys(void);
+struct bus_type *edac_get_sysfs_subsys(void);
+int get_edac_report_status(void);
+void set_edac_report_status(int new);
 
 enum {
 	EDAC_REPORTING_ENABLED,
@@ -37,28 +39,6 @@ enum {
 	EDAC_REPORTING_FORCE
 };
 
-extern int edac_report_status;
-#ifdef CONFIG_EDAC
-static inline int get_edac_report_status(void)
-{
-	return edac_report_status;
-}
-
-static inline void set_edac_report_status(int new)
-{
-	edac_report_status = new;
-}
-#else
-static inline int get_edac_report_status(void)
-{
-	return EDAC_REPORTING_DISABLED;
-}
-
-static inline void set_edac_report_status(int new)
-{
-}
-#endif
-
 static inline void opstate_init(void)
 {
 	switch (edac_op_state) {
-- 
cgit v1.3-6-gb490


From bffc7dece92edd0b6445b76a378e2fa9e324c7ed Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sat, 4 Feb 2017 18:10:14 +0100
Subject: EDAC: Rename report status accessors

Change them to have the edac_ prefix.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 drivers/acpi/acpi_extlog.c | 8 ++++----
 drivers/edac/edac_mc.c     | 8 ++++----
 drivers/edac/pnd2_edac.c   | 2 +-
 drivers/edac/sb_edac.c     | 4 ++--
 drivers/edac/skx_edac.c    | 2 +-
 include/linux/edac.h       | 4 ++--
 6 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index a15270a806fc..502ea4dc2080 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -229,7 +229,7 @@ static int __init extlog_init(void)
 	if (!(cap & MCG_ELOG_P) || !extlog_get_l1addr())
 		return -ENODEV;
 
-	if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
+	if (edac_get_report_status() == EDAC_REPORTING_FORCE) {
 		pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
 		return -EPERM;
 	}
@@ -285,8 +285,8 @@ static int __init extlog_init(void)
 	 * eMCA event report method has higher priority than EDAC method,
 	 * unless EDAC event report method is mandatory.
 	 */
-	old_edac_report_status = get_edac_report_status();
-	set_edac_report_status(EDAC_REPORTING_DISABLED);
+	old_edac_report_status = edac_get_report_status();
+	edac_set_report_status(EDAC_REPORTING_DISABLED);
 	mce_register_decode_chain(&extlog_mce_dec);
 	/* enable OS to be involved to take over management from BIOS */
 	((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
@@ -308,7 +308,7 @@ err:
 
 static void __exit extlog_exit(void)
 {
-	set_edac_report_status(old_edac_report_status);
+	edac_set_report_status(old_edac_report_status);
 	mce_unregister_decode_chain(&extlog_mce_dec);
 	((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
 	if (extlog_l1_addr)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 536b65aa6fac..480072139b7a 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -57,20 +57,20 @@ static void const *edac_mc_owner;
 
 static struct bus_type mc_bus[EDAC_MAX_MCS];
 
-int get_edac_report_status(void)
+int edac_get_report_status(void)
 {
 	return edac_report;
 }
-EXPORT_SYMBOL_GPL(get_edac_report_status);
+EXPORT_SYMBOL_GPL(edac_get_report_status);
 
-void set_edac_report_status(int new)
+void edac_set_report_status(int new)
 {
 	if (new == EDAC_REPORTING_ENABLED ||
 	    new == EDAC_REPORTING_DISABLED ||
 	    new == EDAC_REPORTING_FORCE)
 		edac_report = new;
 }
-EXPORT_SYMBOL_GPL(set_edac_report_status);
+EXPORT_SYMBOL_GPL(edac_set_report_status);
 
 static int edac_report_set(const char *str, const struct kernel_param *kp)
 {
diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c
index 928e0dba41fc..1cad5a9af8d0 100644
--- a/drivers/edac/pnd2_edac.c
+++ b/drivers/edac/pnd2_edac.c
@@ -1349,7 +1349,7 @@ static int pnd2_mce_check_error(struct notifier_block *nb, unsigned long val, vo
 	struct dram_addr daddr;
 	char *type;
 
-	if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+	if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
 		return NOTIFY_DONE;
 
 	mci = pnd2_mci;
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index a65ea44e3b0b..ea21cb651b3c 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -3075,7 +3075,7 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
 	struct sbridge_pvt *pvt;
 	char *type;
 
-	if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+	if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
 		return NOTIFY_DONE;
 
 	mci = get_mci_for_node_id(mce->socketid);
@@ -3441,7 +3441,7 @@ static int __init sbridge_init(void)
 
 	if (rc >= 0) {
 		mce_register_decode_chain(&sbridge_mce_dec);
-		if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+		if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
 			sbridge_printk(KERN_WARNING, "Loading driver, error reporting disabled.\n");
 		return 0;
 	}
diff --git a/drivers/edac/skx_edac.c b/drivers/edac/skx_edac.c
index 1159dba4671f..64bef6c9cfb4 100644
--- a/drivers/edac/skx_edac.c
+++ b/drivers/edac/skx_edac.c
@@ -971,7 +971,7 @@ static int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 	struct mem_ctl_info *mci;
 	char *type;
 
-	if (get_edac_report_status() == EDAC_REPORTING_DISABLED)
+	if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
 		return NOTIFY_DONE;
 
 	/* ignore unless this is memory related with an address */
diff --git a/include/linux/edac.h b/include/linux/edac.h
index faf87e1eca21..8ae0f45fafd6 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -30,8 +30,8 @@ struct device;
 extern int edac_op_state;
 
 struct bus_type *edac_get_sysfs_subsys(void);
-int get_edac_report_status(void);
-void set_edac_report_status(int new);
+int edac_get_report_status(void);
+void edac_set_report_status(int new);
 
 enum {
 	EDAC_REPORTING_ENABLED,
-- 
cgit v1.3-6-gb490


From 30e03acda5fd9c77ec9bf8b3c5def9540c6b0486 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Sun, 9 Apr 2017 07:36:14 +0600
Subject: cpuset: Remove cpuset_update_active_cpus()'s parameter.

In cpuset_update_active_cpus(), cpu_online isn't used anymore. Remove
it.

Signed-off-by: Rakib Mullick<rakib.mullick@gmail.com>
Acked-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h | 4 ++--
 kernel/cgroup/cpuset.c | 2 +-
 kernel/sched/core.c    | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 611fce58d670..119a3f9604b0 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -42,7 +42,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(bool cpu_online);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -155,7 +155,7 @@ static inline bool cpusets_enabled(void) { return false; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_update_active_cpus(bool cpu_online)
+static inline void cpuset_update_active_cpus(void)
 {
 	partition_sched_domains(1, NULL, NULL);
 }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 8b84db2d2d17..f6501f4f6040 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2351,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		rebuild_sched_domains();
 }
 
-void cpuset_update_active_cpus(bool cpu_online)
+void cpuset_update_active_cpus(void)
 {
 	/*
 	 * We're inside cpu hotplug critical region which usually nests
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 956383844116..e62802c044e1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5727,7 +5727,7 @@ static void cpuset_cpu_active(void)
 		 * cpuset configurations.
 		 */
 	}
-	cpuset_update_active_cpus(true);
+	cpuset_update_active_cpus();
 }
 
 static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5750,7 +5750,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 
 		if (overflow)
 			return -EBUSY;
-		cpuset_update_active_cpus(false);
+		cpuset_update_active_cpus();
 	} else {
 		num_cpus_frozen++;
 		partition_sched_domains(1, NULL, NULL);
-- 
cgit v1.3-6-gb490


From b8b1a2e5eca6bbf20e3a29c5f9db8331ec52af2d Mon Sep 17 00:00:00 2001
From: Todd Poynor <toddpoynor@google.com>
Date: Thu, 6 Apr 2017 18:47:57 -0700
Subject: cgroup: move cgroup_subsys_state parent field for cache locality

Various structures embed a struct cgroup_subsys_state, typically at
the top of the containing structure.  It is common for code that
accesses the structures to perform operations that iterate over the
chain of parent css pointers, also accessing data in each containing
structure.  In particular, struct cpuacct is used by fairly hot code
paths in the scheduler such as cpuacct_charge().

Move the parent css pointer field to the end of the structure to
increase the chances of residing in the same cache line as the data
from the containing structure.

Signed-off-by: Todd Poynor <toddpoynor@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c74b78ecd583..21745946cae1 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -107,9 +107,6 @@ struct cgroup_subsys_state {
 	/* reference count - access via css_[try]get() and css_put() */
 	struct percpu_ref refcnt;
 
-	/* PI: the parent css */
-	struct cgroup_subsys_state *parent;
-
 	/* siblings list anchored at the parent's ->children */
 	struct list_head sibling;
 	struct list_head children;
@@ -139,6 +136,12 @@ struct cgroup_subsys_state {
 	/* percpu_ref killing and RCU release */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
+
+	/*
+	 * PI: the parent css.	Placed here for cache proximity to following
+	 * fields of the containing structure.
+	 */
+	struct cgroup_subsys_state *parent;
 };
 
 /*
-- 
cgit v1.3-6-gb490


From 717a94b5fc7092afebe9c93791f29b2d8e5d297a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 7 Apr 2017 10:03:26 +1000
Subject: sched/core: Remove 'task' parameter and rename tsk_restore_flags() to
 current_restore_flags()

It is not safe for one thread to modify the ->flags
of another thread as there is no locking that can protect
the update.

So tsk_restore_flags(), which takes a task pointer and modifies
the flags, is an invitation to do the wrong thing.

All current users pass "current" as the task, so no developers have
accepted that invitation.  It would be best to ensure it remains
that way.

So rename tsk_restore_flags() to current_restore_flags() and don't
pass in a task_struct pointer.  Always operate on current->flags.

Signed-off-by: NeilBrown <neilb@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/block/nbd.c      | 2 +-
 drivers/scsi/iscsi_tcp.c | 2 +-
 fs/nfsd/vfs.c            | 2 +-
 include/linux/sched.h    | 6 +++---
 kernel/softirq.c         | 2 +-
 net/core/dev.c           | 2 +-
 net/core/sock.c          | 2 +-
 7 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d8a23561b4cb..3c9052bf2327 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -244,7 +244,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
 			*sent += result;
 	} while (msg_data_left(&msg));
 
-	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	current_restore_flags(pflags, PF_MEMALLOC);
 
 	return result;
 }
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 4228aba1f654..bbea8eac9abb 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
 		rc = 0;
 	}
 
-	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	current_restore_flags(pflags, PF_MEMALLOC);
 	return rc;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 19d50f600e8d..9aaf6ca77569 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1004,7 +1004,7 @@ out_nfserr:
 	else
 		err = nfserrno(host_err);
 	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
-		tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+		current_restore_flags(pflags, PF_LESS_THROTTLE);
 	return err;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee84fd43..0978fb74e45a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1286,10 +1286,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
 TASK_PFA_SET(LMK_WAITING, lmk_waiting)
 
 static inline void
-tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags)
+current_restore_flags(unsigned long orig_flags, unsigned long flags)
 {
-	task->flags &= ~flags;
-	task->flags |= orig_flags & flags;
+	current->flags &= ~flags;
+	current->flags |= orig_flags & flags;
 }
 
 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 744fa611cae0..4e09821f9d9e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -309,7 +309,7 @@ restart:
 	account_irq_exit_time(current);
 	__local_bh_enable(SOFTIRQ_OFFSET);
 	WARN_ON_ONCE(in_interrupt());
-	tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+	current_restore_flags(old_flags, PF_MEMALLOC);
 }
 
 asmlinkage __visible void do_softirq(void)
diff --git a/net/core/dev.c b/net/core/dev.c
index 7869ae3837ca..e8a366387a99 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4240,7 +4240,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
 		 */
 		current->flags |= PF_MEMALLOC;
 		ret = __netif_receive_skb_core(skb, true);
-		tsk_restore_flags(current, pflags, PF_MEMALLOC);
+		current_restore_flags(pflags, PF_MEMALLOC);
 	} else
 		ret = __netif_receive_skb_core(skb, false);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 2c4f574168fb..b416a537bd0a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 
 	current->flags |= PF_MEMALLOC;
 	ret = sk->sk_backlog_rcv(sk, skb);
-	tsk_restore_flags(current, pflags, PF_MEMALLOC);
+	current_restore_flags(pflags, PF_MEMALLOC);
 
 	return ret;
 }
-- 
cgit v1.3-6-gb490


From 3ea6b7001ef5da9f9816ee3c4fe731f4fe08b865 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Thu, 6 Apr 2017 13:19:35 +0900
Subject: PM / devfreq: Move struct devfreq_governor to devfreq directory

This patch moves the struct devfreq_governor from header file
to the devfreq directory because this structure is private data
and it have to be only accessed by the devfreq core.

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/governor.h | 29 +++++++++++++++++++++++++++++
 include/linux/devfreq.h    | 30 +-----------------------------
 2 files changed, 30 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/devfreq/governor.h b/drivers/devfreq/governor.h
index 71576b8bdfef..a4f2fa1091e4 100644
--- a/drivers/devfreq/governor.h
+++ b/drivers/devfreq/governor.h
@@ -25,6 +25,35 @@
 #define DEVFREQ_GOV_SUSPEND			0x4
 #define DEVFREQ_GOV_RESUME			0x5
 
+/**
+ * struct devfreq_governor - Devfreq policy governor
+ * @node:		list node - contains registered devfreq governors
+ * @name:		Governor's name
+ * @immutable:		Immutable flag for governor. If the value is 1,
+ *			this govenror is never changeable to other governor.
+ * @get_target_freq:	Returns desired operating frequency for the device.
+ *			Basically, get_target_freq will run
+ *			devfreq_dev_profile.get_dev_status() to get the
+ *			status of the device (load = busy_time / total_time).
+ *			If no_central_polling is set, this callback is called
+ *			only with update_devfreq() notified by OPP.
+ * @event_handler:      Callback for devfreq core framework to notify events
+ *                      to governors. Events include per device governor
+ *                      init and exit, opp changes out of devfreq, suspend
+ *                      and resume of per device devfreq during device idle.
+ *
+ * Note that the callbacks are called with devfreq->lock locked by devfreq.
+ */
+struct devfreq_governor {
+	struct list_head node;
+
+	const char name[DEVFREQ_NAME_LEN];
+	const unsigned int immutable;
+	int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
+	int (*event_handler)(struct devfreq *devfreq,
+				unsigned int event, void *data);
+};
+
 /* Caution: devfreq->lock must be locked before calling update_devfreq */
 extern int update_devfreq(struct devfreq *devfreq);
 
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index e0acb0e5243b..6c220e4ebb6b 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -27,6 +27,7 @@
 #define DEVFREQ_POSTCHANGE		(1)
 
 struct devfreq;
+struct devfreq_governor;
 
 /**
  * struct devfreq_dev_status - Data given from devfreq user device to
@@ -100,35 +101,6 @@ struct devfreq_dev_profile {
 	unsigned int max_state;
 };
 
-/**
- * struct devfreq_governor - Devfreq policy governor
- * @node:		list node - contains registered devfreq governors
- * @name:		Governor's name
- * @immutable:		Immutable flag for governor. If the value is 1,
- *			this govenror is never changeable to other governor.
- * @get_target_freq:	Returns desired operating frequency for the device.
- *			Basically, get_target_freq will run
- *			devfreq_dev_profile.get_dev_status() to get the
- *			status of the device (load = busy_time / total_time).
- *			If no_central_polling is set, this callback is called
- *			only with update_devfreq() notified by OPP.
- * @event_handler:      Callback for devfreq core framework to notify events
- *                      to governors. Events include per device governor
- *                      init and exit, opp changes out of devfreq, suspend
- *                      and resume of per device devfreq during device idle.
- *
- * Note that the callbacks are called with devfreq->lock locked by devfreq.
- */
-struct devfreq_governor {
-	struct list_head node;
-
-	const char name[DEVFREQ_NAME_LEN];
-	const unsigned int immutable;
-	int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
-	int (*event_handler)(struct devfreq *devfreq,
-				unsigned int event, void *data);
-};
-
 /**
  * struct devfreq - Device devfreq structure
  * @node:	list node - contains the devices with devfreq that have been
-- 
cgit v1.3-6-gb490


From 01c0e0a28da749e80cb7d549f75a5f52e2f40d0e Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 22 Mar 2017 15:55:30 +0100
Subject: power: supply: bq24190_charger: Use i2c-core irq-mapping code

The i2c-core already maps of irqs before calling the driver's probe
function and there are no in tree users of
bq24190_platform_data->gpio_int.

Remove the redundant custom irq-mapping code and just use client->irq.

Cc: Liam Breck <kernel@networkimprov.net>
Cc: Tony Lindgren <tony@atomide.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 drivers/power/supply/bq24190_charger.c | 66 ++--------------------------------
 include/linux/power/bq24190_charger.h  | 16 ---------
 2 files changed, 3 insertions(+), 79 deletions(-)
 delete mode 100644 include/linux/power/bq24190_charger.h

(limited to 'include/linux')

diff --git a/drivers/power/supply/bq24190_charger.c b/drivers/power/supply/bq24190_charger.c
index 451f2bc05ea5..fa2d2da2de5f 100644
--- a/drivers/power/supply/bq24190_charger.c
+++ b/drivers/power/supply/bq24190_charger.c
@@ -18,9 +18,6 @@
 #include <linux/gpio.h>
 #include <linux/i2c.h>
 
-#include <linux/power/bq24190_charger.h>
-
-
 #define	BQ24190_MANUFACTURER	"Texas Instruments"
 
 #define BQ24190_REG_ISC		0x00 /* Input Source Control */
@@ -153,8 +150,6 @@ struct bq24190_dev_info {
 	struct power_supply		*battery;
 	char				model_name[I2C_NAME_SIZE];
 	kernel_ulong_t			model;
-	unsigned int			gpio_int;
-	unsigned int			irq;
 	bool				initialized;
 	bool				irq_event;
 	struct mutex			f_reg_lock;
@@ -1310,56 +1305,11 @@ static int bq24190_hw_init(struct bq24190_dev_info *bdi)
 	return bq24190_read(bdi, BQ24190_REG_SS, &bdi->ss_reg);
 }
 
-#ifdef CONFIG_OF
-static int bq24190_setup_dt(struct bq24190_dev_info *bdi)
-{
-	bdi->irq = irq_of_parse_and_map(bdi->dev->of_node, 0);
-	if (bdi->irq <= 0)
-		return -1;
-
-	return 0;
-}
-#else
-static int bq24190_setup_dt(struct bq24190_dev_info *bdi)
-{
-	return -1;
-}
-#endif
-
-static int bq24190_setup_pdata(struct bq24190_dev_info *bdi,
-		struct bq24190_platform_data *pdata)
-{
-	int ret;
-
-	if (!gpio_is_valid(pdata->gpio_int))
-		return -1;
-
-	ret = gpio_request(pdata->gpio_int, dev_name(bdi->dev));
-	if (ret < 0)
-		return -1;
-
-	ret = gpio_direction_input(pdata->gpio_int);
-	if (ret < 0)
-		goto out;
-
-	bdi->irq = gpio_to_irq(pdata->gpio_int);
-	if (!bdi->irq)
-		goto out;
-
-	bdi->gpio_int = pdata->gpio_int;
-	return 0;
-
-out:
-	gpio_free(pdata->gpio_int);
-	return -1;
-}
-
 static int bq24190_probe(struct i2c_client *client,
 		const struct i2c_device_id *id)
 {
 	struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
 	struct device *dev = &client->dev;
-	struct bq24190_platform_data *pdata = client->dev.platform_data;
 	struct power_supply_config charger_cfg = {}, battery_cfg = {};
 	struct bq24190_dev_info *bdi;
 	int ret;
@@ -1385,12 +1335,7 @@ static int bq24190_probe(struct i2c_client *client,
 
 	i2c_set_clientdata(client, bdi);
 
-	if (dev->of_node)
-		ret = bq24190_setup_dt(bdi);
-	else
-		ret = bq24190_setup_pdata(bdi, pdata);
-
-	if (ret) {
+	if (!client->irq) {
 		dev_err(dev, "Can't get irq info\n");
 		return -EINVAL;
 	}
@@ -1436,7 +1381,7 @@ static int bq24190_probe(struct i2c_client *client,
 
 	bdi->initialized = true;
 
-	ret = devm_request_threaded_irq(dev, bdi->irq, NULL,
+	ret = devm_request_threaded_irq(dev, client->irq, NULL,
 			bq24190_irq_handler_thread,
 			IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
 			"bq24190-charger", bdi);
@@ -1445,7 +1390,7 @@ static int bq24190_probe(struct i2c_client *client,
 		goto out5;
 	}
 
-	enable_irq_wake(bdi->irq);
+	enable_irq_wake(client->irq);
 
 	pm_runtime_mark_last_busy(dev);
 	pm_runtime_put_autosuspend(dev);
@@ -1467,8 +1412,6 @@ out2:
 out1:
 	pm_runtime_dont_use_autosuspend(dev);
 	pm_runtime_disable(dev);
-	if (bdi->gpio_int)
-		gpio_free(bdi->gpio_int);
 	return ret;
 }
 
@@ -1492,9 +1435,6 @@ static int bq24190_remove(struct i2c_client *client)
 	pm_runtime_dont_use_autosuspend(bdi->dev);
 	pm_runtime_disable(bdi->dev);
 
-	if (bdi->gpio_int)
-		gpio_free(bdi->gpio_int);
-
 	return 0;
 }
 
diff --git a/include/linux/power/bq24190_charger.h b/include/linux/power/bq24190_charger.h
deleted file mode 100644
index 9f0283721cbc..000000000000
--- a/include/linux/power/bq24190_charger.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Platform data for the TI bq24190 battery charger driver.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef _BQ24190_CHARGER_H_
-#define _BQ24190_CHARGER_H_
-
-struct bq24190_platform_data {
-	unsigned int	gpio_int;	/* GPIO pin that's connected to INT# */
-};
-
-#endif
-- 
cgit v1.3-6-gb490


From f7e30f01a9e221067bb4b579e3cfc25cd2617467 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Wed, 12 Apr 2017 11:20:29 -0700
Subject: cpumask: Add helper cpumask_available()

With CONFIG_CPUMASK_OFFSTACK=y cpumask_var_t is a struct cpumask
pointer, otherwise a struct cpumask array with a single element.

Some code dealing with cpumasks needs to validate that a cpumask_var_t
is not a NULL pointer when CONFIG_CPUMASK_OFFSTACK=y. This is typically
done by performing the check always, regardless of the underlying type
of cpumask_var_t. This works in both cases, however clang raises a
warning like this when CONFIG_CPUMASK_OFFSTACK=n:

kernel/irq/manage.c:839:28: error: address of array
'desc->irq_common_data.affinity' will always evaluate to 'true'
[-Werror,-Wpointer-bool-conversion]

Add the inline helper cpumask_available() which only performs the
pointer check if CONFIG_CPUMASK_OFFSTACK=y.

Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Cc: Grant Grundler <grundler@chromium.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Greg Hackmann <ghackmann@google.com>
Cc: Michael Davidson <md@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/r/20170412182030.83657-1-mka@chromium.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpumask.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 96f1e88b767c..1a675604b17d 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -667,6 +667,11 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
 void free_cpumask_var(cpumask_var_t mask);
 void free_bootmem_cpumask_var(cpumask_var_t mask);
 
+static inline bool cpumask_available(cpumask_var_t mask)
+{
+	return mask != NULL;
+}
+
 #else
 typedef struct cpumask cpumask_var_t[1];
 
@@ -708,6 +713,11 @@ static inline void free_cpumask_var(cpumask_var_t mask)
 static inline void free_bootmem_cpumask_var(cpumask_var_t mask)
 {
 }
+
+static inline bool cpumask_available(cpumask_var_t mask)
+{
+	return true;
+}
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 
 /* It's common to want to use cpu_all_mask in struct member initializers,
-- 
cgit v1.3-6-gb490


From 2ac00f17b2e110c67ed2af3713bc04aec62e4608 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sun, 26 Mar 2017 12:04:12 -0700
Subject: time: Delete do_sys_setimeofday()

struct timespec is not y2038 safe on 32 bit machines and needs to be
replaced with struct timespec64.

do_sys_timeofday() is just a wrapper function.  Replace all calls to this
function with direct calls to do_sys_timeofday64() instead and delete
do_sys_timeofday().

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: y2038@lists.linaro.org
Cc: john.stultz@linaro.org
Cc: arnd@arndb.de
Cc: linux-alpha@vger.kernel.org
Link: http://lkml.kernel.org/r/1490555058-4603-2-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/alpha/kernel/osf_sys.c |  4 +++-
 include/linux/timekeeping.h | 15 ---------------
 kernel/compat.c             |  4 ++--
 kernel/time/posix-stubs.c   |  5 ++++-
 kernel/time/posix-timers.c  |  5 ++++-
 kernel/time/time.c          |  4 ++--
 6 files changed, 15 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 0b961093ca5c..9de47a9c6df2 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1016,6 +1016,7 @@ SYSCALL_DEFINE2(osf_gettimeofday, struct timeval32 __user *, tv,
 SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv,
 		struct timezone __user *, tz)
 {
+	struct timespec64 kts64;
 	struct timespec kts;
 	struct timezone ktz;
 
@@ -1023,13 +1024,14 @@ SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv,
 		if (get_tv32((struct timeval *)&kts, tv))
 			return -EFAULT;
 		kts.tv_nsec *= 1000;
+		kts64 = timespec_to_timespec64(kts);
 	}
 	if (tz) {
 		if (copy_from_user(&ktz, tz, sizeof(*tz)))
 			return -EFAULT;
 	}
 
-	return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
+	return do_sys_settimeofday64(tv ? &kts64 : NULL, tz ? &ktz : NULL);
 }
 
 asmlinkage long sys_ni_posix_timers(void);
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index b598cbc7b576..3617a78897bb 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -19,21 +19,6 @@ extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday64(const struct timespec64 *ts);
 extern int do_sys_settimeofday64(const struct timespec64 *tv,
 				 const struct timezone *tz);
-static inline int do_sys_settimeofday(const struct timespec *tv,
-				      const struct timezone *tz)
-{
-	struct timespec64 ts64;
-
-	if (!tv)
-		return do_sys_settimeofday64(NULL, tz);
-
-	if (!timespec_valid(tv))
-		return -EINVAL;
-
-	ts64 = timespec_to_timespec64(*tv);
-	return do_sys_settimeofday64(&ts64, tz);
-}
-
 /*
  * Kernel time accessors
  */
diff --git a/kernel/compat.c b/kernel/compat.c
index 19aec5d98108..1eb9e8aad869 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv,
 COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
 		       struct timezone __user *, tz)
 {
+	struct timespec64 new_ts;
 	struct timeval user_tv;
-	struct timespec	new_ts;
 	struct timezone new_tz;
 
 	if (tv) {
@@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv,
 			return -EFAULT;
 	}
 
-	return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+	return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
 static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index cd6716e115e8..95a1b1fc3968 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -49,13 +49,16 @@ SYS_NI(alarm);
 SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		const struct timespec __user *, tp)
 {
+	struct timespec64 new_tp64;
 	struct timespec new_tp;
 
 	if (which_clock != CLOCK_REALTIME)
 		return -EINVAL;
 	if (copy_from_user(&new_tp, tp, sizeof (*tp)))
 		return -EFAULT;
-	return do_sys_settimeofday(&new_tp, NULL);
+
+	new_tp64 = timespec_to_timespec64(new_tp);
+	return do_sys_settimeofday64(&new_tp64, NULL);
 }
 
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 50a6a47020de..f215ef792772 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -214,7 +214,10 @@ static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
 static int posix_clock_realtime_set(const clockid_t which_clock,
 				    const struct timespec *tp)
 {
-	return do_sys_settimeofday(tp, NULL);
+	struct timespec64 tp64;
+
+	tp64 = timespec_to_timespec64(*tp);
+	return do_sys_settimeofday64(&tp64, NULL);
 }
 
 static int posix_clock_realtime_adj(const clockid_t which_clock,
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 25bdd2504571..6574bba44b55 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
 SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
 		struct timezone __user *, tz)
 {
+	struct timespec64 new_ts;
 	struct timeval user_tv;
-	struct timespec	new_ts;
 	struct timezone new_tz;
 
 	if (tv) {
@@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv,
 			return -EFAULT;
 	}
 
-	return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
+	return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
 }
 
 SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
-- 
cgit v1.3-6-gb490


From d340266e19ddb70dbd608f9deedcfb35fdb9d419 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sun, 26 Mar 2017 12:04:13 -0700
Subject: time: Change posix clocks ops interfaces to use timespec64

struct timespec is not y2038 safe on 32 bit machines.

The posix clocks apis use struct timespec directly and through struct
itimerspec.

Replace the posix clock interfaces to use struct timespec64 and struct
itimerspec64 instead.  Also fix up their implementations accordingly.

Note that the clock_getres() interface has also been changed to use
timespec64 even though this particular interface is not affected by the
y2038 problem. This helps verification for internal kernel code for y2038
readiness by getting rid of time_t/ timeval/ timespec.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: arnd@arndb.de
Cc: y2038@lists.linaro.org
Cc: netdev@vger.kernel.org
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: john.stultz@linaro.org
Link: http://lkml.kernel.org/r/1490555058-4603-3-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/ptp/ptp_clock.c     | 18 +++++++-----------
 include/linux/posix-clock.h | 10 +++++-----
 kernel/time/posix-clock.c   | 34 ++++++++++++++++++++++++----------
 3 files changed, 36 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c
index e8142803a1a7..b77435783ef3 100644
--- a/drivers/ptp/ptp_clock.c
+++ b/drivers/ptp/ptp_clock.c
@@ -97,30 +97,26 @@ static s32 scaled_ppm_to_ppb(long ppm)
 
 /* posix clock implementation */
 
-static int ptp_clock_getres(struct posix_clock *pc, struct timespec *tp)
+static int ptp_clock_getres(struct posix_clock *pc, struct timespec64 *tp)
 {
 	tp->tv_sec = 0;
 	tp->tv_nsec = 1;
 	return 0;
 }
 
-static int ptp_clock_settime(struct posix_clock *pc, const struct timespec *tp)
+static int ptp_clock_settime(struct posix_clock *pc, const struct timespec64 *tp)
 {
 	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
-	struct timespec64 ts = timespec_to_timespec64(*tp);
 
-	return  ptp->info->settime64(ptp->info, &ts);
+	return  ptp->info->settime64(ptp->info, tp);
 }
 
-static int ptp_clock_gettime(struct posix_clock *pc, struct timespec *tp)
+static int ptp_clock_gettime(struct posix_clock *pc, struct timespec64 *tp)
 {
 	struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock);
-	struct timespec64 ts;
 	int err;
 
-	err = ptp->info->gettime64(ptp->info, &ts);
-	if (!err)
-		*tp = timespec64_to_timespec(ts);
+	err = ptp->info->gettime64(ptp->info, tp);
 	return err;
 }
 
@@ -133,7 +129,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct timex *tx)
 	ops = ptp->info;
 
 	if (tx->modes & ADJ_SETOFFSET) {
-		struct timespec ts;
+		struct timespec64 ts;
 		ktime_t kt;
 		s64 delta;
 
@@ -146,7 +142,7 @@ static int ptp_clock_adjtime(struct posix_clock *pc, struct timex *tx)
 		if ((unsigned long) ts.tv_nsec >= NSEC_PER_SEC)
 			return -EINVAL;
 
-		kt = timespec_to_ktime(ts);
+		kt = timespec64_to_ktime(ts);
 		delta = ktime_to_ns(kt);
 		err = ops->adjtime(ops, delta);
 	} else if (tx->modes & ADJ_FREQUENCY) {
diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h
index 34c4498b800f..83b22ae9ae12 100644
--- a/include/linux/posix-clock.h
+++ b/include/linux/posix-clock.h
@@ -59,23 +59,23 @@ struct posix_clock_operations {
 
 	int  (*clock_adjtime)(struct posix_clock *pc, struct timex *tx);
 
-	int  (*clock_gettime)(struct posix_clock *pc, struct timespec *ts);
+	int  (*clock_gettime)(struct posix_clock *pc, struct timespec64 *ts);
 
-	int  (*clock_getres) (struct posix_clock *pc, struct timespec *ts);
+	int  (*clock_getres) (struct posix_clock *pc, struct timespec64 *ts);
 
 	int  (*clock_settime)(struct posix_clock *pc,
-			      const struct timespec *ts);
+			      const struct timespec64 *ts);
 
 	int  (*timer_create) (struct posix_clock *pc, struct k_itimer *kit);
 
 	int  (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit);
 
 	void (*timer_gettime)(struct posix_clock *pc,
-			      struct k_itimer *kit, struct itimerspec *tsp);
+			      struct k_itimer *kit, struct itimerspec64 *tsp);
 
 	int  (*timer_settime)(struct posix_clock *pc,
 			      struct k_itimer *kit, int flags,
-			      struct itimerspec *tsp, struct itimerspec *old);
+			      struct itimerspec64 *tsp, struct itimerspec64 *old);
 	/*
 	 * Optional character device methods:
 	 */
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 9cff0ab82b63..e24008c098c6 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -300,14 +300,17 @@ out:
 static int pc_clock_gettime(clockid_t id, struct timespec *ts)
 {
 	struct posix_clock_desc cd;
+	struct timespec64 ts64;
 	int err;
 
 	err = get_clock_desc(id, &cd);
 	if (err)
 		return err;
 
-	if (cd.clk->ops.clock_gettime)
-		err = cd.clk->ops.clock_gettime(cd.clk, ts);
+	if (cd.clk->ops.clock_gettime) {
+		err = cd.clk->ops.clock_gettime(cd.clk, &ts64);
+		*ts = timespec64_to_timespec(ts64);
+	}
 	else
 		err = -EOPNOTSUPP;
 
@@ -319,14 +322,17 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts)
 static int pc_clock_getres(clockid_t id, struct timespec *ts)
 {
 	struct posix_clock_desc cd;
+	struct timespec64 ts64;
 	int err;
 
 	err = get_clock_desc(id, &cd);
 	if (err)
 		return err;
 
-	if (cd.clk->ops.clock_getres)
-		err = cd.clk->ops.clock_getres(cd.clk, ts);
+	if (cd.clk->ops.clock_getres) {
+		err = cd.clk->ops.clock_getres(cd.clk, &ts64);
+		*ts = timespec64_to_timespec(ts64);
+	}
 	else
 		err = -EOPNOTSUPP;
 
@@ -337,6 +343,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts)
 
 static int pc_clock_settime(clockid_t id, const struct timespec *ts)
 {
+	struct timespec64 ts64 = timespec_to_timespec64(*ts);
 	struct posix_clock_desc cd;
 	int err;
 
@@ -350,7 +357,7 @@ static int pc_clock_settime(clockid_t id, const struct timespec *ts)
 	}
 
 	if (cd.clk->ops.clock_settime)
-		err = cd.clk->ops.clock_settime(cd.clk, ts);
+		err = cd.clk->ops.clock_settime(cd.clk, &ts64);
 	else
 		err = -EOPNOTSUPP;
 out:
@@ -403,29 +410,36 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
 {
 	clockid_t id = kit->it_clock;
 	struct posix_clock_desc cd;
+	struct itimerspec64 ts64;
 
 	if (get_clock_desc(id, &cd))
 		return;
 
-	if (cd.clk->ops.timer_gettime)
-		cd.clk->ops.timer_gettime(cd.clk, kit, ts);
-
+	if (cd.clk->ops.timer_gettime) {
+		cd.clk->ops.timer_gettime(cd.clk, kit, &ts64);
+		*ts = itimerspec64_to_itimerspec(&ts64);
+	}
 	put_clock_desc(&cd);
 }
 
 static int pc_timer_settime(struct k_itimer *kit, int flags,
 			    struct itimerspec *ts, struct itimerspec *old)
 {
+	struct itimerspec64 ts64 = itimerspec_to_itimerspec64(ts);
 	clockid_t id = kit->it_clock;
 	struct posix_clock_desc cd;
+	struct itimerspec64 old64;
 	int err;
 
 	err = get_clock_desc(id, &cd);
 	if (err)
 		return err;
 
-	if (cd.clk->ops.timer_settime)
-		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
+	if (cd.clk->ops.timer_settime) {
+		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, &ts64, &old64);
+		if (old)
+			*old = itimerspec64_to_itimerspec(&old64);
+	}
 	else
 		err = -EOPNOTSUPP;
 
-- 
cgit v1.3-6-gb490


From 3c9c12f4b4610dba864038c7822b427816f5893c Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sun, 26 Mar 2017 12:04:14 -0700
Subject: time: Change k_clock clock_get() to use timespec64

struct timespec is not y2038 safe on 32 bit machines.  Replace uses of
struct timespec with struct timespec64 in the kernel.

The syscall interfaces themselves will be changed in a separate series.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: y2038@lists.linaro.org
Cc: john.stultz@linaro.org
Cc: arnd@arndb.de
Link: http://lkml.kernel.org/r/1490555058-4603-4-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/char/mmtimer.c         |  4 ++--
 include/linux/posix-timers.h   |  2 +-
 include/linux/timekeeping.h    |  5 +++++
 kernel/time/alarmtimer.c       |  4 ++--
 kernel/time/posix-clock.c      |  9 +++------
 kernel/time/posix-cpu-timers.c | 10 +++++-----
 kernel/time/posix-stubs.c      |  9 ++++++---
 kernel/time/posix-timers.c     | 32 +++++++++++++++++---------------
 8 files changed, 41 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index b708c85dc9c1..40d880b8c02f 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -478,13 +478,13 @@ static int sgi_clock_period;
 static struct timespec sgi_clock_offset;
 static int sgi_clock_period;
 
-static int sgi_clock_get(clockid_t clockid, struct timespec *tp)
+static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp)
 {
 	u64 nsec;
 
 	nsec = rtc_time() * sgi_clock_period
 			+ sgi_clock_offset.tv_nsec;
-	*tp = ns_to_timespec(nsec);
+	*tp = ns_to_timespec64(nsec);
 	tp->tv_sec += sgi_clock_offset.tv_sec;
 	return 0;
 };
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 64aa189efe21..0688f3975da7 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -90,7 +90,7 @@ struct k_clock {
 	int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
 	int (*clock_set) (const clockid_t which_clock,
 			  const struct timespec *tp);
-	int (*clock_get) (const clockid_t which_clock, struct timespec * tp);
+	int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp);
 	int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
 	int (*timer_create) (struct k_itimer *timer);
 	int (*nsleep) (const clockid_t which_clock, int flags,
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 3617a78897bb..ddc229ff6d1e 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -258,6 +258,11 @@ static inline void timekeeping_clocktai(struct timespec *ts)
 	*ts = ktime_to_timespec(ktime_get_clocktai());
 }
 
+static inline void timekeeping_clocktai64(struct timespec64 *ts)
+{
+	*ts = ktime_to_timespec64(ktime_get_clocktai());
+}
+
 /*
  * RTC specific
  */
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ce3a31e8eb36..944ca6e6f1c2 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
  *
  * Provides the underlying alarm base time.
  */
-static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
+static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp)
 {
 	struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
 
 	if (!alarmtimer_get_rtcdev())
 		return -EINVAL;
 
-	*tp = ktime_to_timespec(base->gettime());
+	*tp = ktime_to_timespec64(base->gettime());
 	return 0;
 }
 
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index e24008c098c6..fab6bd33155e 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -297,20 +297,17 @@ out:
 	return err;
 }
 
-static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+static int pc_clock_gettime(clockid_t id, struct timespec64 *ts)
 {
 	struct posix_clock_desc cd;
-	struct timespec64 ts64;
 	int err;
 
 	err = get_clock_desc(id, &cd);
 	if (err)
 		return err;
 
-	if (cd.clk->ops.clock_gettime) {
-		err = cd.clk->ops.clock_gettime(cd.clk, &ts64);
-		*ts = timespec64_to_timespec(ts64);
-	}
+	if (cd.clk->ops.clock_gettime)
+		err = cd.clk->ops.clock_gettime(cd.clk, ts);
 	else
 		err = -EOPNOTSUPP;
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 76bea3a47d4b..082231cc7953 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 
 static int posix_cpu_clock_get_task(struct task_struct *tsk,
 				    const clockid_t which_clock,
-				    struct timespec *tp)
+				    struct timespec64 *tp)
 {
 	int err = -EINVAL;
 	u64 rtn;
@@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
 	}
 
 	if (!err)
-		*tp = ns_to_timespec(rtn);
+		*tp = ns_to_timespec64(rtn);
 
 	return err;
 }
 
 
-static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp)
 {
 	const pid_t pid = CPUCLOCK_PID(which_clock);
 	int err = -EINVAL;
@@ -1374,7 +1374,7 @@ static int process_cpu_clock_getres(const clockid_t which_clock,
 	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
 }
 static int process_cpu_clock_get(const clockid_t which_clock,
-				 struct timespec *tp)
+				 struct timespec64 *tp)
 {
 	return posix_cpu_clock_get(PROCESS_CLOCK, tp);
 }
@@ -1399,7 +1399,7 @@ static int thread_cpu_clock_getres(const clockid_t which_clock,
 	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
 }
 static int thread_cpu_clock_get(const clockid_t which_clock,
-				struct timespec *tp)
+				struct timespec64 *tp)
 {
 	return posix_cpu_clock_get(THREAD_CLOCK, tp);
 }
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 95a1b1fc3968..0fbd0c5b95ca 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -64,14 +64,17 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 		struct timespec __user *,tp)
 {
+	struct timespec64 kernel_tp64;
 	struct timespec kernel_tp;
 
 	switch (which_clock) {
-	case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break;
-	case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break;
-	case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break;
+	case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
+	case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
+	case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
 	default: return -EINVAL;
 	}
+
+	kernel_tp = timespec64_to_timespec(kernel_tp64);
 	if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
 		return -EFAULT;
 	return 0;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index f215ef792772..68170642c77c 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -204,9 +204,9 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
 }
 
 /* Get clock_realtime */
-static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
+static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp)
 {
-	ktime_get_real_ts(tp);
+	ktime_get_real_ts64(tp);
 	return 0;
 }
 
@@ -229,32 +229,32 @@ static int posix_clock_realtime_adj(const clockid_t which_clock,
 /*
  * Get monotonic time for posix timers
  */
-static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
+static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp)
 {
-	ktime_get_ts(tp);
+	ktime_get_ts64(tp);
 	return 0;
 }
 
 /*
  * Get monotonic-raw time for posix timers
  */
-static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
+static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp)
 {
-	getrawmonotonic(tp);
+	getrawmonotonic64(tp);
 	return 0;
 }
 
 
-static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
+static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp)
 {
-	*tp = current_kernel_time();
+	*tp = current_kernel_time64();
 	return 0;
 }
 
 static int posix_get_monotonic_coarse(clockid_t which_clock,
-						struct timespec *tp)
+						struct timespec64 *tp)
 {
-	*tp = get_monotonic_coarse();
+	*tp = get_monotonic_coarse64();
 	return 0;
 }
 
@@ -264,15 +264,15 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
 	return 0;
 }
 
-static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp)
 {
-	get_monotonic_boottime(tp);
+	get_monotonic_boottime64(tp);
 	return 0;
 }
 
-static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
+static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
 {
-	timekeeping_clocktai(tp);
+	timekeeping_clocktai64(tp);
 	return 0;
 }
 
@@ -1032,13 +1032,15 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 		struct timespec __user *,tp)
 {
 	struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timespec64 kernel_tp64;
 	struct timespec kernel_tp;
 	int error;
 
 	if (!kc)
 		return -EINVAL;
 
-	error = kc->clock_get(which_clock, &kernel_tp);
+	error = kc->clock_get(which_clock, &kernel_tp64);
+	kernel_tp = timespec64_to_timespec(kernel_tp64);
 
 	if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
 		error = -EFAULT;
-- 
cgit v1.3-6-gb490


From d2e3e0ca5df7f4ffe90a04790b3be20485df056a Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sun, 26 Mar 2017 12:04:15 -0700
Subject: time: Change k_clock clock_getres() to use timespec64

struct timespec is not y2038 safe on 32 bit machines.  Replace uses of
struct timespec with struct timespec64 in the kernel. The syscall
interfaces themselves will be changed in a separate series.

The clock_getres() interface has also been changed to use timespec64 even
though this particular interface is not affected by the y2038 problem. This
helps verification for internal kernel code for y2038 readiness by getting
rid of time_t/ timeval/ timespec completely.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: y2038@lists.linaro.org
Cc: john.stultz@linaro.org
Cc: arnd@arndb.de
Link: http://lkml.kernel.org/r/1490555058-4603-5-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/char/mmtimer.c         |  2 +-
 include/linux/posix-timers.h   |  2 +-
 kernel/time/alarmtimer.c       |  2 +-
 kernel/time/posix-clock.c      |  9 +++------
 kernel/time/posix-cpu-timers.c |  6 +++---
 kernel/time/posix-timers.c     | 10 ++++++----
 6 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index 40d880b8c02f..79d8ada08a80 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -765,7 +765,7 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
 	return err;
 }
 
-static int sgi_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int sgi_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
 {
 	tp->tv_sec = 0;
 	tp->tv_nsec = sgi_clock_period;
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 0688f3975da7..dd05b49074f3 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -87,7 +87,7 @@ struct k_itimer {
 };
 
 struct k_clock {
-	int (*clock_getres) (const clockid_t which_clock, struct timespec *tp);
+	int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp);
 	int (*clock_set) (const clockid_t which_clock,
 			  const struct timespec *tp);
 	int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 944ca6e6f1c2..e8a45e2c3d95 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
  *
  * Returns the granularity of underlying alarm base clock
  */
-static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
 {
 	if (!alarmtimer_get_rtcdev())
 		return -EINVAL;
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index fab6bd33155e..af91031f64de 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -316,20 +316,17 @@ static int pc_clock_gettime(clockid_t id, struct timespec64 *ts)
 	return err;
 }
 
-static int pc_clock_getres(clockid_t id, struct timespec *ts)
+static int pc_clock_getres(clockid_t id, struct timespec64 *ts)
 {
 	struct posix_clock_desc cd;
-	struct timespec64 ts64;
 	int err;
 
 	err = get_clock_desc(id, &cd);
 	if (err)
 		return err;
 
-	if (cd.clk->ops.clock_getres) {
-		err = cd.clk->ops.clock_getres(cd.clk, &ts64);
-		*ts = timespec64_to_timespec(ts64);
-	}
+	if (cd.clk->ops.clock_getres)
+		err = cd.clk->ops.clock_getres(cd.clk, ts);
 	else
 		err = -EOPNOTSUPP;
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 082231cc7953..37ce9edfb968 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p)
 }
 
 static int
-posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
 {
 	int error = check_clock(which_clock);
 	if (!error) {
@@ -1369,7 +1369,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 #define THREAD_CLOCK	MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
 
 static int process_cpu_clock_getres(const clockid_t which_clock,
-				    struct timespec *tp)
+				    struct timespec64 *tp)
 {
 	return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
 }
@@ -1394,7 +1394,7 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block)
 	return -EINVAL;
 }
 static int thread_cpu_clock_getres(const clockid_t which_clock,
-				   struct timespec *tp)
+				   struct timespec64 *tp)
 {
 	return posix_cpu_clock_getres(THREAD_CLOCK, tp);
 }
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 68170642c77c..f67dae9f3bdf 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -258,9 +258,9 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
 	return 0;
 }
 
-static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
+static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp)
 {
-	*tp = ktime_to_timespec(KTIME_LOW_RES);
+	*tp = ktime_to_timespec64(KTIME_LOW_RES);
 	return 0;
 }
 
@@ -276,7 +276,7 @@ static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp)
 	return 0;
 }
 
-static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
 {
 	tp->tv_sec = 0;
 	tp->tv_nsec = hrtimer_resolution;
@@ -1075,13 +1075,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
 		struct timespec __user *, tp)
 {
 	struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timespec64 rtn_tp64;
 	struct timespec rtn_tp;
 	int error;
 
 	if (!kc)
 		return -EINVAL;
 
-	error = kc->clock_getres(which_clock, &rtn_tp);
+	error = kc->clock_getres(which_clock, &rtn_tp64);
+	rtn_tp = timespec64_to_timespec(rtn_tp64);
 
 	if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
 		error = -EFAULT;
-- 
cgit v1.3-6-gb490


From 0fe6afe3834ba13d75fa1168f0f66f08b427e1c0 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sun, 26 Mar 2017 12:04:16 -0700
Subject: time: Change k_clock clock_set() to use timespec64

struct timespec is not y2038 safe on 32 bit machines.  Replace uses of
struct timespec with struct timespec64 in the kernel.

The syscall interfaces themselves will be changed in a separate series.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: y2038@lists.linaro.org
Cc: john.stultz@linaro.org
Cc: arnd@arndb.de
Link: http://lkml.kernel.org/r/1490555058-4603-6-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/char/mmtimer.c         |  2 +-
 include/linux/posix-timers.h   |  2 +-
 kernel/time/posix-clock.c      |  5 ++---
 kernel/time/posix-cpu-timers.c |  2 +-
 kernel/time/posix-timers.c     | 11 +++++------
 5 files changed, 10 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index 79d8ada08a80..ba1b8925c4c8 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -489,7 +489,7 @@ static int sgi_clock_get(clockid_t clockid, struct timespec64 *tp)
 	return 0;
 };
 
-static int sgi_clock_set(const clockid_t clockid, const struct timespec *tp)
+static int sgi_clock_set(const clockid_t clockid, const struct timespec64 *tp)
 {
 
 	u64 nsec;
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index dd05b49074f3..7825e242b128 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -89,7 +89,7 @@ struct k_itimer {
 struct k_clock {
 	int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp);
 	int (*clock_set) (const clockid_t which_clock,
-			  const struct timespec *tp);
+			  const struct timespec64 *tp);
 	int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp);
 	int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
 	int (*timer_create) (struct k_itimer *timer);
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index af91031f64de..3807a34519c4 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -335,9 +335,8 @@ static int pc_clock_getres(clockid_t id, struct timespec64 *ts)
 	return err;
 }
 
-static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+static int pc_clock_settime(clockid_t id, const struct timespec64 *ts)
 {
-	struct timespec64 ts64 = timespec_to_timespec64(*ts);
 	struct posix_clock_desc cd;
 	int err;
 
@@ -351,7 +350,7 @@ static int pc_clock_settime(clockid_t id, const struct timespec *ts)
 	}
 
 	if (cd.clk->ops.clock_settime)
-		err = cd.clk->ops.clock_settime(cd.clk, &ts64);
+		err = cd.clk->ops.clock_settime(cd.clk, ts);
 	else
 		err = -EOPNOTSUPP;
 out:
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 37ce9edfb968..2cd4428c81f0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp)
 }
 
 static int
-posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp)
 {
 	/*
 	 * You can never reset a CPU clock, but we check for other errors
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index f67dae9f3bdf..7742da826f02 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -212,12 +212,9 @@ static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp
 
 /* Set clock_realtime */
 static int posix_clock_realtime_set(const clockid_t which_clock,
-				    const struct timespec *tp)
+				    const struct timespec64 *tp)
 {
-	struct timespec64 tp64;
-
-	tp64 = timespec_to_timespec64(*tp);
-	return do_sys_settimeofday64(&tp64, NULL);
+	return do_sys_settimeofday64(tp, NULL);
 }
 
 static int posix_clock_realtime_adj(const clockid_t which_clock,
@@ -1017,6 +1014,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 		const struct timespec __user *, tp)
 {
 	struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timespec64 new_tp64;
 	struct timespec new_tp;
 
 	if (!kc || !kc->clock_set)
@@ -1024,8 +1022,9 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
 
 	if (copy_from_user(&new_tp, tp, sizeof (*tp)))
 		return -EFAULT;
+	new_tp64 = timespec_to_timespec64(new_tp);
 
-	return kc->clock_set(which_clock, &new_tp);
+	return kc->clock_set(which_clock, &new_tp64);
 }
 
 SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
-- 
cgit v1.3-6-gb490


From 5f252b325625c13db1dbc76ac6cdb49ee3bd062e Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sun, 26 Mar 2017 12:04:17 -0700
Subject: time: Change k_clock timer_set() and timer_get() to use timespec64

struct timespec is not y2038 safe on 32 bit machines.  Replace uses of
struct timespec with struct timespec64 in the kernel.

struct itimerspec internally uses struct timespec.  Use struct itimerspec64
which uses struct timespec64.

The syscall interfaces themselves will be changed in a separate series.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: y2038@lists.linaro.org
Cc: john.stultz@linaro.org
Cc: arnd@arndb.de
Link: http://lkml.kernel.org/r/1490555058-4603-7-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/char/mmtimer.c         | 20 ++++++++++----------
 include/linux/posix-timers.h   | 12 ++++++------
 kernel/time/alarmtimer.c       | 14 +++++++-------
 kernel/time/posix-clock.c      | 21 +++++++--------------
 kernel/time/posix-cpu-timers.c | 28 ++++++++++++++++------------
 kernel/time/posix-timers.c     | 37 +++++++++++++++++++++----------------
 6 files changed, 67 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/mmtimer.c b/drivers/char/mmtimer.c
index ba1b8925c4c8..0e7fcb04f01e 100644
--- a/drivers/char/mmtimer.c
+++ b/drivers/char/mmtimer.c
@@ -657,7 +657,7 @@ static int sgi_timer_del(struct k_itimer *timr)
 }
 
 /* Assumption: it_lock is already held with irq's disabled */
-static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+static void sgi_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 {
 
 	if (timr->it.mmtimer.clock == TIMER_OFF) {
@@ -668,14 +668,14 @@ static void sgi_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 		return;
 	}
 
-	cur_setting->it_interval = ns_to_timespec(timr->it.mmtimer.incr * sgi_clock_period);
-	cur_setting->it_value = ns_to_timespec((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period);
+	cur_setting->it_interval = ns_to_timespec64(timr->it.mmtimer.incr * sgi_clock_period);
+	cur_setting->it_value = ns_to_timespec64((timr->it.mmtimer.expires - rtc_time()) * sgi_clock_period);
 }
 
 
 static int sgi_timer_set(struct k_itimer *timr, int flags,
-	struct itimerspec * new_setting,
-	struct itimerspec * old_setting)
+	struct itimerspec64 *new_setting,
+	struct itimerspec64 *old_setting)
 {
 	unsigned long when, period, irqflags;
 	int err = 0;
@@ -687,8 +687,8 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
 		sgi_timer_get(timr, old_setting);
 
 	sgi_timer_del(timr);
-	when = timespec_to_ns(&new_setting->it_value);
-	period = timespec_to_ns(&new_setting->it_interval);
+	when = timespec64_to_ns(&new_setting->it_value);
+	period = timespec64_to_ns(&new_setting->it_interval);
 
 	if (when == 0)
 		/* Clear timer */
@@ -699,11 +699,11 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
 		return -ENOMEM;
 
 	if (flags & TIMER_ABSTIME) {
-		struct timespec n;
+		struct timespec64 n;
 		unsigned long now;
 
-		getnstimeofday(&n);
-		now = timespec_to_ns(&n);
+		getnstimeofday64(&n);
+		now = timespec64_to_ns(&n);
 		if (when > now)
 			when -= now;
 		else
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 7825e242b128..ebc4c4945339 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -96,13 +96,13 @@ struct k_clock {
 	int (*nsleep) (const clockid_t which_clock, int flags,
 		       struct timespec *, struct timespec __user *);
 	long (*nsleep_restart) (struct restart_block *restart_block);
-	int (*timer_set) (struct k_itimer * timr, int flags,
-			  struct itimerspec * new_setting,
-			  struct itimerspec * old_setting);
-	int (*timer_del) (struct k_itimer * timr);
+	int (*timer_set) (struct k_itimer *timr, int flags,
+			  struct itimerspec64 *new_setting,
+			  struct itimerspec64 *old_setting);
+	int (*timer_del) (struct k_itimer *timr);
 #define TIMER_RETRY 1
-	void (*timer_get) (struct k_itimer * timr,
-			   struct itimerspec * cur_setting);
+	void (*timer_get) (struct k_itimer *timr,
+			   struct itimerspec64 *cur_setting);
 };
 
 extern struct k_clock clock_posix_cpu;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e8a45e2c3d95..e069f94999a8 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer)
  * Copies out the current itimerspec data
  */
 static void alarm_timer_get(struct k_itimer *timr,
-				struct itimerspec *cur_setting)
+			    struct itimerspec64 *cur_setting)
 {
 	ktime_t relative_expiry_time =
 		alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
 
 	if (ktime_to_ns(relative_expiry_time) > 0) {
-		cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
+		cur_setting->it_value = ktime_to_timespec64(relative_expiry_time);
 	} else {
 		cur_setting->it_value.tv_sec = 0;
 		cur_setting->it_value.tv_nsec = 0;
 	}
 
-	cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
+	cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval);
 }
 
 /**
@@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr)
  * Sets the timer to new_setting, and starts the timer.
  */
 static int alarm_timer_set(struct k_itimer *timr, int flags,
-				struct itimerspec *new_setting,
-				struct itimerspec *old_setting)
+			   struct itimerspec64 *new_setting,
+			   struct itimerspec64 *old_setting)
 {
 	ktime_t exp;
 
@@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
 		return TIMER_RETRY;
 
 	/* start the timer */
-	timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
-	exp = timespec_to_ktime(new_setting->it_value);
+	timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval);
+	exp = timespec64_to_ktime(new_setting->it_value);
 	/* Convert (if necessary) to absolute time */
 	if (flags != TIMER_ABSTIME) {
 		ktime_t now;
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 3807a34519c4..31d588d37a17 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -399,40 +399,33 @@ static int pc_timer_delete(struct k_itimer *kit)
 	return err;
 }
 
-static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts)
 {
 	clockid_t id = kit->it_clock;
 	struct posix_clock_desc cd;
-	struct itimerspec64 ts64;
 
 	if (get_clock_desc(id, &cd))
 		return;
 
-	if (cd.clk->ops.timer_gettime) {
-		cd.clk->ops.timer_gettime(cd.clk, kit, &ts64);
-		*ts = itimerspec64_to_itimerspec(&ts64);
-	}
+	if (cd.clk->ops.timer_gettime)
+		cd.clk->ops.timer_gettime(cd.clk, kit, ts);
+
 	put_clock_desc(&cd);
 }
 
 static int pc_timer_settime(struct k_itimer *kit, int flags,
-			    struct itimerspec *ts, struct itimerspec *old)
+			    struct itimerspec64 *ts, struct itimerspec64 *old)
 {
-	struct itimerspec64 ts64 = itimerspec_to_itimerspec64(ts);
 	clockid_t id = kit->it_clock;
 	struct posix_clock_desc cd;
-	struct itimerspec64 old64;
 	int err;
 
 	err = get_clock_desc(id, &cd);
 	if (err)
 		return err;
 
-	if (cd.clk->ops.timer_settime) {
-		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, &ts64, &old64);
-		if (old)
-			*old = itimerspec64_to_itimerspec(&old64);
-	}
+	if (cd.clk->ops.timer_settime)
+		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
 	else
 		err = -EOPNOTSUPP;
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2cd4428c81f0..441064d1216f 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
  * and try again.  (This happens when the timer is in the middle of firing.)
  */
 static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
-			       struct itimerspec *new, struct itimerspec *old)
+			       struct itimerspec64 *new, struct itimerspec64 *old)
 {
 	unsigned long flags;
 	struct sighand_struct *sighand;
@@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 
 	WARN_ON_ONCE(p == NULL);
 
-	new_expires = timespec_to_ns(&new->it_value);
+	new_expires = timespec64_to_ns(&new->it_value);
 
 	/*
 	 * Protect against sighand release/switch in exit/exec and p->cpu_timers
@@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 			bump_cpu_timer(timer, val);
 			if (val < timer->it.cpu.expires) {
 				old_expires = timer->it.cpu.expires - val;
-				old->it_value = ns_to_timespec(old_expires);
+				old->it_value = ns_to_timespec64(old_expires);
 			} else {
 				old->it_value.tv_nsec = 1;
 				old->it_value.tv_sec = 0;
@@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	 * Install the new reload setting, and
 	 * set up the signal and overrun bookkeeping.
 	 */
-	timer->it.cpu.incr = timespec_to_ns(&new->it_interval);
+	timer->it.cpu.incr = timespec64_to_ns(&new->it_interval);
 
 	/*
 	 * This acts as a modification timestamp for the timer,
@@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	ret = 0;
  out:
 	if (old)
-		old->it_interval = ns_to_timespec(old_incr);
+		old->it_interval = ns_to_timespec64(old_incr);
 
 	return ret;
 }
 
-static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp)
 {
 	u64 now;
 	struct task_struct *p = timer->it.cpu.task;
@@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 	/*
 	 * Easy part: convert the reload time.
 	 */
-	itp->it_interval = ns_to_timespec(timer->it.cpu.incr);
+	itp->it_interval = ns_to_timespec64(timer->it.cpu.incr);
 
 	if (timer->it.cpu.expires == 0) {	/* Timer not armed at all.  */
 		itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
@@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 			 * Call the timer disarmed, nothing else to do.
 			 */
 			timer->it.cpu.expires = 0;
-			itp->it_value = ns_to_timespec(timer->it.cpu.expires);
+			itp->it_value = ns_to_timespec64(timer->it.cpu.expires);
 			return;
 		} else {
 			cpu_timer_sample_group(timer->it_clock, p, &now);
@@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 	}
 
 	if (now < timer->it.cpu.expires) {
-		itp->it_value = ns_to_timespec(timer->it.cpu.expires - now);
+		itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now);
 	} else {
 		/*
 		 * The timer should have expired already, but the firing
@@ -1221,6 +1221,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 			    struct timespec *rqtp, struct itimerspec *it)
 {
+	struct itimerspec64 it64;
 	struct k_itimer timer;
 	int error;
 
@@ -1234,13 +1235,14 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 	error = posix_cpu_timer_create(&timer);
 	timer.it_process = current;
 	if (!error) {
-		static struct itimerspec zero_it;
+		static struct itimerspec64 zero_it;
 
 		memset(it, 0, sizeof *it);
 		it->it_value = *rqtp;
 
 		spin_lock_irq(&timer.it_lock);
-		error = posix_cpu_timer_set(&timer, flags, it, NULL);
+		it64 = itimerspec_to_itimerspec64(it);
+		error = posix_cpu_timer_set(&timer, flags, &it64, NULL);
 		if (error) {
 			spin_unlock_irq(&timer.it_lock);
 			return error;
@@ -1270,7 +1272,9 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		 * We were interrupted by a signal.
 		 */
 		*rqtp = ns_to_timespec(timer.it.cpu.expires);
-		error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+		it64 = itimerspec_to_itimerspec64(it);
+		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it64);
+		*it = itimerspec64_to_itimerspec(&it64);
 		if (!error) {
 			/*
 			 * Timer is now unarmed, deletion can not fail.
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 7742da826f02..9da4797d0c63 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -133,9 +133,9 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
 static int common_nsleep(const clockid_t, int flags, struct timespec *t,
 			 struct timespec __user *rmtp);
 static int common_timer_create(struct k_itimer *new_timer);
-static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
 static int common_timer_set(struct k_itimer *, int,
-			    struct itimerspec *, struct itimerspec *);
+			    struct itimerspec64 *, struct itimerspec64 *);
 static int common_timer_del(struct k_itimer *timer);
 
 static enum hrtimer_restart posix_timer_fn(struct hrtimer *data);
@@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
  * report.
  */
 static void
-common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
+common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
 {
 	ktime_t now, remaining, iv;
 	struct hrtimer *timer = &timr->it.real.timer;
 
-	memset(cur_setting, 0, sizeof(struct itimerspec));
+	memset(cur_setting, 0, sizeof(*cur_setting));
 
 	iv = timr->it.real.interval;
 
 	/* interval timer ? */
 	if (iv)
-		cur_setting->it_interval = ktime_to_timespec(iv);
+		cur_setting->it_interval = ktime_to_timespec64(iv);
 	else if (!hrtimer_active(timer) &&
 		 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
 		return;
@@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 		if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE)
 			cur_setting->it_value.tv_nsec = 1;
 	} else
-		cur_setting->it_value = ktime_to_timespec(remaining);
+		cur_setting->it_value = ktime_to_timespec64(remaining);
 }
 
 /* Get the time remaining on a POSIX.1b interval timer. */
 SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 		struct itimerspec __user *, setting)
 {
+	struct itimerspec64 cur_setting64;
 	struct itimerspec cur_setting;
 	struct k_itimer *timr;
 	struct k_clock *kc;
@@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
 	if (WARN_ON_ONCE(!kc || !kc->timer_get))
 		ret = -EINVAL;
 	else
-		kc->timer_get(timr, &cur_setting);
+		kc->timer_get(timr, &cur_setting64);
 
 	unlock_timer(timr, flags);
 
+	cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
 	if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
 		return -EFAULT;
 
@@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
 /* timr->it_lock is taken. */
 static int
 common_timer_set(struct k_itimer *timr, int flags,
-		 struct itimerspec *new_setting, struct itimerspec *old_setting)
+		 struct itimerspec64 *new_setting, struct itimerspec64 *old_setting)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
 	enum hrtimer_mode mode;
@@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags,
 	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
 	timr->it.real.timer.function = posix_timer_fn;
 
-	hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
+	hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value));
 
 	/* Convert interval */
-	timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
+	timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval);
 
 	/* SIGEV_NONE timers are not queued ! See common_timer_get */
 	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
@@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
 		const struct itimerspec __user *, new_setting,
 		struct itimerspec __user *, old_setting)
 {
-	struct k_itimer *timr;
+	struct itimerspec64 new_spec64, old_spec64;
+	struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
 	struct itimerspec new_spec, old_spec;
-	int error = 0;
+	struct k_itimer *timr;
 	unsigned long flag;
-	struct itimerspec *rtn = old_setting ? &old_spec : NULL;
 	struct k_clock *kc;
+	int error = 0;
 
 	if (!new_setting)
 		return -EINVAL;
 
 	if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
 		return -EFAULT;
+	new_spec64 = itimerspec_to_itimerspec64(&new_spec);
 
-	if (!timespec_valid(&new_spec.it_interval) ||
-	    !timespec_valid(&new_spec.it_value))
+	if (!timespec64_valid(&new_spec64.it_interval) ||
+	    !timespec64_valid(&new_spec64.it_value))
 		return -EINVAL;
 retry:
 	timr = lock_timer(timer_id, &flag);
@@ -908,7 +912,7 @@ retry:
 	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
 	else
-		error = kc->timer_set(timr, flags, &new_spec, rtn);
+		error = kc->timer_set(timr, flags, &new_spec64, rtn);
 
 	unlock_timer(timr, flag);
 	if (error == TIMER_RETRY) {
@@ -916,6 +920,7 @@ retry:
 		goto retry;
 	}
 
+	old_spec = itimerspec64_to_itimerspec(&old_spec64);
 	if (old_setting && !error &&
 	    copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
 		error = -EFAULT;
-- 
cgit v1.3-6-gb490


From ad19638463a4b5c909fcebf0f19358c4bf4fed48 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Sun, 26 Mar 2017 12:04:18 -0700
Subject: time: Change k_clock nsleep() to use timespec64

struct timespec is not y2038 safe on 32 bit machines.  Replace uses of
struct timespec with struct timespec64 in the kernel.

The syscall interfaces themselves will be changed in a separate series.

Note that the restart_block parameter for nanosleep has also been left
unchanged and will be part of syscall series noted above.

Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Cc: y2038@lists.linaro.org
Cc: john.stultz@linaro.org
Cc: arnd@arndb.de
Link: http://lkml.kernel.org/r/1490555058-4603-8-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h        |  2 +-
 include/linux/posix-timers.h   |  2 +-
 kernel/compat.c                |  6 ++++--
 kernel/time/alarmtimer.c       |  7 ++++---
 kernel/time/hrtimer.c          | 10 ++++++----
 kernel/time/posix-cpu-timers.c | 36 ++++++++++++++++++------------------
 kernel/time/posix-stubs.c      |  6 ++++--
 kernel/time/posix-timers.c     | 10 ++++++----
 8 files changed, 44 insertions(+), 35 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 23d58fcd4d9a..8c5b10eb7265 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -452,7 +452,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 }
 
 /* Precise sleep: */
-extern long hrtimer_nanosleep(struct timespec *rqtp,
+extern long hrtimer_nanosleep(struct timespec64 *rqtp,
 			      struct timespec __user *rmtp,
 			      const enum hrtimer_mode mode,
 			      const clockid_t clockid);
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index ebc4c4945339..8c1e43ab14a9 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -94,7 +94,7 @@ struct k_clock {
 	int (*clock_adj) (const clockid_t which_clock, struct timex *tx);
 	int (*timer_create) (struct k_itimer *timer);
 	int (*nsleep) (const clockid_t which_clock, int flags,
-		       struct timespec *, struct timespec __user *);
+		       struct timespec64 *, struct timespec __user *);
 	long (*nsleep_restart) (struct restart_block *restart_block);
 	int (*timer_set) (struct k_itimer *timr, int flags,
 			  struct itimerspec64 *new_setting,
diff --git a/kernel/compat.c b/kernel/compat.c
index 1eb9e8aad869..933bcb31ae10 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
 		       struct compat_timespec __user *, rmtp)
 {
 	struct timespec tu, rmt;
+	struct timespec64 tu64;
 	mm_segment_t oldfs;
 	long ret;
 
 	if (compat_get_timespec(&tu, rqtp))
 		return -EFAULT;
 
-	if (!timespec_valid(&tu))
+	tu64 = timespec_to_timespec64(tu);
+	if (!timespec64_valid(&tu64))
 		return -EINVAL;
 
 	oldfs = get_fs();
 	set_fs(KERNEL_DS);
-	ret = hrtimer_nanosleep(&tu,
+	ret = hrtimer_nanosleep(&tu64,
 				rmtp ? (struct timespec __user *)&rmt : NULL,
 				HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 	set_fs(oldfs);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index e069f94999a8..5cb5b0008d97 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -790,13 +790,14 @@ out:
  * Handles clock_nanosleep calls against _ALARM clockids
  */
 static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
-		     struct timespec *tsreq, struct timespec __user *rmtp)
+			      struct timespec64 *tsreq,
+			      struct timespec __user *rmtp)
 {
 	enum  alarmtimer_type type = clock2alarm(which_clock);
+	struct restart_block *restart;
 	struct alarm alarm;
 	ktime_t exp;
 	int ret = 0;
-	struct restart_block *restart;
 
 	if (!alarmtimer_get_rtcdev())
 		return -ENOTSUPP;
@@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
 
 	alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
 
-	exp = timespec_to_ktime(*tsreq);
+	exp = timespec64_to_ktime(*tsreq);
 	/* Convert (if necessary) to absolute time */
 	if (flags != TIMER_ABSTIME) {
 		ktime_t now = alarm_bases[type].gettime();
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1ef82cdb61ff..a7560123617c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1503,7 +1503,7 @@ out:
 	return ret;
 }
 
-long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
+long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
 	struct restart_block *restart;
@@ -1516,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		slack = 0;
 
 	hrtimer_init_on_stack(&t.timer, clockid, mode);
-	hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
+	hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack);
 	if (do_nanosleep(&t, mode))
 		goto out;
 
@@ -1547,15 +1547,17 @@ out:
 SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 		struct timespec __user *, rmtp)
 {
+	struct timespec64 tu64;
 	struct timespec tu;
 
 	if (copy_from_user(&tu, rqtp, sizeof(tu)))
 		return -EFAULT;
 
-	if (!timespec_valid(&tu))
+	tu64 = timespec_to_timespec64(tu);
+	if (!timespec64_valid(&tu64))
 		return -EINVAL;
 
-	return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+	return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
 }
 
 /*
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 441064d1216f..949e434d3536 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1219,9 +1219,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
-			    struct timespec *rqtp, struct itimerspec *it)
+			    struct timespec64 *rqtp, struct itimerspec64 *it)
 {
-	struct itimerspec64 it64;
 	struct k_itimer timer;
 	int error;
 
@@ -1241,8 +1240,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		it->it_value = *rqtp;
 
 		spin_lock_irq(&timer.it_lock);
-		it64 = itimerspec_to_itimerspec64(it);
-		error = posix_cpu_timer_set(&timer, flags, &it64, NULL);
+		error = posix_cpu_timer_set(&timer, flags, it, NULL);
 		if (error) {
 			spin_unlock_irq(&timer.it_lock);
 			return error;
@@ -1271,10 +1269,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 		/*
 		 * We were interrupted by a signal.
 		 */
-		*rqtp = ns_to_timespec(timer.it.cpu.expires);
-		it64 = itimerspec_to_itimerspec64(it);
-		error = posix_cpu_timer_set(&timer, 0, &zero_it, &it64);
-		*it = itimerspec64_to_itimerspec(&it64);
+		*rqtp = ns_to_timespec64(timer.it.cpu.expires);
+		error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
 		if (!error) {
 			/*
 			 * Timer is now unarmed, deletion can not fail.
@@ -1310,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
 static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
 
 static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
-			    struct timespec *rqtp, struct timespec __user *rmtp)
+			    struct timespec64 *rqtp, struct timespec __user *rmtp)
 {
 	struct restart_block *restart_block = &current->restart_block;
-	struct itimerspec it;
+	struct itimerspec64 it;
+	struct timespec ts;
 	int error;
 
 	/*
@@ -1333,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 		/*
 		 * Report back to the user the time still remaining.
 		 */
-		if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+		ts = timespec64_to_timespec(it.it_value);
+		if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp)))
 			return -EFAULT;
 
 		restart_block->fn = posix_cpu_nsleep_restart;
 		restart_block->nanosleep.clockid = which_clock;
 		restart_block->nanosleep.rmtp = rmtp;
-		restart_block->nanosleep.expires = timespec_to_ns(rqtp);
+		restart_block->nanosleep.expires = timespec64_to_ns(rqtp);
 	}
 	return error;
 }
@@ -1347,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
 static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->nanosleep.clockid;
-	struct timespec t;
-	struct itimerspec it;
+	struct itimerspec64 it;
+	struct timespec64 t;
+	struct timespec tmp;
 	int error;
 
-	t = ns_to_timespec(restart_block->nanosleep.expires);
+	t = ns_to_timespec64(restart_block->nanosleep.expires);
 
 	error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
 
@@ -1360,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 		/*
 		 * Report back to the user the time still remaining.
 		 */
-		if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+		 tmp = timespec64_to_timespec(it.it_value);
+		if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp)))
 			return -EFAULT;
 
-		restart_block->nanosleep.expires = timespec_to_ns(&t);
+		restart_block->nanosleep.expires = timespec64_to_ns(&t);
 	}
 	return error;
 
@@ -1388,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer)
 	return posix_cpu_timer_create(timer);
 }
 static int process_cpu_nsleep(const clockid_t which_clock, int flags,
-			      struct timespec *rqtp,
+			      struct timespec64 *rqtp,
 			      struct timespec __user *rmtp)
 {
 	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 0fbd0c5b95ca..c0cd53eb018a 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -103,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		const struct timespec __user *, rqtp,
 		struct timespec __user *, rmtp)
 {
+	struct timespec64 t64;
 	struct timespec t;
 
 	switch (which_clock) {
@@ -111,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 	case CLOCK_BOOTTIME:
 		if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
 			return -EFAULT;
-		if (!timespec_valid(&t))
+		t64 = timespec_to_timespec64(t);
+		if (!timespec64_valid(&t64))
 			return -EINVAL;
-		return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME ?
+		return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ?
 					 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
 					 which_clock);
 	default:
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 9da4797d0c63..4d7b2ce09c27 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -130,7 +130,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
 /*
  * These ones are defined below.
  */
-static int common_nsleep(const clockid_t, int flags, struct timespec *t,
+static int common_nsleep(const clockid_t, int flags, struct timespec64 *t,
 			 struct timespec __user *rmtp);
 static int common_timer_create(struct k_itimer *new_timer);
 static void common_timer_get(struct k_itimer *, struct itimerspec64 *);
@@ -1099,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
  * nanosleep for monotonic and realtime clocks
  */
 static int common_nsleep(const clockid_t which_clock, int flags,
-			 struct timespec *tsave, struct timespec __user *rmtp)
+			 struct timespec64 *tsave, struct timespec __user *rmtp)
 {
 	return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
 				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
@@ -1111,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		struct timespec __user *, rmtp)
 {
 	struct k_clock *kc = clockid_to_kclock(which_clock);
+	struct timespec64 t64;
 	struct timespec t;
 
 	if (!kc)
@@ -1121,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
 		return -EFAULT;
 
-	if (!timespec_valid(&t))
+	t64 = timespec_to_timespec64(t);
+	if (!timespec64_valid(&t64))
 		return -EINVAL;
 
-	return kc->nsleep(which_clock, flags, &t, rmtp);
+	return kc->nsleep(which_clock, flags, &t64, rmtp);
 }
 
 /*
-- 
cgit v1.3-6-gb490


From 84253394927c4352652d0b118ad9583f5646959b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Apr 2017 13:28:46 +0200
Subject: remove the mg_disk driver

This drivers was added in 2008, but as far as a I can tell we never had a
single platform that actually registered resources for the platform driver.

It's also been unmaintained for a long time and apparently has a ATA mode
that can be driven using the IDE/libata subsystem.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/blockdev/mflash.txt |   84 ---
 drivers/block/Kconfig             |   17 -
 drivers/block/Makefile            |    1 -
 drivers/block/mg_disk.c           | 1110 -------------------------------------
 include/linux/mg_disk.h           |   45 --
 5 files changed, 1257 deletions(-)
 delete mode 100644 Documentation/blockdev/mflash.txt
 delete mode 100644 drivers/block/mg_disk.c
 delete mode 100644 include/linux/mg_disk.h

(limited to 'include/linux')

diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt
deleted file mode 100644
index f7e050551487..000000000000
--- a/Documentation/blockdev/mflash.txt
+++ /dev/null
@@ -1,84 +0,0 @@
-This document describes m[g]flash support in linux.
-
-Contents
-  1. Overview
-  2. Reserved area configuration
-  3. Example of mflash platform driver registration
-
-1. Overview
-
-Mflash and gflash are embedded flash drive. The only difference is mflash is
-MCP(Multi Chip Package) device. These two device operate exactly same way.
-So the rest mflash repersents mflash and gflash altogether.
-
-Internally, mflash has nand flash and other hardware logics and supports
-2 different operation (ATA, IO) modes. ATA mode doesn't need any new
-driver and currently works well under standard IDE subsystem. Actually it's
-one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
-IDE interface.
-
-Following are brief descriptions about IO mode.
-A. IO mode based on ATA protocol and uses some custom command. (read confirm,
-write confirm)
-B. IO mode uses SRAM bus interface.
-C. IO mode supports 4kB boot area, so host can boot from mflash.
-
-2. Reserved area configuration
-If host boot from mflash, usually needs raw area for boot loader image. All of
-the mflash's block device operation will be taken this value as start offset.
-Note that boot loader's size of reserved area and kernel configuration value
-must be same.
-
-3. Example of mflash platform driver registration
-Working mflash is very straight forward. Adding platform device stuff to board
-configuration file is all. Here is some pseudo example.
-
-static struct mg_drv_data mflash_drv_data = {
-	/* If you want to polling driver set to 1 */
-	.use_polling = 0,
-	/* device attribution */
-	.dev_attr = MG_BOOT_DEV
-};
-
-static struct resource mg_mflash_rsc[] = {
-	/* Base address of mflash */
-	[0] = {
-		.start = 0x08000000,
-		.end = 0x08000000 + SZ_64K - 1,
-		.flags = IORESOURCE_MEM
-	},
-	/* mflash interrupt pin */
-	[1] = {
-		.start = IRQ_GPIO(84),
-		.end = IRQ_GPIO(84),
-		.flags = IORESOURCE_IRQ
-	},
-	/* mflash reset pin */
-	[2] = {
-		.start = 43,
-		.end = 43,
-		.name = MG_RST_PIN,
-		.flags = IORESOURCE_IO
-	},
-	/* mflash reset-out pin
-	 * If you use mflash as storage device (i.e. other than MG_BOOT_DEV),
-	 * should assign this */
-	[3] = {
-		.start = 51,
-		.end = 51,
-		.name = MG_RSTOUT_PIN,
-		.flags = IORESOURCE_IO
-	}
-};
-
-static struct platform_device mflash_dev = {
-	.name = MG_DEV_NAME,
-	.id = -1,
-	.dev = {
-		.platform_data = &mflash_drv_data,
-	},
-	.num_resources = ARRAY_SIZE(mg_mflash_rsc),
-	.resource = mg_mflash_rsc
-};
-
-platform_device_register(&mflash_dev);
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index a1c2e816128f..ebe8c1a6195e 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -434,23 +434,6 @@ config ATA_OVER_ETH
 	This driver provides Support for ATA over Ethernet block
 	devices like the Coraid EtherDrive (R) Storage Blade.
 
-config MG_DISK
-	tristate "mGine mflash, gflash support"
-	depends on ARM && GPIOLIB
-	help
-	  mGine mFlash(gFlash) block device driver
-
-config MG_DISK_RES
-	int "Size of reserved area before MBR"
-	depends on MG_DISK
-	default 0
-	help
-	  Define size of reserved area that usually used for boot. Unit is KB.
-	  All of the block device operation will be taken this value as start
-	  offset
-	  Examples:
-			1024 => 1 MB
-
 config SUNVDC
 	tristate "Sun Virtual Disk Client support"
 	depends on SUN_LDOMS
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index b12c772bbeb3..5ceead8b52d7 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -19,7 +19,6 @@ obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
 obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
 obj-$(CONFIG_XILINX_SYSACE)	+= xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
-obj-$(CONFIG_MG_DISK)		+= mg_disk.o
 obj-$(CONFIG_SUNVDC)		+= sunvdc.o
 obj-$(CONFIG_BLK_DEV_SKD)	+= skd.o
 obj-$(CONFIG_BLK_DEV_OSD)	+= osdblk.o
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
deleted file mode 100644
index e88e7b06c616..000000000000
--- a/drivers/block/mg_disk.c
+++ /dev/null
@@ -1,1110 +0,0 @@
-/*
- *  drivers/block/mg_disk.c
- *
- *  Support for the mGine m[g]flash IO mode.
- *  Based on legacy hd.c
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/hdreg.h>
-#include <linux/ata.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/platform_device.h>
-#include <linux/gpio.h>
-#include <linux/mg_disk.h>
-#include <linux/slab.h>
-
-#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
-
-/* name for block device */
-#define MG_DISK_NAME "mgd"
-
-#define MG_DISK_MAJ 0
-#define MG_DISK_MAX_PART 16
-#define MG_SECTOR_SIZE 512
-#define MG_MAX_SECTS 256
-
-/* Register offsets */
-#define MG_BUFF_OFFSET			0x8000
-#define MG_REG_OFFSET			0xC000
-#define MG_REG_FEATURE			(MG_REG_OFFSET + 2)	/* write case */
-#define MG_REG_ERROR			(MG_REG_OFFSET + 2)	/* read case */
-#define MG_REG_SECT_CNT			(MG_REG_OFFSET + 4)
-#define MG_REG_SECT_NUM			(MG_REG_OFFSET + 6)
-#define MG_REG_CYL_LOW			(MG_REG_OFFSET + 8)
-#define MG_REG_CYL_HIGH			(MG_REG_OFFSET + 0xA)
-#define MG_REG_DRV_HEAD			(MG_REG_OFFSET + 0xC)
-#define MG_REG_COMMAND			(MG_REG_OFFSET + 0xE)	/* write case */
-#define MG_REG_STATUS			(MG_REG_OFFSET + 0xE)	/* read  case */
-#define MG_REG_DRV_CTRL			(MG_REG_OFFSET + 0x10)
-#define MG_REG_BURST_CTRL		(MG_REG_OFFSET + 0x12)
-
-/* handy status */
-#define MG_STAT_READY	(ATA_DRDY | ATA_DSC)
-#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \
-				 ATA_ERR))) == MG_STAT_READY)
-
-/* error code for others */
-#define MG_ERR_NONE		0
-#define MG_ERR_TIMEOUT		0x100
-#define MG_ERR_INIT_STAT	0x101
-#define MG_ERR_TRANSLATION	0x102
-#define MG_ERR_CTRL_RST		0x103
-#define MG_ERR_INV_STAT		0x104
-#define MG_ERR_RSTOUT		0x105
-
-#define MG_MAX_ERRORS	6	/* Max read/write errors */
-
-/* command */
-#define MG_CMD_RD 0x20
-#define MG_CMD_WR 0x30
-#define MG_CMD_SLEEP 0x99
-#define MG_CMD_WAKEUP 0xC3
-#define MG_CMD_ID 0xEC
-#define MG_CMD_WR_CONF 0x3C
-#define MG_CMD_RD_CONF 0x40
-
-/* operation mode */
-#define MG_OP_CASCADE (1 << 0)
-#define MG_OP_CASCADE_SYNC_RD (1 << 1)
-#define MG_OP_CASCADE_SYNC_WR (1 << 2)
-#define MG_OP_INTERLEAVE (1 << 3)
-
-/* synchronous */
-#define MG_BURST_LAT_4 (3 << 4)
-#define MG_BURST_LAT_5 (4 << 4)
-#define MG_BURST_LAT_6 (5 << 4)
-#define MG_BURST_LAT_7 (6 << 4)
-#define MG_BURST_LAT_8 (7 << 4)
-#define MG_BURST_LEN_4 (1 << 1)
-#define MG_BURST_LEN_8 (2 << 1)
-#define MG_BURST_LEN_16 (3 << 1)
-#define MG_BURST_LEN_32 (4 << 1)
-#define MG_BURST_LEN_CONT (0 << 1)
-
-/* timeout value (unit: ms) */
-#define MG_TMAX_CONF_TO_CMD	1
-#define MG_TMAX_WAIT_RD_DRQ	10
-#define MG_TMAX_WAIT_WR_DRQ	500
-#define MG_TMAX_RST_TO_BUSY	10
-#define MG_TMAX_HDRST_TO_RDY	500
-#define MG_TMAX_SWRST_TO_RDY	500
-#define MG_TMAX_RSTOUT		3000
-
-#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
-
-/* main structure for mflash driver */
-struct mg_host {
-	struct device *dev;
-
-	struct request_queue *breq;
-	struct request *req;
-	spinlock_t lock;
-	struct gendisk *gd;
-
-	struct timer_list timer;
-	void (*mg_do_intr) (struct mg_host *);
-
-	u16 id[ATA_ID_WORDS];
-
-	u16 cyls;
-	u16 heads;
-	u16 sectors;
-	u32 n_sectors;
-	u32 nres_sectors;
-
-	void __iomem *dev_base;
-	unsigned int irq;
-	unsigned int rst;
-	unsigned int rstout;
-
-	u32 major;
-	u32 error;
-};
-
-/*
- * Debugging macro and defines
- */
-#undef DO_MG_DEBUG
-#ifdef DO_MG_DEBUG
-#  define MG_DBG(fmt, args...) \
-	printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
-#else /* CONFIG_MG_DEBUG */
-#  define MG_DBG(fmt, args...) do { } while (0)
-#endif /* CONFIG_MG_DEBUG */
-
-static void mg_request(struct request_queue *);
-
-static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes)
-{
-	if (__blk_end_request(host->req, err, nr_bytes))
-		return true;
-
-	host->req = NULL;
-	return false;
-}
-
-static bool mg_end_request_cur(struct mg_host *host, int err)
-{
-	return mg_end_request(host, err, blk_rq_cur_bytes(host->req));
-}
-
-static void mg_dump_status(const char *msg, unsigned int stat,
-		struct mg_host *host)
-{
-	char *name = MG_DISK_NAME;
-
-	if (host->req)
-		name = host->req->rq_disk->disk_name;
-
-	printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
-	if (stat & ATA_BUSY)
-		printk("Busy ");
-	if (stat & ATA_DRDY)
-		printk("DriveReady ");
-	if (stat & ATA_DF)
-		printk("WriteFault ");
-	if (stat & ATA_DSC)
-		printk("SeekComplete ");
-	if (stat & ATA_DRQ)
-		printk("DataRequest ");
-	if (stat & ATA_CORR)
-		printk("CorrectedError ");
-	if (stat & ATA_ERR)
-		printk("Error ");
-	printk("}\n");
-	if ((stat & ATA_ERR) == 0) {
-		host->error = 0;
-	} else {
-		host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR);
-		printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg,
-				host->error & 0xff);
-		if (host->error & ATA_BBK)
-			printk("BadSector ");
-		if (host->error & ATA_UNC)
-			printk("UncorrectableError ");
-		if (host->error & ATA_IDNF)
-			printk("SectorIdNotFound ");
-		if (host->error & ATA_ABORTED)
-			printk("DriveStatusError ");
-		if (host->error & ATA_AMNF)
-			printk("AddrMarkNotFound ");
-		printk("}");
-		if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
-			if (host->req)
-				printk(", sector=%u",
-				       (unsigned int)blk_rq_pos(host->req));
-		}
-		printk("\n");
-	}
-}
-
-static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
-{
-	u8 status;
-	unsigned long expire, cur_jiffies;
-	struct mg_drv_data *prv_data = host->dev->platform_data;
-
-	host->error = MG_ERR_NONE;
-	expire = jiffies + msecs_to_jiffies(msec);
-
-	/* These 2 times dummy status read prevents reading invalid
-	 * status. A very little time (3 times of mflash operating clk)
-	 * is required for busy bit is set. Use dummy read instead of
-	 * busy wait, because mflash's PLL is machine dependent.
-	 */
-	if (prv_data->use_polling) {
-		status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-		status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-	}
-
-	status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-
-	do {
-		cur_jiffies = jiffies;
-		if (status & ATA_BUSY) {
-			if (expect == ATA_BUSY)
-				break;
-		} else {
-			/* Check the error condition! */
-			if (status & ATA_ERR) {
-				mg_dump_status("mg_wait", status, host);
-				break;
-			}
-
-			if (expect == MG_STAT_READY)
-				if (MG_READY_OK(status))
-					break;
-
-			if (expect == ATA_DRQ)
-				if (status & ATA_DRQ)
-					break;
-		}
-		if (!msec) {
-			mg_dump_status("not ready", status, host);
-			return MG_ERR_INV_STAT;
-		}
-
-		status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-	} while (time_before(cur_jiffies, expire));
-
-	if (time_after_eq(cur_jiffies, expire) && msec)
-		host->error = MG_ERR_TIMEOUT;
-
-	return host->error;
-}
-
-static unsigned int mg_wait_rstout(u32 rstout, u32 msec)
-{
-	unsigned long expire;
-
-	expire = jiffies + msecs_to_jiffies(msec);
-	while (time_before(jiffies, expire)) {
-		if (gpio_get_value(rstout) == 1)
-			return MG_ERR_NONE;
-		msleep(10);
-	}
-
-	return MG_ERR_RSTOUT;
-}
-
-static void mg_unexpected_intr(struct mg_host *host)
-{
-	u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-
-	mg_dump_status("mg_unexpected_intr", status, host);
-}
-
-static irqreturn_t mg_irq(int irq, void *dev_id)
-{
-	struct mg_host *host = dev_id;
-	void (*handler)(struct mg_host *) = host->mg_do_intr;
-
-	spin_lock(&host->lock);
-
-	host->mg_do_intr = NULL;
-	del_timer(&host->timer);
-	if (!handler)
-		handler = mg_unexpected_intr;
-	handler(host);
-
-	spin_unlock(&host->lock);
-
-	return IRQ_HANDLED;
-}
-
-/* local copy of ata_id_string() */
-static void mg_id_string(const u16 *id, unsigned char *s,
-			 unsigned int ofs, unsigned int len)
-{
-	unsigned int c;
-
-	BUG_ON(len & 1);
-
-	while (len > 0) {
-		c = id[ofs] >> 8;
-		*s = c;
-		s++;
-
-		c = id[ofs] & 0xff;
-		*s = c;
-		s++;
-
-		ofs++;
-		len -= 2;
-	}
-}
-
-/* local copy of ata_id_c_string() */
-static void mg_id_c_string(const u16 *id, unsigned char *s,
-			   unsigned int ofs, unsigned int len)
-{
-	unsigned char *p;
-
-	mg_id_string(id, s, ofs, len - 1);
-
-	p = s + strnlen(s, len - 1);
-	while (p > s && p[-1] == ' ')
-		p--;
-	*p = '\0';
-}
-
-static int mg_get_disk_id(struct mg_host *host)
-{
-	u32 i;
-	s32 err;
-	const u16 *id = host->id;
-	struct mg_drv_data *prv_data = host->dev->platform_data;
-	char fwrev[ATA_ID_FW_REV_LEN + 1];
-	char model[ATA_ID_PROD_LEN + 1];
-	char serial[ATA_ID_SERNO_LEN + 1];
-
-	if (!prv_data->use_polling)
-		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
-	outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND);
-	err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ);
-	if (err)
-		return err;
-
-	for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++)
-		host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base +
-					MG_BUFF_OFFSET + i * 2));
-
-	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-	err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD);
-	if (err)
-		return err;
-
-	if ((id[ATA_ID_FIELD_VALID] & 1) == 0)
-		return MG_ERR_TRANSLATION;
-
-	host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
-	host->cyls = id[ATA_ID_CYLS];
-	host->heads = id[ATA_ID_HEADS];
-	host->sectors = id[ATA_ID_SECTORS];
-
-	if (MG_RES_SEC && host->heads && host->sectors) {
-		/* modify cyls, n_sectors */
-		host->cyls = (host->n_sectors - MG_RES_SEC) /
-			host->heads / host->sectors;
-		host->nres_sectors = host->n_sectors - host->cyls *
-			host->heads * host->sectors;
-		host->n_sectors -= host->nres_sectors;
-	}
-
-	mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
-	mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
-	mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
-	printk(KERN_INFO "mg_disk: model: %s\n", model);
-	printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev);
-	printk(KERN_INFO "mg_disk: serial: %s\n", serial);
-	printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n",
-			host->n_sectors, host->nres_sectors);
-
-	if (!prv_data->use_polling)
-		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
-	return err;
-}
-
-
-static int mg_disk_init(struct mg_host *host)
-{
-	struct mg_drv_data *prv_data = host->dev->platform_data;
-	s32 err;
-	u8 init_status;
-
-	/* hdd rst low */
-	gpio_set_value(host->rst, 0);
-	err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
-	if (err)
-		return err;
-
-	/* hdd rst high */
-	gpio_set_value(host->rst, 1);
-	err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY);
-	if (err)
-		return err;
-
-	/* soft reset on */
-	outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0),
-			(unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-	err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
-	if (err)
-		return err;
-
-	/* soft reset off */
-	outb(prv_data->use_polling ? ATA_NIEN : 0,
-			(unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-	err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY);
-	if (err)
-		return err;
-
-	init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf;
-
-	if (init_status == 0xf)
-		return MG_ERR_INIT_STAT;
-
-	return err;
-}
-
-static void mg_bad_rw_intr(struct mg_host *host)
-{
-	if (host->req)
-		if (++host->req->errors >= MG_MAX_ERRORS ||
-		    host->error == MG_ERR_TIMEOUT)
-			mg_end_request_cur(host, -EIO);
-}
-
-static unsigned int mg_out(struct mg_host *host,
-		unsigned int sect_num,
-		unsigned int sect_cnt,
-		unsigned int cmd,
-		void (*intr_addr)(struct mg_host *))
-{
-	struct mg_drv_data *prv_data = host->dev->platform_data;
-
-	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
-		return host->error;
-
-	if (!prv_data->use_polling) {
-		host->mg_do_intr = intr_addr;
-		mod_timer(&host->timer, jiffies + 3 * HZ);
-	}
-	if (MG_RES_SEC)
-		sect_num += MG_RES_SEC;
-	outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT);
-	outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM);
-	outb((u8)(sect_num >> 8), (unsigned long)host->dev_base +
-			MG_REG_CYL_LOW);
-	outb((u8)(sect_num >> 16), (unsigned long)host->dev_base +
-			MG_REG_CYL_HIGH);
-	outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS),
-			(unsigned long)host->dev_base + MG_REG_DRV_HEAD);
-	outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND);
-	return MG_ERR_NONE;
-}
-
-static void mg_read_one(struct mg_host *host, struct request *req)
-{
-	u16 *buff = (u16 *)bio_data(req->bio);
-	u32 i;
-
-	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
-		*buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
-			      (i << 1));
-}
-
-static void mg_read(struct request *req)
-{
-	struct mg_host *host = req->rq_disk->private_data;
-
-	if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
-		   MG_CMD_RD, NULL) != MG_ERR_NONE)
-		mg_bad_rw_intr(host);
-
-	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
-
-	do {
-		if (mg_wait(host, ATA_DRQ,
-			    MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
-			mg_bad_rw_intr(host);
-			return;
-		}
-
-		mg_read_one(host, req);
-
-		outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
-				MG_REG_COMMAND);
-	} while (mg_end_request(host, 0, MG_SECTOR_SIZE));
-}
-
-static void mg_write_one(struct mg_host *host, struct request *req)
-{
-	u16 *buff = (u16 *)bio_data(req->bio);
-	u32 i;
-
-	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
-		outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET +
-		     (i << 1));
-}
-
-static void mg_write(struct request *req)
-{
-	struct mg_host *host = req->rq_disk->private_data;
-	unsigned int rem = blk_rq_sectors(req);
-
-	if (mg_out(host, blk_rq_pos(req), rem,
-		   MG_CMD_WR, NULL) != MG_ERR_NONE) {
-		mg_bad_rw_intr(host);
-		return;
-	}
-
-	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       rem, blk_rq_pos(req), bio_data(req->bio));
-
-	if (mg_wait(host, ATA_DRQ,
-		    MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
-		mg_bad_rw_intr(host);
-		return;
-	}
-
-	do {
-		mg_write_one(host, req);
-
-		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
-				MG_REG_COMMAND);
-
-		rem--;
-		if (rem > 1 && mg_wait(host, ATA_DRQ,
-					MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
-			mg_bad_rw_intr(host);
-			return;
-		} else if (mg_wait(host, MG_STAT_READY,
-					MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
-			mg_bad_rw_intr(host);
-			return;
-		}
-	} while (mg_end_request(host, 0, MG_SECTOR_SIZE));
-}
-
-static void mg_read_intr(struct mg_host *host)
-{
-	struct request *req = host->req;
-	u32 i;
-
-	/* check status */
-	do {
-		i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-		if (i & ATA_BUSY)
-			break;
-		if (!MG_READY_OK(i))
-			break;
-		if (i & ATA_DRQ)
-			goto ok_to_read;
-	} while (0);
-	mg_dump_status("mg_read_intr", i, host);
-	mg_bad_rw_intr(host);
-	mg_request(host->breq);
-	return;
-
-ok_to_read:
-	mg_read_one(host, req);
-
-	MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-	       blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
-
-	/* send read confirm */
-	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-
-	if (mg_end_request(host, 0, MG_SECTOR_SIZE)) {
-		/* set handler if read remains */
-		host->mg_do_intr = mg_read_intr;
-		mod_timer(&host->timer, jiffies + 3 * HZ);
-	} else /* goto next request */
-		mg_request(host->breq);
-}
-
-static void mg_write_intr(struct mg_host *host)
-{
-	struct request *req = host->req;
-	u32 i;
-	bool rem;
-
-	/* check status */
-	do {
-		i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-		if (i & ATA_BUSY)
-			break;
-		if (!MG_READY_OK(i))
-			break;
-		if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ))
-			goto ok_to_write;
-	} while (0);
-	mg_dump_status("mg_write_intr", i, host);
-	mg_bad_rw_intr(host);
-	mg_request(host->breq);
-	return;
-
-ok_to_write:
-	if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
-		/* write 1 sector and set handler if remains */
-		mg_write_one(host, req);
-		MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-		       blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
-		host->mg_do_intr = mg_write_intr;
-		mod_timer(&host->timer, jiffies + 3 * HZ);
-	}
-
-	/* send write confirm */
-	outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-
-	if (!rem)
-		mg_request(host->breq);
-}
-
-static void mg_times_out(unsigned long data)
-{
-	struct mg_host *host = (struct mg_host *)data;
-	char *name;
-
-	spin_lock_irq(&host->lock);
-
-	if (!host->req)
-		goto out_unlock;
-
-	host->mg_do_intr = NULL;
-
-	name = host->req->rq_disk->disk_name;
-	printk(KERN_DEBUG "%s: timeout\n", name);
-
-	host->error = MG_ERR_TIMEOUT;
-	mg_bad_rw_intr(host);
-
-out_unlock:
-	mg_request(host->breq);
-	spin_unlock_irq(&host->lock);
-}
-
-static void mg_request_poll(struct request_queue *q)
-{
-	struct mg_host *host = q->queuedata;
-
-	while (1) {
-		if (!host->req) {
-			host->req = blk_fetch_request(q);
-			if (!host->req)
-				break;
-		}
-
-		switch (req_op(host->req)) {
-		case REQ_OP_READ:
-			mg_read(host->req);
-			break;
-		case REQ_OP_WRITE:
-			mg_write(host->req);
-			break;
-		default:
-			mg_end_request_cur(host, -EIO);
-			break;
-		}
-	}
-}
-
-static unsigned int mg_issue_req(struct request *req,
-		struct mg_host *host,
-		unsigned int sect_num,
-		unsigned int sect_cnt)
-{
-	switch (req_op(host->req)) {
-	case REQ_OP_READ:
-		if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
-				!= MG_ERR_NONE) {
-			mg_bad_rw_intr(host);
-			return host->error;
-		}
-		break;
-	case REQ_OP_WRITE:
-		/* TODO : handler */
-		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-		if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
-				!= MG_ERR_NONE) {
-			mg_bad_rw_intr(host);
-			return host->error;
-		}
-		del_timer(&host->timer);
-		mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ);
-		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-		if (host->error) {
-			mg_bad_rw_intr(host);
-			return host->error;
-		}
-		mg_write_one(host, req);
-		mod_timer(&host->timer, jiffies + 3 * HZ);
-		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
-				MG_REG_COMMAND);
-		break;
-	default:
-		mg_end_request_cur(host, -EIO);
-		break;
-	}
-	return MG_ERR_NONE;
-}
-
-/* This function also called from IRQ context */
-static void mg_request(struct request_queue *q)
-{
-	struct mg_host *host = q->queuedata;
-	struct request *req;
-	u32 sect_num, sect_cnt;
-
-	while (1) {
-		if (!host->req) {
-			host->req = blk_fetch_request(q);
-			if (!host->req)
-				break;
-		}
-		req = host->req;
-
-		/* check unwanted request call */
-		if (host->mg_do_intr)
-			return;
-
-		del_timer(&host->timer);
-
-		sect_num = blk_rq_pos(req);
-		/* deal whole segments */
-		sect_cnt = blk_rq_sectors(req);
-
-		/* sanity check */
-		if (sect_num >= get_capacity(req->rq_disk) ||
-				((sect_num + sect_cnt) >
-				 get_capacity(req->rq_disk))) {
-			printk(KERN_WARNING
-					"%s: bad access: sector=%d, count=%d\n",
-					req->rq_disk->disk_name,
-					sect_num, sect_cnt);
-			mg_end_request_cur(host, -EIO);
-			continue;
-		}
-
-		if (!mg_issue_req(req, host, sect_num, sect_cnt))
-			return;
-	}
-}
-
-static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct mg_host *host = bdev->bd_disk->private_data;
-
-	geo->cylinders = (unsigned short)host->cyls;
-	geo->heads = (unsigned char)host->heads;
-	geo->sectors = (unsigned char)host->sectors;
-	return 0;
-}
-
-static const struct block_device_operations mg_disk_ops = {
-	.getgeo = mg_getgeo
-};
-
-#ifdef CONFIG_PM_SLEEP
-static int mg_suspend(struct device *dev)
-{
-	struct mg_drv_data *prv_data = dev->platform_data;
-	struct mg_host *host = prv_data->host;
-
-	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
-		return -EIO;
-
-	if (!prv_data->use_polling)
-		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
-	outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND);
-	/* wait until mflash deep sleep */
-	msleep(1);
-
-	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) {
-		if (!prv_data->use_polling)
-			outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-		return -EIO;
-	}
-
-	return 0;
-}
-
-static int mg_resume(struct device *dev)
-{
-	struct mg_drv_data *prv_data = dev->platform_data;
-	struct mg_host *host = prv_data->host;
-
-	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
-		return -EIO;
-
-	outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND);
-	/* wait until mflash wakeup */
-	msleep(1);
-
-	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
-		return -EIO;
-
-	if (!prv_data->use_polling)
-		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
-	return 0;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
-
-static int mg_probe(struct platform_device *plat_dev)
-{
-	struct mg_host *host;
-	struct resource *rsc;
-	struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
-	int err = 0;
-
-	if (!prv_data) {
-		printk(KERN_ERR	"%s:%d fail (no driver_data)\n",
-				__func__, __LINE__);
-		err = -EINVAL;
-		goto probe_err;
-	}
-
-	/* alloc mg_host */
-	host = kzalloc(sizeof(struct mg_host), GFP_KERNEL);
-	if (!host) {
-		printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n",
-				__func__, __LINE__);
-		err = -ENOMEM;
-		goto probe_err;
-	}
-	host->major = MG_DISK_MAJ;
-
-	/* link each other */
-	prv_data->host = host;
-	host->dev = &plat_dev->dev;
-
-	/* io remap */
-	rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0);
-	if (!rsc) {
-		printk(KERN_ERR "%s:%d platform_get_resource fail\n",
-				__func__, __LINE__);
-		err = -EINVAL;
-		goto probe_err_2;
-	}
-	host->dev_base = ioremap(rsc->start, resource_size(rsc));
-	if (!host->dev_base) {
-		printk(KERN_ERR "%s:%d ioremap fail\n",
-				__func__, __LINE__);
-		err = -EIO;
-		goto probe_err_2;
-	}
-	MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base);
-
-	/* get reset pin */
-	rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
-			MG_RST_PIN);
-	if (!rsc) {
-		printk(KERN_ERR "%s:%d get reset pin fail\n",
-				__func__, __LINE__);
-		err = -EIO;
-		goto probe_err_3;
-	}
-	host->rst = rsc->start;
-
-	/* init rst pin */
-	err = gpio_request(host->rst, MG_RST_PIN);
-	if (err)
-		goto probe_err_3;
-	gpio_direction_output(host->rst, 1);
-
-	/* reset out pin */
-	if (!(prv_data->dev_attr & MG_DEV_MASK)) {
-		err = -EINVAL;
-		goto probe_err_3a;
-	}
-
-	if (prv_data->dev_attr != MG_BOOT_DEV) {
-		rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
-				MG_RSTOUT_PIN);
-		if (!rsc) {
-			printk(KERN_ERR "%s:%d get reset-out pin fail\n",
-					__func__, __LINE__);
-			err = -EIO;
-			goto probe_err_3a;
-		}
-		host->rstout = rsc->start;
-		err = gpio_request(host->rstout, MG_RSTOUT_PIN);
-		if (err)
-			goto probe_err_3a;
-		gpio_direction_input(host->rstout);
-	}
-
-	/* disk reset */
-	if (prv_data->dev_attr == MG_STORAGE_DEV) {
-		/* If POR seq. not yet finished, wait */
-		err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
-		if (err)
-			goto probe_err_3b;
-		err = mg_disk_init(host);
-		if (err) {
-			printk(KERN_ERR "%s:%d fail (err code : %d)\n",
-					__func__, __LINE__, err);
-			err = -EIO;
-			goto probe_err_3b;
-		}
-	}
-
-	/* get irq resource */
-	if (!prv_data->use_polling) {
-		host->irq = platform_get_irq(plat_dev, 0);
-		if (host->irq == -ENXIO) {
-			err = host->irq;
-			goto probe_err_3b;
-		}
-		err = request_irq(host->irq, mg_irq,
-				IRQF_TRIGGER_RISING,
-				MG_DEV_NAME, host);
-		if (err) {
-			printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",
-					__func__, __LINE__, err);
-			goto probe_err_3b;
-		}
-
-	}
-
-	/* get disk id */
-	err = mg_get_disk_id(host);
-	if (err) {
-		printk(KERN_ERR "%s:%d fail (err code : %d)\n",
-				__func__, __LINE__, err);
-		err = -EIO;
-		goto probe_err_4;
-	}
-
-	err = register_blkdev(host->major, MG_DISK_NAME);
-	if (err < 0) {
-		printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n",
-				__func__, __LINE__, err);
-		goto probe_err_4;
-	}
-	if (!host->major)
-		host->major = err;
-
-	spin_lock_init(&host->lock);
-
-	if (prv_data->use_polling)
-		host->breq = blk_init_queue(mg_request_poll, &host->lock);
-	else
-		host->breq = blk_init_queue(mg_request, &host->lock);
-
-	if (!host->breq) {
-		err = -ENOMEM;
-		printk(KERN_ERR "%s:%d (blk_init_queue) fail\n",
-				__func__, __LINE__);
-		goto probe_err_5;
-	}
-	host->breq->queuedata = host;
-
-	/* mflash is random device, thanx for the noop */
-	err = elevator_change(host->breq, "noop");
-	if (err) {
-		printk(KERN_ERR "%s:%d (elevator_init) fail\n",
-				__func__, __LINE__);
-		goto probe_err_6;
-	}
-	blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS);
-	blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
-
-	setup_timer(&host->timer, mg_times_out, (unsigned long)host);
-
-	host->gd = alloc_disk(MG_DISK_MAX_PART);
-	if (!host->gd) {
-		printk(KERN_ERR "%s:%d (alloc_disk) fail\n",
-				__func__, __LINE__);
-		err = -ENOMEM;
-		goto probe_err_7;
-	}
-	host->gd->major = host->major;
-	host->gd->first_minor = 0;
-	host->gd->fops = &mg_disk_ops;
-	host->gd->queue = host->breq;
-	host->gd->private_data = host;
-	sprintf(host->gd->disk_name, MG_DISK_NAME"a");
-
-	set_capacity(host->gd, host->n_sectors);
-
-	add_disk(host->gd);
-
-	return err;
-
-probe_err_7:
-	del_timer_sync(&host->timer);
-probe_err_6:
-	blk_cleanup_queue(host->breq);
-probe_err_5:
-	unregister_blkdev(host->major, MG_DISK_NAME);
-probe_err_4:
-	if (!prv_data->use_polling)
-		free_irq(host->irq, host);
-probe_err_3b:
-	gpio_free(host->rstout);
-probe_err_3a:
-	gpio_free(host->rst);
-probe_err_3:
-	iounmap(host->dev_base);
-probe_err_2:
-	kfree(host);
-probe_err:
-	return err;
-}
-
-static int mg_remove(struct platform_device *plat_dev)
-{
-	struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
-	struct mg_host *host = prv_data->host;
-	int err = 0;
-
-	/* delete timer */
-	del_timer_sync(&host->timer);
-
-	/* remove disk */
-	if (host->gd) {
-		del_gendisk(host->gd);
-		put_disk(host->gd);
-	}
-	/* remove queue */
-	if (host->breq)
-		blk_cleanup_queue(host->breq);
-
-	/* unregister blk device */
-	unregister_blkdev(host->major, MG_DISK_NAME);
-
-	/* free irq */
-	if (!prv_data->use_polling)
-		free_irq(host->irq, host);
-
-	/* free reset-out pin */
-	if (prv_data->dev_attr != MG_BOOT_DEV)
-		gpio_free(host->rstout);
-
-	/* free rst pin */
-	if (host->rst)
-		gpio_free(host->rst);
-
-	/* unmap io */
-	if (host->dev_base)
-		iounmap(host->dev_base);
-
-	/* free mg_host */
-	kfree(host);
-
-	return err;
-}
-
-static struct platform_driver mg_disk_driver = {
-	.probe = mg_probe,
-	.remove = mg_remove,
-	.driver = {
-		.name = MG_DEV_NAME,
-		.pm = &mg_pm,
-	}
-};
-
-/****************************************************************************
- *
- * Module stuff
- *
- ****************************************************************************/
-
-static int __init mg_init(void)
-{
-	printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n");
-	return platform_driver_register(&mg_disk_driver);
-}
-
-static void __exit mg_exit(void)
-{
-	printk(KERN_INFO "mflash driver : bye bye\n");
-	platform_driver_unregister(&mg_disk_driver);
-}
-
-module_init(mg_init);
-module_exit(mg_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("unsik Kim <donari75@gmail.com>");
-MODULE_DESCRIPTION("mGine m[g]flash device driver");
diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h
deleted file mode 100644
index e11f4d9f1c2e..000000000000
--- a/include/linux/mg_disk.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- *  include/linux/mg_disk.c
- *
- *  Private data for mflash platform driver
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- */
-
-#ifndef __MG_DISK_H__
-#define __MG_DISK_H__
-
-/* name for platform device */
-#define MG_DEV_NAME "mg_disk"
-
-/* names of GPIO resource */
-#define MG_RST_PIN	"mg_rst"
-/* except MG_BOOT_DEV, reset-out pin should be assigned */
-#define MG_RSTOUT_PIN	"mg_rstout"
-
-/* device attribution */
-/* use mflash as boot device */
-#define MG_BOOT_DEV		(1 << 0)
-/* use mflash as storage device */
-#define MG_STORAGE_DEV		(1 << 1)
-/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
-#define MG_STORAGE_DEV_SKIP_RST	(1 << 2)
-
-/* private driver data */
-struct mg_drv_data {
-	/* disk resource */
-	u32 use_polling;
-
-	/* device attribution */
-	u32 dev_attr;
-
-	/* internally used */
-	void *host;
-};
-
-#endif
-- 
cgit v1.3-6-gb490


From c05e66733788118377c21a913c1bc7b64bccc167 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 14 Apr 2017 00:59:58 -0700
Subject: sbitmap: add sbitmap_get_shallow() operation

This operation supports the use case of limiting the number of bits that
can be allocated for a given operation. Rather than setting aside some
bits at the end of the bitmap, we can set aside bits in each word of the
bitmap. This means we can keep the allocation hints spread out and
support sbitmap_resize() nicely at the cost of lower granularity for the
allowed depth.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/sbitmap.h | 55 ++++++++++++++++++++++++++++++++++++
 lib/sbitmap.c           | 75 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 123 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index d4e0a204c118..a1904aadbc45 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -175,6 +175,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
  */
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);
 
+/**
+ * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
+ * limiting the depth used from each word.
+ * @sb: Bitmap to allocate from.
+ * @alloc_hint: Hint for where to start searching for a free bit.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ *
+ * This rather specific operation allows for having multiple users with
+ * different allocation limits. E.g., there can be a high-priority class that
+ * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
+ * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
+ * class can only allocate half of the total bits in the bitmap, preventing it
+ * from starving out the high-priority class.
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+			unsigned long shallow_depth);
+
 /**
  * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
  * @sb: Bitmap to check.
@@ -325,6 +344,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
  */
 int __sbitmap_queue_get(struct sbitmap_queue *sbq);
 
+/**
+ * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word, with preemption
+ * already disabled.
+ * @sbq: Bitmap queue to allocate from.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+				unsigned int shallow_depth);
+
 /**
  * sbitmap_queue_get() - Try to allocate a free bit from a &struct
  * sbitmap_queue.
@@ -345,6 +377,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
 	return nr;
 }
 
+/**
+ * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word.
+ * @sbq: Bitmap queue to allocate from.
+ * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
+ *       sbitmap_queue_clear()).
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+					    unsigned int *cpu,
+					    unsigned int shallow_depth)
+{
+	int nr;
+
+	*cpu = get_cpu();
+	nr = __sbitmap_queue_get_shallow(sbq, shallow_depth);
+	put_cpu();
+	return nr;
+}
+
 /**
  * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
  * &struct sbitmap_queue.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 60e800e0b5a0..80aa8d5463fa 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -79,15 +79,15 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 }
 EXPORT_SYMBOL_GPL(sbitmap_resize);
 
-static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
-			      bool wrap)
+static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
+			      unsigned int hint, bool wrap)
 {
 	unsigned int orig_hint = hint;
 	int nr;
 
 	while (1) {
-		nr = find_next_zero_bit(&word->word, word->depth, hint);
-		if (unlikely(nr >= word->depth)) {
+		nr = find_next_zero_bit(word, depth, hint);
+		if (unlikely(nr >= depth)) {
 			/*
 			 * We started with an offset, and we didn't reset the
 			 * offset to 0 in a failure case, so start from 0 to
@@ -100,11 +100,11 @@ static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
 			return -1;
 		}
 
-		if (!test_and_set_bit(nr, &word->word))
+		if (!test_and_set_bit(nr, word))
 			break;
 
 		hint = nr + 1;
-		if (hint >= word->depth - 1)
+		if (hint >= depth - 1)
 			hint = 0;
 	}
 
@@ -119,7 +119,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 	index = SB_NR_TO_INDEX(sb, alloc_hint);
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = __sbitmap_get_word(&sb->map[index],
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth,
 					SB_NR_TO_BIT(sb, alloc_hint),
 					!round_robin);
 		if (nr != -1) {
@@ -141,6 +142,37 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 }
 EXPORT_SYMBOL_GPL(sbitmap_get);
 
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+			unsigned long shallow_depth)
+{
+	unsigned int i, index;
+	int nr = -1;
+
+	index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+	for (i = 0; i < sb->map_nr; i++) {
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					min(sb->map[index].depth, shallow_depth),
+					SB_NR_TO_BIT(sb, alloc_hint), true);
+		if (nr != -1) {
+			nr += index << sb->shift;
+			break;
+		}
+
+		/* Jump to next index. */
+		index++;
+		alloc_hint = index << sb->shift;
+
+		if (index >= sb->map_nr) {
+			index = 0;
+			alloc_hint = 0;
+		}
+	}
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
+
 bool sbitmap_any_bit_set(const struct sbitmap *sb)
 {
 	unsigned int i;
@@ -342,6 +374,35 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
 }
 EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
 
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+				unsigned int shallow_depth)
+{
+	unsigned int hint, depth;
+	int nr;
+
+	hint = this_cpu_read(*sbq->alloc_hint);
+	depth = READ_ONCE(sbq->sb.depth);
+	if (unlikely(hint >= depth)) {
+		hint = depth ? prandom_u32() % depth : 0;
+		this_cpu_write(*sbq->alloc_hint, hint);
+	}
+	nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth);
+
+	if (nr == -1) {
+		/* If the map is full, a hint won't do us much good. */
+		this_cpu_write(*sbq->alloc_hint, 0);
+	} else if (nr == hint || unlikely(sbq->round_robin)) {
+		/* Only update the hint if we used it. */
+		hint = nr + 1;
+		if (hint >= depth - 1)
+			hint = 0;
+		this_cpu_write(*sbq->alloc_hint, hint);
+	}
+
+	return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
+
 static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
-- 
cgit v1.3-6-gb490


From 5b72727299307e53888277729f980ab03264dac8 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 14 Apr 2017 01:00:00 -0700
Subject: blk-mq: export helpers

blk_mq_finish_request() is required for schedulers that define their own
put_request(). blk_mq_run_hw_queue() is required for schedulers that
hold back requests to be run later.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 2 ++
 include/linux/blk-mq.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e536dacfae4c..7138cd98146e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -368,6 +368,7 @@ void blk_mq_finish_request(struct request *rq)
 {
 	blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
+EXPORT_SYMBOL_GPL(blk_mq_finish_request);
 
 void blk_mq_free_request(struct request *rq)
 {
@@ -1183,6 +1184,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
 	__blk_mq_delay_run_hw_queue(hctx, async, 0);
 }
+EXPORT_SYMBOL(blk_mq_run_hw_queue);
 
 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b90c3d5766cd..d75de612845d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -238,6 +238,7 @@ void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
-- 
cgit v1.3-6-gb490


From c05f8525f67b7d6489b0502211d4ed35622d9beb Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Fri, 14 Apr 2017 01:00:01 -0700
Subject: blk-mq-sched: make completed_request() callback more useful

Currently, this callback is called right after put_request() and has no
distinguishable purpose. Instead, let's call it before put_request() as
soon as I/O has completed on the request, before we account it in
blk-stat. With this, Kyber can enable stats when it sees a latency
outlier and make sure the outlier gets accounted.

Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sched.h     | 11 +++--------
 block/blk-mq.c           |  5 ++++-
 include/linux/elevator.h |  2 +-
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index f4bc186c3440..120c6abc37cc 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -82,17 +82,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
 	return true;
 }
 
-static inline void
-blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static inline void blk_mq_sched_completed_request(struct request *rq)
 {
-	struct elevator_queue *e = hctx->queue->elevator;
+	struct elevator_queue *e = rq->q->elevator;
 
 	if (e && e->type->ops.mq.completed_request)
-		e->type->ops.mq.completed_request(hctx, rq);
-
-	BUG_ON(rq->internal_tag == -1);
-
-	blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+		e->type->ops.mq.completed_request(rq);
 }
 
 static inline void blk_mq_sched_started_request(struct request *rq)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7138cd98146e..e2ef7b460924 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -350,7 +350,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 	if (rq->tag != -1)
 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 	if (sched_tag != -1)
-		blk_mq_sched_completed_request(hctx, rq);
+		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
 	blk_mq_sched_restart(hctx);
 	blk_queue_exit(q);
 }
@@ -444,6 +444,9 @@ static void __blk_mq_complete_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
+	if (rq->internal_tag != -1)
+		blk_mq_sched_completed_request(rq);
+
 	blk_mq_stat_add(rq);
 
 	if (!q->softirq_done_fn)
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index b7ec315ee7e7..3a216318ae73 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -106,7 +106,7 @@ struct elevator_mq_ops {
 	void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
 	struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
 	bool (*has_work)(struct blk_mq_hw_ctx *);
-	void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+	void (*completed_request)(struct request *);
 	void (*started_request)(struct request *);
 	void (*requeue_request)(struct request *);
 	struct request *(*former_request)(struct request_queue *, struct request *);
-- 
cgit v1.3-6-gb490


From 0e8d6a9336b487a1dd6f1991ff376e669d4c87c6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 12 Apr 2017 22:07:28 +0200
Subject: workqueue: Provide work_on_cpu_safe()

work_on_cpu() is not protected against CPU hotplug. For code which requires
to be either executed on an online CPU or to fail if the CPU is not
available the callsite would have to protect against CPU hotplug.

Provide a function which does get/put_online_cpus() around the call to
work_on_cpu() and fails the call with -ENODEV if the target CPU is not
online.

Preparatory patch to convert several racy task affinity manipulations.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Len Brown <lenb@kernel.org>
Link: http://lkml.kernel.org/r/20170412201042.262610721@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/workqueue.h |  5 +++++
 kernel/workqueue.c        | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bde063cefd04..c102ef65cb64 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 {
 	return fn(arg);
 }
+static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+	return fn(arg);
+}
 #else
 long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c0168b7da1ea..5bf1be018628 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4735,6 +4735,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
 	return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
+
+/**
+ * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn:  the function to run
+ * @arg: the function argument
+ *
+ * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
+ * any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
+ */
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+	long ret = -ENODEV;
+
+	get_online_cpus();
+	if (cpu_online(cpu))
+		ret = work_on_cpu(cpu, fn, arg);
+	put_online_cpus();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu_safe);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
-- 
cgit v1.3-6-gb490


From 17912c49edfa6ab552329bf63d1b757eb874673b Mon Sep 17 00:00:00 2001
From: Javier González <jg@lightnvm.io>
Date: Sat, 15 Apr 2017 20:55:37 +0200
Subject: lightnvm: submit erases using the I/O path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Until now erases have been submitted as synchronous commands through a
dedicated erase function. In order to enable targets implementing
asynchronous erases, refactor the erase path so that it uses the normal
async I/O submission functions. If a target requires sync I/O, it can
implement it internally. Also, adapt rrpc to use the new erase path.

Signed-off-by: Javier González <javier@cnexlabs.com>
Fixed spelling error.
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>

Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c      | 54 +++++++++++++++++++++++++++-----------------
 drivers/lightnvm/rrpc.c      |  3 +--
 drivers/nvme/host/lightnvm.c | 32 ++++++++------------------
 include/linux/lightnvm.h     |  8 +++----
 4 files changed, 47 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 5262ba66a7a7..95105c47e082 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -590,11 +590,11 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
 
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
-	nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
+	nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
 	nvm_rq_tgt_to_dev(tgt_dev, &rqd);
 
 	ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
-	nvm_free_rqd_ppalist(dev, &rqd);
+	nvm_free_rqd_ppalist(tgt_dev, &rqd);
 	if (ret) {
 		pr_err("nvm: failed bb mark\n");
 		return -EINVAL;
@@ -626,34 +626,45 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 }
 EXPORT_SYMBOL(nvm_submit_io);
 
-int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags)
+static void nvm_end_io_sync(struct nvm_rq *rqd)
 {
-	struct nvm_dev *dev = tgt_dev->parent;
-	struct nvm_rq rqd;
-	int ret;
+	struct completion *waiting = rqd->private;
 
-	if (!dev->ops->erase_block)
-		return 0;
+	complete(waiting);
+}
 
-	nvm_map_to_dev(tgt_dev, ppas);
+int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
+								int nr_ppas)
+{
+	struct nvm_geo *geo = &tgt_dev->geo;
+	struct nvm_rq rqd;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(wait);
 
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
-	ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1);
+	rqd.opcode = NVM_OP_ERASE;
+	rqd.end_io = nvm_end_io_sync;
+	rqd.private = &wait;
+	rqd.flags = geo->plane_mode >> 1;
+
+	ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
 	if (ret)
 		return ret;
 
-	nvm_rq_tgt_to_dev(tgt_dev, &rqd);
-
-	rqd.flags = flags;
-
-	ret = dev->ops->erase_block(dev, &rqd);
+	ret = nvm_submit_io(tgt_dev, &rqd);
+	if (ret) {
+		pr_err("rrpr: erase I/O submission failed: %d\n", ret);
+		goto free_ppa_list;
+	}
+	wait_for_completion_io(&wait);
 
-	nvm_free_rqd_ppalist(dev, &rqd);
+free_ppa_list:
+	nvm_free_rqd_ppalist(tgt_dev, &rqd);
 
 	return ret;
 }
-EXPORT_SYMBOL(nvm_erase_blk);
+EXPORT_SYMBOL(nvm_erase_sync);
 
 int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
 		    nvm_l2p_update_fn *update_l2p, void *priv)
@@ -732,10 +743,11 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
 }
 EXPORT_SYMBOL(nvm_put_area);
 
-int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
+int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
 			const struct ppa_addr *ppas, int nr_ppas, int vblk)
 {
-	struct nvm_geo *geo = &dev->geo;
+	struct nvm_dev *dev = tgt_dev->parent;
+	struct nvm_geo *geo = &tgt_dev->geo;
 	int i, plane_cnt, pl_idx;
 	struct ppa_addr ppa;
 
@@ -773,12 +785,12 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
 }
 EXPORT_SYMBOL(nvm_set_rqd_ppalist);
 
-void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
+void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
 {
 	if (!rqd->ppa_list)
 		return;
 
-	nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list);
+	nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
 }
 EXPORT_SYMBOL(nvm_free_rqd_ppalist);
 
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index e68efbcf1188..4e4c2997337c 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -414,7 +414,6 @@ static void rrpc_block_gc(struct work_struct *work)
 	struct rrpc *rrpc = gcb->rrpc;
 	struct rrpc_block *rblk = gcb->rblk;
 	struct rrpc_lun *rlun = rblk->rlun;
-	struct nvm_tgt_dev *dev = rrpc->dev;
 	struct ppa_addr ppa;
 
 	mempool_free(gcb, rrpc->gcb_pool);
@@ -430,7 +429,7 @@ static void rrpc_block_gc(struct work_struct *work)
 	ppa.g.lun = rlun->bppa.g.lun;
 	ppa.g.blk = rblk->id;
 
-	if (nvm_erase_blk(dev, &ppa, 0))
+	if (nvm_erase_sync(rrpc->dev, &ppa, 1))
 		goto put_back;
 
 	rrpc_put_blk(rrpc, rblk);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index fd9895423f55..4ea9c93fbbe0 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -510,12 +510,16 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 	}
 	rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
 
-	rq->ioprio = bio_prio(bio);
-	if (bio_has_data(bio))
-		rq->nr_phys_segments = bio_phys_segments(q, bio);
-
-	rq->__data_len = bio->bi_iter.bi_size;
-	rq->bio = rq->biotail = bio;
+	if (bio) {
+		rq->ioprio = bio_prio(bio);
+		rq->__data_len = bio->bi_iter.bi_size;
+		rq->bio = rq->biotail = bio;
+		if (bio_has_data(bio))
+			rq->nr_phys_segments = bio_phys_segments(q, bio);
+	} else {
+		rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
+		rq->__data_len = 0;
+	}
 
 	nvme_nvm_rqtocmd(rq, rqd, ns, cmd);
 
@@ -526,21 +530,6 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
 	return 0;
 }
 
-static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
-	struct request_queue *q = dev->q;
-	struct nvme_ns *ns = q->queuedata;
-	struct nvme_nvm_command c = {};
-
-	c.erase.opcode = NVM_OP_ERASE;
-	c.erase.nsid = cpu_to_le32(ns->ns_id);
-	c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa);
-	c.erase.length = cpu_to_le16(rqd->nr_ppas - 1);
-	c.erase.control = cpu_to_le16(rqd->flags);
-
-	return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0);
-}
-
 static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
 {
 	struct nvme_ns *ns = nvmdev->q->queuedata;
@@ -576,7 +565,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
 	.set_bb_tbl		= nvme_nvm_set_bb_tbl,
 
 	.submit_io		= nvme_nvm_submit_io,
-	.erase_block		= nvme_nvm_erase_block,
 
 	.create_dma_pool	= nvme_nvm_create_dma_pool,
 	.destroy_dma_pool	= nvme_nvm_destroy_dma_pool,
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ca45e4a088a9..e11163f9b3b7 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -56,7 +56,6 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
 typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
 typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
 typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
 typedef void (nvm_destroy_dma_pool_fn)(void *);
 typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
@@ -70,7 +69,6 @@ struct nvm_dev_ops {
 	nvm_op_set_bb_fn	*set_bb_tbl;
 
 	nvm_submit_io_fn	*submit_io;
-	nvm_erase_blk_fn	*erase_block;
 
 	nvm_create_dma_pool_fn	*create_dma_pool;
 	nvm_destroy_dma_pool_fn	*destroy_dma_pool;
@@ -479,10 +477,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
 			      int, int);
 extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
 extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
-extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
+extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *,
 					const struct ppa_addr *, int, int);
-extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
-extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *);
 extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
 			   void *);
 extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
-- 
cgit v1.3-6-gb490


From a7737f39c70d9c63ba530d6316724d7be67de541 Mon Sep 17 00:00:00 2001
From: Javier González <jg@lightnvm.io>
Date: Sat, 15 Apr 2017 20:55:38 +0200
Subject: lightnvm: rename scrambler controller hint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

According to the OCSSD 1.2 specification, the 0x200 hint enables the
media scrambler for the read/write opcode, providing that the controller
has been correctly configured by the firmware. Rename the macro to
represent this meaning.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/lightnvm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index e11163f9b3b7..eff7d1f312a8 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -123,7 +123,7 @@ enum {
 	/* NAND Access Modes */
 	NVM_IO_SUSPEND		= 0x80,
 	NVM_IO_SLC_MODE		= 0x100,
-	NVM_IO_SCRAMBLE_DISABLE	= 0x200,
+	NVM_IO_SCRAMBLE_ENABLE	= 0x200,
 
 	/* Block Types */
 	NVM_BLK_T_FREE		= 0x0,
-- 
cgit v1.3-6-gb490


From 4af3f75d7992dd0dc49da95fbc039fa3806fba4f Mon Sep 17 00:00:00 2001
From: Javier González <jg@lightnvm.io>
Date: Sat, 15 Apr 2017 20:55:45 +0200
Subject: lightnvm: allow to init targets on factory mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Target initialization has two responsibilities: creating the target
partition and instantiating the target. This patch enables to create a
factory partition (e.g., do not trigger recovery on the given target).
This is useful for target development and for being able to restore the
device state at any moment in time without requiring a full-device
erase.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <matias@cnexlabs.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/lightnvm/core.c       | 14 +++++++++++---
 drivers/lightnvm/rrpc.c       |  3 ++-
 include/linux/lightnvm.h      |  3 ++-
 include/uapi/linux/lightnvm.h |  4 ++++
 4 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 5f84d2a418f6..a63b563b1a8a 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -280,7 +280,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 	tdisk->fops = &nvm_fops;
 	tdisk->queue = tqueue;
 
-	targetdata = tt->init(tgt_dev, tdisk);
+	targetdata = tt->init(tgt_dev, tdisk, create->flags);
 	if (IS_ERR(targetdata))
 		goto err_init;
 
@@ -1244,8 +1244,16 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
 	create.tgtname[DISK_NAME_LEN - 1] = '\0';
 
 	if (create.flags != 0) {
-		pr_err("nvm: no flags supported\n");
-		return -EINVAL;
+		__u32 flags = create.flags;
+
+		/* Check for valid flags */
+		if (flags & NVM_TARGET_FACTORY)
+			flags &= ~NVM_TARGET_FACTORY;
+
+		if (flags) {
+			pr_err("nvm: flag not supported\n");
+			return -EINVAL;
+		}
 	}
 
 	return __nvm_configure_create(&create);
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index a8acf9e06401..5dba54470c2a 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -1506,7 +1506,8 @@ err:
 
 static struct nvm_tgt_type tt_rrpc;
 
-static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk)
+static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+		       int flags)
 {
 	struct request_queue *bqueue = dev->q;
 	struct request_queue *tqueue = tdisk->queue;
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index eff7d1f312a8..7dfa56ebbc6d 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -436,7 +436,8 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
 
 typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
 typedef sector_t (nvm_tgt_capacity_fn)(void *);
-typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *);
+typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
+				int flags);
 typedef void (nvm_tgt_exit_fn)(void *);
 typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
 typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h
index fd19f36b3129..c8aec4b9e73b 100644
--- a/include/uapi/linux/lightnvm.h
+++ b/include/uapi/linux/lightnvm.h
@@ -85,6 +85,10 @@ struct nvm_ioctl_create_conf {
 	};
 };
 
+enum {
+	NVM_TARGET_FACTORY = 1 << 0,	/* Init target in factory mode */
+};
+
 struct nvm_ioctl_create {
 	char dev[DISK_NAME_LEN];		/* open-channel SSD device */
 	char tgttype[NVM_TTYPE_NAME_MAX];	/* target type name */
-- 
cgit v1.3-6-gb490


From 1b72e7fd304639f1cd49d1e11955c4974936d88c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 11 Apr 2017 00:20:41 +0200
Subject: cpufreq: schedutil: Use policy-dependent transition delays

Make the schedutil governor take the initial (default) value of the
rate_limit_us sysfs attribute from the (new) transition_delay_us
policy parameter (to be set by the scaling driver).

That will allow scaling drivers to make schedutil use smaller default
values of rate_limit_us and reduce the default average time interval
between consecutive frequency changes.

Make intel_pstate set transition_delay_us to 500.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/intel_pstate.c   |  2 ++
 include/linux/cpufreq.h          |  7 +++++++
 kernel/sched/cpufreq_schedutil.c | 15 ++++++++++-----
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c31b72b16c2b..b7de5bd76a31 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -41,6 +41,7 @@
 #define INTEL_PSTATE_HWP_SAMPLING_INTERVAL	(50 * NSEC_PER_MSEC)
 
 #define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
+#define INTEL_CPUFREQ_TRANSITION_DELAY		500
 
 #ifdef CONFIG_ACPI
 #include <acpi/processor.h>
@@ -2237,6 +2238,7 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		return ret;
 
 	policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
+	policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
 	/* This reflects the intel_pstate_get_cpu_pstates() setting. */
 	policy->cur = policy->cpuinfo.min_freq;
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 87165f06a307..a5ce0bbeadb5 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -120,6 +120,13 @@ struct cpufreq_policy {
 	bool			fast_switch_possible;
 	bool			fast_switch_enabled;
 
+	/*
+	 * Preferred average time interval between consecutive invocations of
+	 * the driver to set the frequency for this policy.  To be set by the
+	 * scaling driver (0, which is the default, means no preference).
+	 */
+	unsigned int		transition_delay_us;
+
 	 /* Cached frequency lookup from cpufreq_driver_resolve_freq. */
 	unsigned int cached_target_freq;
 	int cached_resolved_idx;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index b1fedf9932d6..76877a62b5fa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -494,7 +494,6 @@ static int sugov_init(struct cpufreq_policy *policy)
 {
 	struct sugov_policy *sg_policy;
 	struct sugov_tunables *tunables;
-	unsigned int lat;
 	int ret = 0;
 
 	/* State should be equivalent to EXIT */
@@ -533,10 +532,16 @@ static int sugov_init(struct cpufreq_policy *policy)
 		goto stop_kthread;
 	}
 
-	tunables->rate_limit_us = LATENCY_MULTIPLIER;
-	lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
-	if (lat)
-		tunables->rate_limit_us *= lat;
+	if (policy->transition_delay_us) {
+		tunables->rate_limit_us = policy->transition_delay_us;
+	} else {
+		unsigned int lat;
+
+		tunables->rate_limit_us = LATENCY_MULTIPLIER;
+		lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+		if (lat)
+			tunables->rate_limit_us *= lat;
+	}
 
 	policy->governor_data = sg_policy;
 	sg_policy->tunables = tunables;
-- 
cgit v1.3-6-gb490


From f51b17c8d90f85456579c3192ab59ee031835634 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Mon, 17 Apr 2017 21:34:56 +0800
Subject: boot/param: Move next_arg() function to lib/cmdline.c for later reuse
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

next_arg() will be used to parse boot parameters in the x86/boot/compressed code,
so move it to lib/cmdline.c for better code reuse.

No change in functionality.

Signed-off-by: Baoquan He <bhe@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Gustavo Padovan <gustavo.padovan@collabora.co.uk>
Cc: Jens Axboe <axboe@fb.com>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Johannes Berg <johannes.berg@intel.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Larry Finger <Larry.Finger@lwfinger.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dan.j.williams@intel.com
Cc: dave.jiang@intel.com
Cc: dyoung@redhat.com
Cc: keescook@chromium.org
Cc: zijun_hu <zijun_hu@htc.com>
Link: http://lkml.kernel.org/r/1492436099-4017-2-git-send-email-bhe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kernel.h |  1 +
 kernel/params.c        | 52 ---------------------------------------------
 lib/cmdline.c          | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4c26dc3a8295..7ae256717a32 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -438,6 +438,7 @@ extern int get_option(char **str, int *pint);
 extern char *get_options(const char *str, int nints, int *ints);
 extern unsigned long long memparse(const char *ptr, char **retptr);
 extern bool parse_option_str(const char *str, const char *option);
+extern char *next_arg(char *args, char **param, char **val);
 
 extern int core_kernel_text(unsigned long addr);
 extern int core_kernel_data(unsigned long addr);
diff --git a/kernel/params.c b/kernel/params.c
index a6d6149c0fe6..60b2d8101355 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -160,58 +160,6 @@ static int parse_one(char *param,
 	return -ENOENT;
 }
 
-/* You can use " around spaces, but can't escape ". */
-/* Hyphens and underscores equivalent in parameter names. */
-static char *next_arg(char *args, char **param, char **val)
-{
-	unsigned int i, equals = 0;
-	int in_quote = 0, quoted = 0;
-	char *next;
-
-	if (*args == '"') {
-		args++;
-		in_quote = 1;
-		quoted = 1;
-	}
-
-	for (i = 0; args[i]; i++) {
-		if (isspace(args[i]) && !in_quote)
-			break;
-		if (equals == 0) {
-			if (args[i] == '=')
-				equals = i;
-		}
-		if (args[i] == '"')
-			in_quote = !in_quote;
-	}
-
-	*param = args;
-	if (!equals)
-		*val = NULL;
-	else {
-		args[equals] = '\0';
-		*val = args + equals + 1;
-
-		/* Don't include quotes in value. */
-		if (**val == '"') {
-			(*val)++;
-			if (args[i-1] == '"')
-				args[i-1] = '\0';
-		}
-	}
-	if (quoted && args[i-1] == '"')
-		args[i-1] = '\0';
-
-	if (args[i]) {
-		args[i] = '\0';
-		next = args + i + 1;
-	} else
-		next = args + i;
-
-	/* Chew up trailing spaces. */
-	return skip_spaces(next);
-}
-
 /* Args looks like "foo=bar,bar2 baz=fuz wiz". */
 char *parse_args(const char *doing,
 		 char *args,
diff --git a/lib/cmdline.c b/lib/cmdline.c
index 8f13cf73c2ec..3c6432df7e63 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -15,6 +15,7 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/ctype.h>
 
 /*
  *	If a hyphen was found in get_option, this will handle the
@@ -189,3 +190,59 @@ bool parse_option_str(const char *str, const char *option)
 
 	return false;
 }
+
+/*
+ * Parse a string to get a param value pair.
+ * You can use " around spaces, but can't escape ".
+ * Hyphens and underscores equivalent in parameter names.
+ */
+char *next_arg(char *args, char **param, char **val)
+{
+	unsigned int i, equals = 0;
+	int in_quote = 0, quoted = 0;
+	char *next;
+
+	if (*args == '"') {
+		args++;
+		in_quote = 1;
+		quoted = 1;
+	}
+
+	for (i = 0; args[i]; i++) {
+		if (isspace(args[i]) && !in_quote)
+			break;
+		if (equals == 0) {
+			if (args[i] == '=')
+				equals = i;
+		}
+		if (args[i] == '"')
+			in_quote = !in_quote;
+	}
+
+	*param = args;
+	if (!equals)
+		*val = NULL;
+	else {
+		args[equals] = '\0';
+		*val = args + equals + 1;
+
+		/* Don't include quotes in value. */
+		if (**val == '"') {
+			(*val)++;
+			if (args[i-1] == '"')
+				args[i-1] = '\0';
+		}
+	}
+	if (quoted && args[i-1] == '"')
+		args[i-1] = '\0';
+
+	if (args[i]) {
+		args[i] = '\0';
+		next = args + i + 1;
+	} else
+		next = args + i;
+
+	/* Chew up trailing spaces. */
+	return skip_spaces(next);
+	//return next;
+}
-- 
cgit v1.3-6-gb490


From f49c3f90a31f6e19ef3343dcc8809dac1019b59e Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Fri, 7 Apr 2017 16:22:47 +0800
Subject: ACPI / tables: Drop acpi_parse_entries() which is not used

Function acpi_parse_entries() is not used any more and if necessary,
acpi_table_parse_entries() can be used instead of it, so drop it.

Signed-off-by: Baoquan He <bhe@redhat.com>
[ rjw: Subject / changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/tables.c | 16 ----------------
 include/linux/acpi.h  |  4 ----
 2 files changed, 20 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 2604189d6cd1..0dae722ab2ec 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -310,22 +310,6 @@ acpi_parse_entries_array(char *id, unsigned long table_size,
 	return errs ? -EINVAL : count;
 }
 
-int __init
-acpi_parse_entries(char *id,
-			unsigned long table_size,
-			acpi_tbl_entry_handler handler,
-			struct acpi_table_header *table_header,
-			int entry_id, unsigned int max_entries)
-{
-	struct acpi_subtable_proc proc = {
-		.id		= entry_id,
-		.handler	= handler,
-	};
-
-	return acpi_parse_entries_array(id, table_size, table_header,
-			&proc, 1, max_entries);
-}
-
 int __init
 acpi_table_parse_entries_array(char *id,
 			 unsigned long table_size,
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9b05886f9773..83abbfceabad 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -233,10 +233,6 @@ int acpi_numa_init (void);
 
 int acpi_table_init (void);
 int acpi_table_parse(char *id, acpi_tbl_table_handler handler);
-int __init acpi_parse_entries(char *id, unsigned long table_size,
-			      acpi_tbl_entry_handler handler,
-			      struct acpi_table_header *table_header,
-			      int entry_id, unsigned int max_entries);
 int __init acpi_table_parse_entries(char *id, unsigned long table_size,
 			      int entry_id,
 			      acpi_tbl_entry_handler handler,
-- 
cgit v1.3-6-gb490


From e21b7a0b988772e82e7147e1c659a5afe2ae003c Mon Sep 17 00:00:00 2001
From: Arianna Avanzini <avanzini.arianna@gmail.com>
Date: Wed, 12 Apr 2017 18:23:08 +0200
Subject: block, bfq: add full hierarchical scheduling and cgroups support

Add complete support for full hierarchical scheduling, with a cgroups
interface. Full hierarchical scheduling is implemented through the
'entity' abstraction: both bfq_queues, i.e., the internal BFQ queues
associated with processes, and groups are represented in general by
entities. Given the bfq_queues associated with the processes belonging
to a given group, the entities representing these queues are sons of
the entity representing the group. At higher levels, if a group, say
G, contains other groups, then the entity representing G is the parent
entity of the entities representing the groups in G.

Hierarchical scheduling is performed as follows: if the timestamps of
a leaf entity (i.e., of a bfq_queue) change, and such a change lets
the entity become the next-to-serve entity for its parent entity, then
the timestamps of the parent entity are recomputed as a function of
the budget of its new next-to-serve leaf entity. If the parent entity
belongs, in its turn, to a group, and its new timestamps let it become
the next-to-serve for its parent entity, then the timestamps of the
latter parent entity are recomputed as well, and so on. When a new
bfq_queue must be set in service, the reverse path is followed: the
next-to-serve highest-level entity is chosen, then its next-to-serve
child entity, and so on, until the next-to-serve leaf entity is
reached, and the bfq_queue that this entity represents is set in
service.

Writeback is accounted for on a per-group basis, i.e., for each group,
the async I/O requests of the processes of the group are enqueued in a
distinct bfq_queue, and the entity associated with this queue is a
child of the entity associated with the group.

Weights can be assigned explicitly to groups and processes through the
cgroups interface, differently from what happens, for single
processes, if the cgroups interface is not used (as explained in the
description of the previous patch). In particular, since each node has
a full scheduler, each group can be assigned its own weight.

Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 Documentation/block/bfq-iosched.txt |   17 +-
 block/Kconfig.iosched               |   10 +
 block/bfq-iosched.c                 | 2568 ++++++++++++++++++++++++++++++-----
 include/linux/blkdev.h              |    2 +-
 4 files changed, 2213 insertions(+), 384 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
index cbf85f6f1fd8..461b27fce979 100644
--- a/Documentation/block/bfq-iosched.txt
+++ b/Documentation/block/bfq-iosched.txt
@@ -253,9 +253,14 @@ of slice_idle are copied from CFQ too.
 per-process ioprio and weight
 -----------------------------
 
-Unless the cgroups interface is used, weights can be assigned to
-processes only indirectly, through I/O priorities, and according to
-the relation: weight = (IOPRIO_BE_NR - ioprio) * 10.
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+weights can be assigned to processes only indirectly, through I/O
+priorities, and according to the relation:
+weight = (IOPRIO_BE_NR - ioprio) * 10.
+
+Beware that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
 
 slice_idle
 ----------
@@ -450,9 +455,9 @@ may be reactivated for an already busy async queue (in ms).
 4. Group scheduling with BFQ
 ============================
 
-BFQ supports both cgroup-v1 and cgroup-v2 io controllers, namely blkio
-and io. In particular, BFQ supports weight-based proportional
-share.
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
+blkio and io. In particular, BFQ supports weight-based proportional
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
 
 4-1 Service guarantees provided
 -------------------------------
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 6fc36027b70e..fd2cefa47d35 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -40,6 +40,7 @@ config CFQ_GROUP_IOSCHED
 	  Enable group IO scheduling in CFQ.
 
 choice
+
 	prompt "Default I/O scheduler"
 	default DEFAULT_CFQ
 	help
@@ -89,6 +90,15 @@ config IOSCHED_BFQ
 	real-time applications.  Details in
 	Documentation/block/bfq-iosched.txt
 
+config BFQ_GROUP_IOSCHED
+       bool "BFQ hierarchical scheduling support"
+       depends on IOSCHED_BFQ && BLK_CGROUP
+       default n
+       ---help---
+
+       Enable hierarchical scheduling in BFQ, using the blkio
+       (cgroups-v1) or io (cgroups-v2) controller.
+
 endmenu
 
 endif
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index c4e7d8db796a..af1740a1d453 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -90,6 +90,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/cgroup.h>
 #include <linux/elevator.h>
 #include <linux/ktime.h>
 #include <linux/rbtree.h>
@@ -114,7 +115,7 @@
 
 #define BFQ_DEFAULT_QUEUE_IOPRIO	4
 
-#define BFQ_DEFAULT_GRP_WEIGHT	10
+#define BFQ_WEIGHT_LEGACY_DFL	100
 #define BFQ_DEFAULT_GRP_IOPRIO	0
 #define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
 
@@ -149,10 +150,11 @@ struct bfq_service_tree {
  * struct bfq_sched_data - multi-class scheduler.
  *
  * bfq_sched_data is the basic scheduler queue.  It supports three
- * ioprio_classes, and can be used either as a toplevel queue or as
- * an intermediate queue on a hierarchical setup.
- * @next_in_service points to the active entity of the sched_data
- * service trees that will be scheduled next.
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+ * intermediate queue on a hierarchical setup.  @next_in_service
+ * points to the active entity of the sched_data service trees that
+ * will be scheduled next. It is used to reduce the number of steps
+ * needed for each hierarchical-schedule update.
  *
  * The supported ioprio_classes are the same as in CFQ, in descending
  * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
@@ -164,19 +166,23 @@ struct bfq_service_tree {
 struct bfq_sched_data {
 	/* entity in service */
 	struct bfq_entity *in_service_entity;
-	/* head-of-the-line entity in the scheduler */
+	/* head-of-line entity (see comments above) */
 	struct bfq_entity *next_in_service;
 	/* array of service trees, one per ioprio_class */
 	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+	/* last time CLASS_IDLE was served */
+	unsigned long bfq_class_idle_last_service;
+
 };
 
 /**
  * struct bfq_entity - schedulable entity.
  *
- * A bfq_entity is used to represent a bfq_queue (leaf node in the upper
- * level scheduler). Each entity belongs to the sched_data of the parent
- * group hierarchy. Non-leaf entities have also their own sched_data,
- * stored in @my_sched_data.
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy.  Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
  *
  * Each entity stores independently its priority values; this would
  * allow different weights on different devices, but this
@@ -187,23 +193,24 @@ struct bfq_sched_data {
  * update to take place the effective and the requested priority
  * values are synchronized.
  *
- * The weight value is calculated from the ioprio to export the same
- * interface as CFQ.  When dealing with  ``well-behaved'' queues (i.e.,
- * queues that do not spend too much time to consume their budget
- * and have true sequential behavior, and when there are no external
- * factors breaking anticipation) the relative weights at each level
- * of the hierarchy should be guaranteed.  All the fields are
- * protected by the queue lock of the containing bfqd.
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ.  When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed.  All the fields are protected by the queue lock of the
+ * containing bfqd.
  */
 struct bfq_entity {
 	/* service_tree member */
 	struct rb_node rb_node;
 
 	/*
-	 * flag, true if the entity is on a tree (either the active or
-	 * the idle one of its service_tree).
+	 * Flag, true if the entity is on a tree (either the active or
+	 * the idle one of its service_tree) or is in service.
 	 */
-	int on_st;
+	bool on_st;
 
 	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
 	u64 start, finish;
@@ -246,6 +253,8 @@ struct bfq_entity {
 	int prio_changed;
 };
 
+struct bfq_group;
+
 /**
  * struct bfq_ttime - per process thinktime stats.
  */
@@ -265,7 +274,11 @@ struct bfq_ttime {
  * struct bfq_queue - leaf schedulable entity.
  *
  * A bfq_queue is a leaf request queue; it can be associated with an
- * io_context or more, if it is async.
+ * io_context or more, if it is async. @cgroup holds a reference to
+ * the cgroup, to be sure that it does not disappear while a bfqq
+ * still references it (mostly to avoid races between request issuing
+ * and task migration followed by cgroup destruction).  All the fields
+ * are protected by the queue lock of the containing bfqd.
  */
 struct bfq_queue {
 	/* reference counter */
@@ -338,6 +351,9 @@ struct bfq_io_cq {
 	struct bfq_queue *bfqq[2];
 	/* per (request_queue, blkcg) ioprio */
 	int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
 };
 
 /**
@@ -351,8 +367,8 @@ struct bfq_data {
 	/* dispatch queue */
 	struct list_head dispatch;
 
-	/* root @bfq_sched_data for the device */
-	struct bfq_sched_data sched_data;
+	/* root bfq_group for the device */
+	struct bfq_group *root_group;
 
 	/*
 	 * Number of bfq_queues containing requests (including the
@@ -423,8 +439,6 @@ struct bfq_data {
 	unsigned int bfq_back_max;
 	/* maximum idling time */
 	u32 bfq_slice_idle;
-	/* last time CLASS_IDLE was served */
-	u64 bfq_class_idle_last_service;
 
 	/* user-configured max budget value (0 for auto-tuning) */
 	int bfq_user_max_budget;
@@ -516,8 +530,35 @@ BFQ_BFQQ_FNS(IO_bound);
 #undef BFQ_BFQQ_FNS
 
 /* Logging facilities. */
-#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
-	blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args)
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
+			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
+			  __pbuf, ##args);				\
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
+	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
+			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
+				##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
 
 #define bfq_log(bfqd, fmt, args...) \
 	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
@@ -534,15 +575,120 @@ enum bfqq_expiration {
 	BFQQE_PREEMPTED		/* preemption in progress */
 };
 
+struct bfqg_stats {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	/* number of ios merged */
+	struct blkg_rwstat		merged;
+	/* total time spent on device in ns, may not be accurate w/ queueing */
+	struct blkg_rwstat		service_time;
+	/* total time spent waiting in scheduler queue in ns */
+	struct blkg_rwstat		wait_time;
+	/* number of IOs queued up */
+	struct blkg_rwstat		queued;
+	/* total disk time and nr sectors dispatched by this group */
+	struct blkg_stat		time;
+	/* sum of number of ios queued across all samples */
+	struct blkg_stat		avg_queue_size_sum;
+	/* count of samples taken for average */
+	struct blkg_stat		avg_queue_size_samples;
+	/* how many times this group has been removed from service tree */
+	struct blkg_stat		dequeue;
+	/* total time spent waiting for it to be assigned a timeslice. */
+	struct blkg_stat		group_wait_time;
+	/* time spent idling for this blkcg_gq */
+	struct blkg_stat		idle_time;
+	/* total time with empty current active q with other requests queued */
+	struct blkg_stat		empty_time;
+	/* fields after this shouldn't be cleared on stat reset */
+	uint64_t			start_group_wait_time;
+	uint64_t			start_idle_time;
+	uint64_t			start_empty_time;
+	uint16_t			flags;
+#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @ps: @blkcg_policy_storage that this structure inherits
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+	/* must be the first member */
+	struct blkcg_policy_data pd;
+
+	unsigned short weight;
+};
+
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ *              both bfq_queues and bfq_groups).
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ *              the group, one queue per ioprio value per ioprio_class,
+ *              except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ *             to avoid too many special cases during group creation/
+ *             migration.
+ * @stats: stats for this bfqg.
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ *    o @bfqd is protected by the queue lock, RCU is used to access it
+ *      from the readers.
+ *    o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
+	struct bfq_entity entity;
+	struct bfq_sched_data sched_data;
+
+	void *bfqd;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct bfq_entity *my_entity;
+
+	struct bfqg_stats stats;
+};
+
+#else
+struct bfq_group {
+	struct bfq_sched_data sched_data;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct rb_root rq_pos_tree;
+};
+#endif
+
 static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
 
+static unsigned int bfq_class_idx(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	return bfqq ? bfqq->ioprio_class - 1 :
+		BFQ_DEFAULT_GRP_CLASS - 1;
+}
+
 static struct bfq_service_tree *
 bfq_entity_service_tree(struct bfq_entity *entity)
 {
 	struct bfq_sched_data *sched_data = entity->sched_data;
-	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
-	unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :
-				  BFQ_DEFAULT_GRP_CLASS - 1;
+	unsigned int idx = bfq_class_idx(entity);
 
 	return sched_data->service_tree + idx;
 }
@@ -568,16 +714,9 @@ static void bfq_put_queue(struct bfq_queue *bfqq);
 static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 				       struct bio *bio, bool is_sync,
 				       struct bfq_io_cq *bic);
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
 
-/*
- * Array of async queues for all the processes, one queue
- * per ioprio value per ioprio_class.
- */
-struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
-/* Async queue for the idle class (ioprio is ignored) */
-struct bfq_queue *async_idle_bfqq;
-
 /* Expiration time of sync (0) and async (1) requests, in ns. */
 static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
 
@@ -663,30 +802,222 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
 }
 
 /*
- * Next two macros are just fake loops for the moment. They will
- * become true loops in the cgroups-enabled variant of the code. Such
- * a variant, in its turn, will be introduced by next commit.
+ * Scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing.
+ */
+static void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+	if (bfqd->queued != 0) {
+		bfq_log(bfqd, "schedule dispatch");
+		blk_mq_run_hw_queues(bfqd->queue, true);
+	}
+}
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+	return (s64)(a - b) > 0;
+}
+
+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
+{
+	struct rb_node *node = tree->rb_node;
+
+	return rb_entry(node, struct bfq_entity, rb_node);
+}
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd);
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
+
+/**
+ * bfq_update_next_in_service - update sd->next_in_service
+ * @sd: sched_data for which to perform the update.
+ * @new_entity: if not NULL, pointer to the entity whose activation,
+ *		requeueing or repositionig triggered the invocation of
+ *		this function.
+ *
+ * This function is called to update sd->next_in_service, which, in
+ * its turn, may change as a consequence of the insertion or
+ * extraction of an entity into/from one of the active trees of
+ * sd. These insertions/extractions occur as a consequence of
+ * activations/deactivations of entities, with some activations being
+ * 'true' activations, and other activations being requeueings (i.e.,
+ * implementing the second, requeueing phase of the mechanism used to
+ * reposition an entity in its active tree; see comments on
+ * __bfq_activate_entity and __bfq_requeue_entity for details). In
+ * both the last two activation sub-cases, new_entity points to the
+ * just activated or requeued entity.
+ *
+ * Returns true if sd->next_in_service changes in such a way that
+ * entity->parent may become the next_in_service for its parent
+ * entity.
  */
+static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+				       struct bfq_entity *new_entity)
+{
+	struct bfq_entity *next_in_service = sd->next_in_service;
+	bool parent_sched_may_change = false;
+
+	/*
+	 * If this update is triggered by the activation, requeueing
+	 * or repositiong of an entity that does not coincide with
+	 * sd->next_in_service, then a full lookup in the active tree
+	 * can be avoided. In fact, it is enough to check whether the
+	 * just-modified entity has a higher priority than
+	 * sd->next_in_service, or, even if it has the same priority
+	 * as sd->next_in_service, is eligible and has a lower virtual
+	 * finish time than sd->next_in_service. If this compound
+	 * condition holds, then the new entity becomes the new
+	 * next_in_service. Otherwise no change is needed.
+	 */
+	if (new_entity && new_entity != sd->next_in_service) {
+		/*
+		 * Flag used to decide whether to replace
+		 * sd->next_in_service with new_entity. Tentatively
+		 * set to true, and left as true if
+		 * sd->next_in_service is NULL.
+		 */
+		bool replace_next = true;
+
+		/*
+		 * If there is already a next_in_service candidate
+		 * entity, then compare class priorities or timestamps
+		 * to decide whether to replace sd->service_tree with
+		 * new_entity.
+		 */
+		if (next_in_service) {
+			unsigned int new_entity_class_idx =
+				bfq_class_idx(new_entity);
+			struct bfq_service_tree *st =
+				sd->service_tree + new_entity_class_idx;
+
+			/*
+			 * For efficiency, evaluate the most likely
+			 * sub-condition first.
+			 */
+			replace_next =
+				(new_entity_class_idx ==
+				 bfq_class_idx(next_in_service)
+				 &&
+				 !bfq_gt(new_entity->start, st->vtime)
+				 &&
+				 bfq_gt(next_in_service->finish,
+					new_entity->finish))
+				||
+				new_entity_class_idx <
+				bfq_class_idx(next_in_service);
+		}
+
+		if (replace_next)
+			next_in_service = new_entity;
+	} else /* invoked because of a deactivation: lookup needed */
+		next_in_service = bfq_lookup_next_entity(sd);
+
+	if (next_in_service) {
+		parent_sched_may_change = !sd->next_in_service ||
+			bfq_update_parent_budget(next_in_service);
+	}
+
+	sd->next_in_service = next_in_service;
+
+	if (!next_in_service)
+		return parent_sched_may_change;
+
+	return parent_sched_may_change;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/* both next loops stop at one of the child entities of the root group */
 #define for_each_entity(entity)	\
-	for (; entity ; entity = NULL)
+	for (; entity ; entity = entity->parent)
 
+/*
+ * For each iteration, compute parent in advance, so as to be safe if
+ * entity is deallocated during the iteration. Such a deallocation may
+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue
+ * containing entity.
+ */
 #define for_each_entity_safe(entity, parent) \
-	for (parent = NULL; entity ; entity = parent)
+	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
 
-static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+/*
+ * Returns true if this budget changes may let next_in_service->parent
+ * become the next_in_service entity for its parent entity.
+ */
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
 {
-	return 0;
+	struct bfq_entity *bfqg_entity;
+	struct bfq_group *bfqg;
+	struct bfq_sched_data *group_sd;
+	bool ret = false;
+
+	group_sd = next_in_service->sched_data;
+
+	bfqg = container_of(group_sd, struct bfq_group, sched_data);
+	/*
+	 * bfq_group's my_entity field is not NULL only if the group
+	 * is not the root group. We must not touch the root entity
+	 * as it must never become an in-service entity.
+	 */
+	bfqg_entity = bfqg->my_entity;
+	if (bfqg_entity) {
+		if (bfqg_entity->budget > next_in_service->budget)
+			ret = true;
+		bfqg_entity->budget = next_in_service->budget;
+	}
+
+	return ret;
+}
+
+/*
+ * This function tells whether entity stops being a candidate for next
+ * service, according to the following logic.
+ *
+ * This function is invoked for an entity that is about to be set in
+ * service. If such an entity is a queue, then the entity is no longer
+ * a candidate for next service (i.e, a candidate entity to serve
+ * after the in-service entity is expired). The function then returns
+ * true.
+ */
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+	if (bfq_entity_to_bfqq(entity))
+		return true;
+
+	return false;
 }
 
-static void bfq_check_next_in_service(struct bfq_sched_data *sd,
-				      struct bfq_entity *entity)
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+/*
+ * Next two macros are fake loops when cgroups support is not
+ * enabled. I fact, in such a case, there is only one level to go up
+ * (to reach the root group).
+ */
+#define for_each_entity(entity)	\
+	for (; entity ; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+	for (parent = NULL; entity ; entity = parent)
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
 {
+	return false;
 }
 
-static void bfq_update_budget(struct bfq_entity *next_in_service)
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
 {
+	return true;
 }
 
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
 /*
  * Shift for timestamp calculations.  This actually limits the maximum
  * service allowed in one timestamp delta (small shift values increase it),
@@ -696,18 +1027,6 @@ static void bfq_update_budget(struct bfq_entity *next_in_service)
  */
 #define WFQ_SERVICE_SHIFT	22
 
-/**
- * bfq_gt - compare two timestamps.
- * @a: first ts.
- * @b: second ts.
- *
- * Return @a > @b, dealing with wrapping correctly.
- */
-static int bfq_gt(u64 a, u64 b)
-{
-	return (s64)(a - b) > 0;
-}
-
 static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
 {
 	struct bfq_queue *bfqq = NULL;
@@ -926,6 +1245,11 @@ static void bfq_active_insert(struct bfq_service_tree *st,
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 	struct rb_node *node = &entity->rb_node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
 
 	bfq_insert(&st->active, entity);
 
@@ -936,6 +1260,11 @@ static void bfq_active_insert(struct bfq_service_tree *st,
 
 	bfq_update_active_tree(node);
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
 	if (bfqq)
 		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
 }
@@ -1014,6 +1343,11 @@ static void bfq_active_extract(struct bfq_service_tree *st,
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 	struct rb_node *node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
 
 	node = bfq_find_deepest(&entity->rb_node);
 	bfq_extract(&st->active, entity);
@@ -1021,6 +1355,11 @@ static void bfq_active_extract(struct bfq_service_tree *st,
 	if (node)
 		bfq_update_active_tree(node);
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
 	if (bfqq)
 		list_del(&bfqq->bfqq_list);
 }
@@ -1069,7 +1408,7 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
-	entity->on_st = 0;
+	entity->on_st = false;
 	st->wsum -= entity->weight;
 	if (bfqq && !is_in_service)
 		bfq_put_queue(bfqq);
@@ -1115,7 +1454,7 @@ static void bfq_forget_idle(struct bfq_service_tree *st)
 
 static struct bfq_service_tree *
 __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
-			 struct bfq_entity *entity)
+				struct bfq_entity *entity)
 {
 	struct bfq_service_tree *new_st = old_st;
 
@@ -1123,9 +1462,20 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 		unsigned short prev_weight, new_weight;
 		struct bfq_data *bfqd = NULL;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		struct bfq_sched_data *sd;
+		struct bfq_group *bfqg;
+#endif
 
 		if (bfqq)
 			bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			sd = entity->my_sched_data;
+			bfqg = container_of(sd, struct bfq_group, sched_data);
+			bfqd = (struct bfq_data *)bfqg->bfqd;
+		}
+#endif
 
 		old_st->wsum -= entity->weight;
 
@@ -1171,6 +1521,9 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
 	return new_st;
 }
 
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
 /**
  * bfq_bfqq_served - update the scheduler status after selection for
  *                   service.
@@ -1194,6 +1547,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
 		st->vtime += bfq_delta(served, st->wsum);
 		bfq_forget_idle(st);
 	}
+	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
 	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
 }
 
@@ -1216,78 +1570,10 @@ static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq)
 	bfq_bfqq_served(bfqq, entity->budget - entity->service);
 }
 
-/**
- * __bfq_activate_entity - activate an entity.
- * @entity: the entity being activated.
- * @non_blocking_wait_rq: true if this entity was waiting for a request
- *
- * Called whenever an entity is activated, i.e., it is not active and one
- * of its children receives a new request, or has to be reactivated due to
- * budget exhaustion.  It uses the current budget of the entity (and the
- * service received if @entity is active) of the queue to calculate its
- * timestamps.
- */
-static void __bfq_activate_entity(struct bfq_entity *entity,
-				  bool non_blocking_wait_rq)
+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
+					struct bfq_service_tree *st,
+					bool backshifted)
 {
-	struct bfq_sched_data *sd = entity->sched_data;
-	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
-	bool backshifted = false;
-
-	if (entity == sd->in_service_entity) {
-		/*
-		 * If we are requeueing the current entity we have
-		 * to take care of not charging to it service it has
-		 * not received.
-		 */
-		bfq_calc_finish(entity, entity->service);
-		entity->start = entity->finish;
-		sd->in_service_entity = NULL;
-	} else if (entity->tree == &st->active) {
-		/*
-		 * Requeueing an entity due to a change of some
-		 * next_in_service entity below it.  We reuse the
-		 * old start time.
-		 */
-		bfq_active_extract(st, entity);
-	} else {
-		unsigned long long min_vstart;
-
-		/* See comments on bfq_fqq_update_budg_for_activation */
-		if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
-			backshifted = true;
-			min_vstart = entity->finish;
-		} else
-			min_vstart = st->vtime;
-
-		if (entity->tree == &st->idle) {
-			/*
-			 * Must be on the idle tree, bfq_idle_extract() will
-			 * check for that.
-			 */
-			bfq_idle_extract(st, entity);
-			entity->start = bfq_gt(min_vstart, entity->finish) ?
-				min_vstart : entity->finish;
-		} else {
-			/*
-			 * The finish time of the entity may be invalid, and
-			 * it is in the past for sure, otherwise the queue
-			 * would have been on the idle tree.
-			 */
-			entity->start = min_vstart;
-			st->wsum += entity->weight;
-			/*
-			 * entity is about to be inserted into a service tree,
-			 * and then set in service: get a reference to make
-			 * sure entity does not disappear until it is no
-			 * longer in service or scheduled for service.
-			 */
-			bfq_get_entity(entity);
-
-			entity->on_st = 1;
-		}
-	}
-
 	st = __bfq_entity_update_weight_prio(st, entity);
 	bfq_calc_finish(entity, entity->budget);
 
@@ -1329,27 +1615,185 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
 }
 
 /**
- * bfq_activate_entity - activate an entity and its ancestors if necessary.
+ * __bfq_activate_entity - handle activation of entity.
+ * @entity: the entity being activated.
+ * @non_blocking_wait_rq: true if entity was waiting for a request
+ *
+ * Called for a 'true' activation, i.e., if entity is not active and
+ * one of its children receives a new request.
+ *
+ * Basically, this function updates the timestamps of entity and
+ * inserts entity into its active tree, ater possible extracting it
+ * from its idle tree.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity,
+				  bool non_blocking_wait_rq)
+{
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	bool backshifted = false;
+	unsigned long long min_vstart;
+
+	/* See comments on bfq_fqq_update_budg_for_activation */
+	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
+		backshifted = true;
+		min_vstart = entity->finish;
+	} else
+		min_vstart = st->vtime;
+
+	if (entity->tree == &st->idle) {
+		/*
+		 * Must be on the idle tree, bfq_idle_extract() will
+		 * check for that.
+		 */
+		bfq_idle_extract(st, entity);
+		entity->start = bfq_gt(min_vstart, entity->finish) ?
+			min_vstart : entity->finish;
+	} else {
+		/*
+		 * The finish time of the entity may be invalid, and
+		 * it is in the past for sure, otherwise the queue
+		 * would have been on the idle tree.
+		 */
+		entity->start = min_vstart;
+		st->wsum += entity->weight;
+		/*
+		 * entity is about to be inserted into a service tree,
+		 * and then set in service: get a reference to make
+		 * sure entity does not disappear until it is no
+		 * longer in service or scheduled for service.
+		 */
+		bfq_get_entity(entity);
+
+		entity->on_st = true;
+	}
+
+	bfq_update_fin_time_enqueue(entity, st, backshifted);
+}
+
+/**
+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
+ * @entity: the entity being requeued or repositioned.
+ *
+ * Requeueing is needed if this entity stops being served, which
+ * happens if a leaf descendant entity has expired. On the other hand,
+ * repositioning is needed if the next_inservice_entity for the child
+ * entity has changed. See the comments inside the function for
+ * details.
+ *
+ * Basically, this function: 1) removes entity from its active tree if
+ * present there, 2) updates the timestamps of entity and 3) inserts
+ * entity back into its active tree (in the new, right position for
+ * the new values of the timestamps).
+ */
+static void __bfq_requeue_entity(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	if (entity == sd->in_service_entity) {
+		/*
+		 * We are requeueing the current in-service entity,
+		 * which may have to be done for one of the following
+		 * reasons:
+		 * - entity represents the in-service queue, and the
+		 *   in-service queue is being requeued after an
+		 *   expiration;
+		 * - entity represents a group, and its budget has
+		 *   changed because one of its child entities has
+		 *   just been either activated or requeued for some
+		 *   reason; the timestamps of the entity need then to
+		 *   be updated, and the entity needs to be enqueued
+		 *   or repositioned accordingly.
+		 *
+		 * In particular, before requeueing, the start time of
+		 * the entity must be moved forward to account for the
+		 * service that the entity has received while in
+		 * service. This is done by the next instructions. The
+		 * finish time will then be updated according to this
+		 * new value of the start time, and to the budget of
+		 * the entity.
+		 */
+		bfq_calc_finish(entity, entity->service);
+		entity->start = entity->finish;
+		/*
+		 * In addition, if the entity had more than one child
+		 * when set in service, then was not extracted from
+		 * the active tree. This implies that the position of
+		 * the entity in the active tree may need to be
+		 * changed now, because we have just updated the start
+		 * time of the entity, and we will update its finish
+		 * time in a moment (the requeueing is then, more
+		 * precisely, a repositioning in this case). To
+		 * implement this repositioning, we: 1) dequeue the
+		 * entity here, 2) update the finish time and
+		 * requeue the entity according to the new
+		 * timestamps below.
+		 */
+		if (entity->tree)
+			bfq_active_extract(st, entity);
+	} else { /* The entity is already active, and not in service */
+		/*
+		 * In this case, this function gets called only if the
+		 * next_in_service entity below this entity has
+		 * changed, and this change has caused the budget of
+		 * this entity to change, which, finally implies that
+		 * the finish time of this entity must be
+		 * updated. Such an update may cause the scheduling,
+		 * i.e., the position in the active tree, of this
+		 * entity to change. We handle this change by: 1)
+		 * dequeueing the entity here, 2) updating the finish
+		 * time and requeueing the entity according to the new
+		 * timestamps below. This is the same approach as the
+		 * non-extracted-entity sub-case above.
+		 */
+		bfq_active_extract(st, entity);
+	}
+
+	bfq_update_fin_time_enqueue(entity, st, false);
+}
+
+static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
+					  struct bfq_sched_data *sd,
+					  bool non_blocking_wait_rq)
+{
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	if (sd->in_service_entity == entity || entity->tree == &st->active)
+		 /*
+		  * in service or already queued on the active tree,
+		  * requeue or reposition
+		  */
+		__bfq_requeue_entity(entity);
+	else
+		/*
+		 * Not in service and not queued on its active tree:
+		 * the activity is idle and this is a true activation.
+		 */
+		__bfq_activate_entity(entity, non_blocking_wait_rq);
+}
+
+
+/**
+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
+ *			 and activate, requeue or reposition all ancestors
+ *			 for which such an update becomes necessary.
  * @entity: the entity to activate.
  * @non_blocking_wait_rq: true if this entity was waiting for a request
- *
- * Activate @entity and all the entities on the path from it to the root.
+ * @requeue: true if this is a requeue, which implies that bfqq is
+ *	     being expired; thus ALL its ancestors stop being served and must
+ *	     therefore be requeued
  */
-static void bfq_activate_entity(struct bfq_entity *entity,
-				bool non_blocking_wait_rq)
+static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+					bool non_blocking_wait_rq,
+					bool requeue)
 {
 	struct bfq_sched_data *sd;
 
 	for_each_entity(entity) {
-		__bfq_activate_entity(entity, non_blocking_wait_rq);
-
 		sd = entity->sched_data;
-		if (!bfq_update_next_in_service(sd))
-			/*
-			 * No need to propagate the activation to the
-			 * upper entities, as they will be updated when
-			 * the in-service entity is rescheduled.
-			 */
+		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
+
+		if (!bfq_update_next_in_service(sd, entity) && !requeue)
 			break;
 	}
 }
@@ -1357,52 +1801,48 @@ static void bfq_activate_entity(struct bfq_entity *entity,
 /**
  * __bfq_deactivate_entity - deactivate an entity from its service tree.
  * @entity: the entity to deactivate.
- * @requeue: if false, the entity will not be put into the idle tree.
+ * @ins_into_idle_tree: if false, the entity will not be put into the
+ *			idle tree.
  *
- * Deactivate an entity, independently from its previous state.  If the
- * entity was not on a service tree just return, otherwise if it is on
- * any scheduler tree, extract it from that tree, and if necessary
- * and if the caller did not specify @requeue, put it on the idle tree.
- *
- * Return %1 if the caller should update the entity hierarchy, i.e.,
- * if the entity was in service or if it was the next_in_service for
- * its sched_data; return %0 otherwise.
+ * Deactivates an entity, independently from its previous state.  Must
+ * be invoked only if entity is on a service tree. Extracts the entity
+ * from that tree, and if necessary and allowed, puts it on the idle
+ * tree.
  */
-static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+static bool __bfq_deactivate_entity(struct bfq_entity *entity,
+				    bool ins_into_idle_tree)
 {
 	struct bfq_sched_data *sd = entity->sched_data;
 	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
 	int is_in_service = entity == sd->in_service_entity;
-	int ret = 0;
 
-	if (!entity->on_st)
-		return 0;
+	if (!entity->on_st) /* entity never activated, or already inactive */
+		return false;
 
-	if (is_in_service) {
+	if (is_in_service)
 		bfq_calc_finish(entity, entity->service);
-		sd->in_service_entity = NULL;
-	} else if (entity->tree == &st->active)
+
+	if (entity->tree == &st->active)
 		bfq_active_extract(st, entity);
-	else if (entity->tree == &st->idle)
+	else if (!is_in_service && entity->tree == &st->idle)
 		bfq_idle_extract(st, entity);
 
-	if (is_in_service || sd->next_in_service == entity)
-		ret = bfq_update_next_in_service(sd);
-
-	if (!requeue || !bfq_gt(entity->finish, st->vtime))
+	if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime))
 		bfq_forget_entity(st, entity, is_in_service);
 	else
 		bfq_idle_insert(st, entity);
 
-	return ret;
+	return true;
 }
 
 /**
- * bfq_deactivate_entity - deactivate an entity.
+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
  * @entity: the entity to deactivate.
- * @requeue: true if the entity can be put on the idle tree
+ * @ins_into_idle_tree: true if the entity can be put on the idle tree
  */
-static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+static void bfq_deactivate_entity(struct bfq_entity *entity,
+				  bool ins_into_idle_tree,
+				  bool expiration)
 {
 	struct bfq_sched_data *sd;
 	struct bfq_entity *parent = NULL;
@@ -1410,63 +1850,102 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
 	for_each_entity_safe(entity, parent) {
 		sd = entity->sched_data;
 
-		if (!__bfq_deactivate_entity(entity, requeue))
+		if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
 			/*
-			 * The parent entity is still backlogged, and
-			 * we don't need to update it as it is still
-			 * in service.
+			 * entity is not in any tree any more, so
+			 * this deactivation is a no-op, and there is
+			 * nothing to change for upper-level entities
+			 * (in case of expiration, this can never
+			 * happen).
 			 */
-			break;
+			return;
+		}
+
+		if (sd->next_in_service == entity)
+			/*
+			 * entity was the next_in_service entity,
+			 * then, since entity has just been
+			 * deactivated, a new one must be found.
+			 */
+			bfq_update_next_in_service(sd, NULL);
 
 		if (sd->next_in_service)
 			/*
-			 * The parent entity is still backlogged and
-			 * the budgets on the path towards the root
-			 * need to be updated.
+			 * The parent entity is still backlogged,
+			 * because next_in_service is not NULL. So, no
+			 * further upwards deactivation must be
+			 * performed.  Yet, next_in_service has
+			 * changed.  Then the schedule does need to be
+			 * updated upwards.
 			 */
-			goto update;
+			break;
 
 		/*
-		 * If we get here, then the parent is no more backlogged and
-		 * we want to propagate the deactivation upwards.
+		 * If we get here, then the parent is no more
+		 * backlogged and we need to propagate the
+		 * deactivation upwards. Thus let the loop go on.
 		 */
-		requeue = 1;
-	}
 
-	return;
+		/*
+		 * Also let parent be queued into the idle tree on
+		 * deactivation, to preserve service guarantees, and
+		 * assuming that who invoked this function does not
+		 * need parent entities too to be removed completely.
+		 */
+		ins_into_idle_tree = true;
+	}
 
-update:
+	/*
+	 * If the deactivation loop is fully executed, then there are
+	 * no more entities to touch and next loop is not executed at
+	 * all. Otherwise, requeue remaining entities if they are
+	 * about to stop receiving service, or reposition them if this
+	 * is not the case.
+	 */
 	entity = parent;
 	for_each_entity(entity) {
-		__bfq_activate_entity(entity, false);
+		/*
+		 * Invoke __bfq_requeue_entity on entity, even if
+		 * already active, to requeue/reposition it in the
+		 * active tree (because sd->next_in_service has
+		 * changed)
+		 */
+		__bfq_requeue_entity(entity);
 
 		sd = entity->sched_data;
-		if (!bfq_update_next_in_service(sd))
+		if (!bfq_update_next_in_service(sd, entity) &&
+		    !expiration)
+			/*
+			 * next_in_service unchanged or not causing
+			 * any change in entity->parent->sd, and no
+			 * requeueing needed for expiration: stop
+			 * here.
+			 */
 			break;
 	}
 }
 
 /**
- * bfq_update_vtime - update vtime if necessary.
+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
+ *                       if needed, to have at least one entity eligible.
  * @st: the service tree to act upon.
  *
- * If necessary update the service tree vtime to have at least one
- * eligible entity, skipping to its start time.  Assumes that the
- * active tree of the device is not empty.
- *
- * NOTE: this hierarchical implementation updates vtimes quite often,
- * we may end up with reactivated processes getting timestamps after a
- * vtime skip done because we needed a ->first_active entity on some
- * intermediate node.
+ * Assumes that st is not empty.
  */
-static void bfq_update_vtime(struct bfq_service_tree *st)
+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
 {
-	struct bfq_entity *entry;
-	struct rb_node *node = st->active.rb_node;
+	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
+
+	if (bfq_gt(root_entity->min_start, st->vtime))
+		return root_entity->min_start;
+
+	return st->vtime;
+}
 
-	entry = rb_entry(node, struct bfq_entity, rb_node);
-	if (bfq_gt(entry->min_start, st->vtime)) {
-		st->vtime = entry->min_start;
+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
+{
+	if (new_value > st->vtime) {
+		st->vtime = new_value;
 		bfq_forget_idle(st);
 	}
 }
@@ -1475,6 +1954,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st)
  * bfq_first_active_entity - find the eligible entity with
  *                           the smallest finish time
  * @st: the service tree to select from.
+ * @vtime: the system virtual to use as a reference for eligibility
  *
  * This function searches the first schedulable entity, starting from the
  * root of the tree and going on the left every time on this side there is
@@ -1482,7 +1962,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st)
  * the right is followed only if a) the left subtree contains no eligible
  * entities and b) no eligible entity has been found yet.
  */
-static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
+						  u64 vtime)
 {
 	struct bfq_entity *entry, *first = NULL;
 	struct rb_node *node = st->active.rb_node;
@@ -1490,13 +1971,13 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
 	while (node) {
 		entry = rb_entry(node, struct bfq_entity, rb_node);
 left:
-		if (!bfq_gt(entry->start, st->vtime))
+		if (!bfq_gt(entry->start, vtime))
 			first = entry;
 
 		if (node->rb_left) {
 			entry = rb_entry(node->rb_left,
 					 struct bfq_entity, rb_node);
-			if (!bfq_gt(entry->min_start, st->vtime)) {
+			if (!bfq_gt(entry->min_start, vtime)) {
 				node = node->rb_left;
 				goto left;
 			}
@@ -1506,222 +1987,1429 @@ left:
 		node = node->rb_right;
 	}
 
-	return first;
+	return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * If there is no in-service entity for the sched_data st belongs to,
+ * then return the entity that will be set in service if:
+ * 1) the parent entity this st belongs to is set in service;
+ * 2) no entity belonging to such parent entity undergoes a state change
+ * that would influence the timestamps of the entity (e.g., becomes idle,
+ * becomes backlogged, changes its budget, ...).
+ *
+ * In this first case, update the virtual time in @st too (see the
+ * comments on this update inside the function).
+ *
+ * In constrast, if there is an in-service entity, then return the
+ * entity that would be set in service if not only the above
+ * conditions, but also the next one held true: the currently
+ * in-service entity, on expiration,
+ * 1) gets a finish time equal to the current one, or
+ * 2) is not eligible any more, or
+ * 3) is idle.
+ */
+static struct bfq_entity *
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
+{
+	struct bfq_entity *entity;
+	u64 new_vtime;
+
+	if (RB_EMPTY_ROOT(&st->active))
+		return NULL;
+
+	/*
+	 * Get the value of the system virtual time for which at
+	 * least one entity is eligible.
+	 */
+	new_vtime = bfq_calc_vtime_jump(st);
+
+	/*
+	 * If there is no in-service entity for the sched_data this
+	 * active tree belongs to, then push the system virtual time
+	 * up to the value that guarantees that at least one entity is
+	 * eligible. If, instead, there is an in-service entity, then
+	 * do not make any such update, because there is already an
+	 * eligible entity, namely the in-service one (even if the
+	 * entity is not on st, because it was extracted when set in
+	 * service).
+	 */
+	if (!in_service)
+		bfq_update_vtime(st, new_vtime);
+
+	entity = bfq_first_active_entity(st, new_vtime);
+
+	return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ *
+ * This function is invoked when there has been a change in the trees
+ * for sd, and we need know what is the new next entity after this
+ * change.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
+{
+	struct bfq_service_tree *st = sd->service_tree;
+	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
+	struct bfq_entity *entity = NULL;
+	int class_idx = 0;
+
+	/*
+	 * Choose from idle class, if needed to guarantee a minimum
+	 * bandwidth to this class (and if there is some active entity
+	 * in idle class). This should also mitigate
+	 * priority-inversion problems in case a low priority task is
+	 * holding file system resources.
+	 */
+	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
+				   BFQ_CL_IDLE_TIMEOUT)) {
+		if (!RB_EMPTY_ROOT(&idle_class_st->active))
+			class_idx = BFQ_IOPRIO_CLASSES - 1;
+		/* About to be served if backlogged, or not yet backlogged */
+		sd->bfq_class_idle_last_service = jiffies;
+	}
+
+	/*
+	 * Find the next entity to serve for the highest-priority
+	 * class, unless the idle class needs to be served.
+	 */
+	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
+		entity = __bfq_lookup_next_entity(st + class_idx,
+						  sd->in_service_entity);
+
+		if (entity)
+			break;
+	}
+
+	if (!entity)
+		return NULL;
+
+	return entity;
+}
+
+static bool next_queue_may_preempt(struct bfq_data *bfqd)
+{
+	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
+
+	return sd->next_in_service != sd->in_service_entity;
+}
+
+/*
+ * Get next queue for service.
+ */
+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+	struct bfq_entity *entity = NULL;
+	struct bfq_sched_data *sd;
+	struct bfq_queue *bfqq;
+
+	if (bfqd->busy_queues == 0)
+		return NULL;
+
+	/*
+	 * Traverse the path from the root to the leaf entity to
+	 * serve. Set in service all the entities visited along the
+	 * way.
+	 */
+	sd = &bfqd->root_group->sched_data;
+	for (; sd ; sd = entity->my_sched_data) {
+		/*
+		 * WARNING. We are about to set the in-service entity
+		 * to sd->next_in_service, i.e., to the (cached) value
+		 * returned by bfq_lookup_next_entity(sd) the last
+		 * time it was invoked, i.e., the last time when the
+		 * service order in sd changed as a consequence of the
+		 * activation or deactivation of an entity. In this
+		 * respect, if we execute bfq_lookup_next_entity(sd)
+		 * in this very moment, it may, although with low
+		 * probability, yield a different entity than that
+		 * pointed to by sd->next_in_service. This rare event
+		 * happens in case there was no CLASS_IDLE entity to
+		 * serve for sd when bfq_lookup_next_entity(sd) was
+		 * invoked for the last time, while there is now one
+		 * such entity.
+		 *
+		 * If the above event happens, then the scheduling of
+		 * such entity in CLASS_IDLE is postponed until the
+		 * service of the sd->next_in_service entity
+		 * finishes. In fact, when the latter is expired,
+		 * bfq_lookup_next_entity(sd) gets called again,
+		 * exactly to update sd->next_in_service.
+		 */
+
+		/* Make next_in_service entity become in_service_entity */
+		entity = sd->next_in_service;
+		sd->in_service_entity = entity;
+
+		/*
+		 * Reset the accumulator of the amount of service that
+		 * the entity is about to receive.
+		 */
+		entity->service = 0;
+
+		/*
+		 * If entity is no longer a candidate for next
+		 * service, then we extract it from its active tree,
+		 * for the following reason. To further boost the
+		 * throughput in some special case, BFQ needs to know
+		 * which is the next candidate entity to serve, while
+		 * there is already an entity in service. In this
+		 * respect, to make it easy to compute/update the next
+		 * candidate entity to serve after the current
+		 * candidate has been set in service, there is a case
+		 * where it is necessary to extract the current
+		 * candidate from its service tree. Such a case is
+		 * when the entity just set in service cannot be also
+		 * a candidate for next service. Details about when
+		 * this conditions holds are reported in the comments
+		 * on the function bfq_no_longer_next_in_service()
+		 * invoked below.
+		 */
+		if (bfq_no_longer_next_in_service(entity))
+			bfq_active_extract(bfq_entity_service_tree(entity),
+					   entity);
+
+		/*
+		 * For the same reason why we may have just extracted
+		 * entity from its active tree, we may need to update
+		 * next_in_service for the sched_data of entity too,
+		 * regardless of whether entity has been extracted.
+		 * In fact, even if entity has not been extracted, a
+		 * descendant entity may get extracted. Such an event
+		 * would cause a change in next_in_service for the
+		 * level of the descendant entity, and thus possibly
+		 * back to upper levels.
+		 *
+		 * We cannot perform the resulting needed update
+		 * before the end of this loop, because, to know which
+		 * is the correct next-to-serve candidate entity for
+		 * each level, we need first to find the leaf entity
+		 * to set in service. In fact, only after we know
+		 * which is the next-to-serve leaf entity, we can
+		 * discover whether the parent entity of the leaf
+		 * entity becomes the next-to-serve, and so on.
+		 */
+
+	}
+
+	bfqq = bfq_entity_to_bfqq(entity);
+
+	/*
+	 * We can finally update all next-to-serve entities along the
+	 * path from the leaf entity just set in service to the root.
+	 */
+	for_each_entity(entity) {
+		struct bfq_sched_data *sd = entity->sched_data;
+
+		if (!bfq_update_next_in_service(sd, NULL))
+			break;
+	}
+
+	return bfqq;
+}
+
+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
+	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
+	struct bfq_entity *entity = in_serv_entity;
+
+	if (bfqd->in_service_bic) {
+		put_io_context(bfqd->in_service_bic->icq.ioc);
+		bfqd->in_service_bic = NULL;
+	}
+
+	bfq_clear_bfqq_wait_request(in_serv_bfqq);
+	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+	bfqd->in_service_queue = NULL;
+
+	/*
+	 * When this function is called, all in-service entities have
+	 * been properly deactivated or requeued, so we can safely
+	 * execute the final step: reset in_service_entity along the
+	 * path from entity to the root.
+	 */
+	for_each_entity(entity)
+		entity->sched_data->in_service_entity = NULL;
+
+	/*
+	 * in_serv_entity is no longer in service, so, if it is in no
+	 * service tree either, then release the service reference to
+	 * the queue it represents (taken with bfq_get_entity).
+	 */
+	if (!in_serv_entity->on_st)
+		bfq_put_queue(in_serv_bfqq);
+}
+
+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				bool ins_into_idle_tree, bool expiration)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
+}
+
+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
+				    false);
+	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_activate_requeue_entity(entity, false,
+				    bfqq == bfqd->in_service_queue);
+}
+
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+
+/*
+ * Called when the bfqq no longer has requests pending, remove it from
+ * the service tree. As a special case, it can be invoked during an
+ * expiration.
+ */
+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			      bool expiration)
+{
+	bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+	bfq_clear_bfqq_busy(bfqq);
+
+	bfqd->busy_queues--;
+
+	bfqg_stats_update_dequeue(bfqq_group(bfqq));
+
+	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+}
+
+/*
+ * Called when an inactive queue receives a new request.
+ */
+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+	bfq_activate_bfqq(bfqd, bfqq);
+
+	bfq_mark_bfqq_busy(bfqq);
+	bfqd->busy_queues++;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+	BFQG_stats_waiting = 0,
+	BFQG_stats_idling,
+	BFQG_stats_empty,
+};
+
+#define BFQG_FLAG_FNS(name)						\
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags |= (1 << BFQG_stats_##name);			\
+}									\
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags &= ~(1 << BFQG_stats_##name);			\
+}									\
+static int bfqg_stats_##name(struct bfqg_stats *stats)		\
+{									\
+	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
+}									\
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!bfqg_stats_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
+	bfqg_stats_clear_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+						 struct bfq_group *curr_bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_waiting(stats))
+		return;
+	if (bfqg == curr_bfqg)
+		return;
+	stats->start_group_wait_time = sched_clock();
+	bfqg_stats_mark_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!bfqg_stats_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
+	bfqg_stats_clear_empty(stats);
+}
+
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+	blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (blkg_rwstat_total(&stats->queued))
+		return;
+
+	/*
+	 * group is already marked empty. This can happen if bfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
+	 */
+	if (bfqg_stats_empty(stats))
+		return;
+
+	stats->start_empty_time = sched_clock();
+	bfqg_stats_mark_empty(stats);
+}
+
+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_idling(stats)) {
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
+		bfqg_stats_clear_idling(stats);
+	}
+}
+
+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	stats->start_idle_time = sched_clock();
+	bfqg_stats_mark_idling(stats);
+}
+
+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_total(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	bfqg_stats_update_group_wait_time(stats);
+}
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
+
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+	return pd_to_blkg(&bfqg->pd);
+}
+
+static struct blkcg_policy blkcg_policy_bfq;
+
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
+
+	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+
+	return group_entity ? container_of(group_entity, struct bfq_group,
+					   entity) :
+			      bfqq->bfqd->root_group;
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+	return blkg_get(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_put(struct bfq_group *bfqg)
+{
+	return blkg_put(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+				     struct bfq_queue *bfqq,
+				     unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+	bfqg_stats_end_empty_time(&bfqg->stats);
+	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+}
+
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+{
+	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+}
+
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time,
+			unsigned int op)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+	unsigned long long now = sched_clock();
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, op,
+				now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, op,
+				io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+	if (!to || !from)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&from->time, &from->time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples,
+			  &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+	struct bfq_group *parent;
+
+	if (!bfqg) /* root_group */
+		return;
+
+	parent = bfqg_parent(bfqg);
+
+	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+	if (unlikely(!parent))
+		return;
+
+	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
+	bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+		bfqg_get(bfqg);
+	}
+	entity->parent = bfqg->my_entity; /* NULL for root group */
+	entity->sched_data = &bfqg->sched_data;
+}
+
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
+	blkg_stat_exit(&stats->time);
+	blkg_stat_exit(&stats->avg_queue_size_sum);
+	blkg_stat_exit(&stats->avg_queue_size_samples);
+	blkg_stat_exit(&stats->dequeue);
+	blkg_stat_exit(&stats->group_wait_time);
+	blkg_stat_exit(&stats->idle_time);
+	blkg_stat_exit(&stats->empty_time);
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+	if (blkg_rwstat_init(&stats->merged, gfp) ||
+	    blkg_rwstat_init(&stats->service_time, gfp) ||
+	    blkg_rwstat_init(&stats->wait_time, gfp) ||
+	    blkg_rwstat_init(&stats->queued, gfp) ||
+	    blkg_stat_init(&stats->time, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    blkg_stat_init(&stats->dequeue, gfp) ||
+	    blkg_stat_init(&stats->group_wait_time, gfp) ||
+	    blkg_stat_init(&stats->idle_time, gfp) ||
+	    blkg_stat_init(&stats->empty_time, gfp)) {
+		bfqg_stats_exit(stats);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+}
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+	struct bfq_group_data *bgd;
+
+	bgd = kzalloc(sizeof(*bgd), gfp);
+	if (!bgd)
+		return NULL;
+	return &bgd->pd;
+}
+
+static void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
+		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
+}
+
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+	kfree(cpd_to_bfqgd(cpd));
+}
+
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+	struct bfq_group *bfqg;
+
+	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+	if (!bfqg)
+		return NULL;
+
+	if (bfqg_stats_init(&bfqg->stats, gfp)) {
+		kfree(bfqg);
+		return NULL;
+	}
+
+	return &bfqg->pd;
+}
+
+static void bfq_pd_init(struct blkg_policy_data *pd)
+{
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
+	struct bfq_entity *entity = &bfqg->entity;
+	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
+
+	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+	entity->my_sched_data = &bfqg->sched_data;
+	bfqg->my_entity = entity; /*
+				   * the root_group's will be set to NULL
+				   * in bfq_init_queue()
+				   */
+	bfqg->bfqd = bfqd;
+}
+
+static void bfq_pd_free(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_exit(&bfqg->stats);
+	return kfree(bfqg);
+}
+
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+					struct bfq_group *parent)
+{
+	struct bfq_entity *entity;
+
+	entity = &bfqg->entity;
+	entity->parent = parent->my_entity;
+	entity->sched_data = &parent->sched_data;
+}
+
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+					 struct blkcg *blkcg)
+{
+	struct blkcg_gq *blkg;
+
+	blkg = blkg_lookup(blkcg, bfqd->queue);
+	if (likely(blkg))
+		return blkg_to_bfqg(blkg);
+	return NULL;
+}
+
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+					    struct blkcg *blkcg)
+{
+	struct bfq_group *bfqg, *parent;
+	struct bfq_entity *entity;
+
+	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
+
+	if (unlikely(!bfqg))
+		return NULL;
+
+	/*
+	 * Update chain of bfq_groups as we might be handling a leaf group
+	 * which, along with some of its relatives, has not been hooked yet
+	 * to the private hierarchy of BFQ.
+	 */
+	entity = &bfqg->entity;
+	for_each_entity(entity) {
+		bfqg = container_of(entity, struct bfq_group, entity);
+		if (bfqg != bfqd->root_group) {
+			parent = bfqg_parent(bfqg);
+			if (!parent)
+				parent = bfqd->root_group;
+			bfq_group_set_parent(bfqg, parent);
+		}
+	}
+
+	return bfqg;
+}
+
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+			    struct bfq_queue *bfqq,
+			    bool compensate,
+			    enum bfqq_expiration reason);
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one.  Avoid putting the entity on the old group idle tree.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_group *bfqg)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	/* If bfqq is empty, then bfq_bfqq_expire also invokes
+	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+	 * from data structures related to current group. Otherwise we
+	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+	 * we do below.
+	 */
+	if (bfqq == bfqd->in_service_queue)
+		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+				false, BFQQE_PREEMPTED);
+
+	if (bfq_bfqq_busy(bfqq))
+		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+	else if (entity->on_st)
+		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	bfqg_put(bfqq_group(bfqq));
+
+	/*
+	 * Here we use a reference to bfqg.  We don't need a refcounter
+	 * as the cgroup reference will not be dropped, so that its
+	 * destroy() callback will not be invoked.
+	 */
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+	bfqg_get(bfqg);
+
+	if (bfq_bfqq_busy(bfqq))
+		bfq_activate_bfqq(bfqd, bfqq);
+
+	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+						struct bfq_io_cq *bic,
+						struct blkcg *blkcg)
+{
+	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+	struct bfq_group *bfqg;
+	struct bfq_entity *entity;
+
+	bfqg = bfq_find_set_group(bfqd, blkcg);
+
+	if (unlikely(!bfqg))
+		bfqg = bfqd->root_group;
+
+	if (async_bfqq) {
+		entity = &async_bfqq->entity;
+
+		if (entity->sched_data != &bfqg->sched_data) {
+			bic_set_bfqq(bic, NULL, 0);
+			bfq_log_bfqq(bfqd, async_bfqq,
+				     "bic_change_group: %p %d",
+				     async_bfqq,
+				     async_bfqq->ref);
+			bfq_put_queue(async_bfqq);
+		}
+	}
+
+	if (sync_bfqq) {
+		entity = &sync_bfqq->entity;
+		if (entity->sched_data != &bfqg->sched_data)
+			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+	}
+
+	return bfqg;
+}
+
+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	struct bfq_group *bfqg = NULL;
+	uint64_t serial_nr;
+
+	rcu_read_lock();
+	serial_nr = bio_blkcg(bio)->css.serial_nr;
+
+	/*
+	 * Check whether blkcg has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
+		goto out;
+
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	bic->blkcg_serial_nr = serial_nr;
+out:
+	rcu_read_unlock();
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entity = st->first_idle;
+
+	for (; entity ; entity = st->first_idle)
+		__bfq_deactivate_entity(entity, false);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+				     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
 }
 
 /**
- * __bfq_lookup_next_entity - return the first eligible entity in @st.
- * @st: the service tree.
+ * bfq_reparent_active_entities - move to the root group all active
+ *                                entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
  *
- * Update the virtual time in @st and return the first eligible entity
- * it contains.
+ * Needs queue_lock to be taken and reference to be valid over the call.
  */
-static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st,
-						   bool force)
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg,
+					 struct bfq_service_tree *st)
 {
-	struct bfq_entity *entity, *new_next_in_service = NULL;
-
-	if (RB_EMPTY_ROOT(&st->active))
-		return NULL;
+	struct rb_root *active = &st->active;
+	struct bfq_entity *entity = NULL;
 
-	bfq_update_vtime(st);
-	entity = bfq_first_active_entity(st);
+	if (!RB_EMPTY_ROOT(&st->active))
+		entity = bfq_entity_of(rb_first(active));
 
-	/*
-	 * If the chosen entity does not match with the sched_data's
-	 * next_in_service and we are forcedly serving the IDLE priority
-	 * class tree, bubble up budget update.
-	 */
-	if (unlikely(force && entity != entity->sched_data->next_in_service)) {
-		new_next_in_service = entity;
-		for_each_entity(new_next_in_service)
-			bfq_update_budget(new_next_in_service);
-	}
+	for (; entity ; entity = bfq_entity_of(rb_first(active)))
+		bfq_reparent_leaf_entity(bfqd, entity);
 
-	return entity;
+	if (bfqg->sched_data.in_service_entity)
+		bfq_reparent_leaf_entity(bfqd,
+			bfqg->sched_data.in_service_entity);
 }
 
 /**
- * bfq_lookup_next_entity - return the first eligible entity in @sd.
- * @sd: the sched_data.
- * @extract: if true the returned entity will be also extracted from @sd.
+ * bfq_pd_offline - deactivate the entity associated with @pd,
+ *		    and reparent its children entities.
+ * @pd: descriptor of the policy going offline.
  *
- * NOTE: since we cache the next_in_service entity at each level of the
- * hierarchy, the complexity of the lookup can be decreased with
- * absolutely no effort just returning the cached next_in_service value;
- * we prefer to do full lookups to test the consistency of the data
- * structures.
+ * blkio already grabs the queue_lock for us, so no need to use
+ * RCU-based magic
  */
-static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
-						 int extract,
-						 struct bfq_data *bfqd)
+static void bfq_pd_offline(struct blkg_policy_data *pd)
 {
-	struct bfq_service_tree *st = sd->service_tree;
-	struct bfq_entity *entity;
-	int i = 0;
+	struct bfq_service_tree *st;
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	struct bfq_data *bfqd = bfqg->bfqd;
+	struct bfq_entity *entity = bfqg->my_entity;
+	unsigned long flags;
+	int i;
 
+	if (!entity) /* root group */
+		return;
+
+	spin_lock_irqsave(&bfqd->lock, flags);
 	/*
-	 * Choose from idle class, if needed to guarantee a minimum
-	 * bandwidth to this class. This should also mitigate
-	 * priority-inversion problems in case a low priority task is
-	 * holding file system resources.
+	 * Empty all service_trees belonging to this group before
+	 * deactivating the group itself.
 	 */
-	if (bfqd &&
-	    jiffies - bfqd->bfq_class_idle_last_service >
-	    BFQ_CL_IDLE_TIMEOUT) {
-		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
-						  true);
-		if (entity) {
-			i = BFQ_IOPRIO_CLASSES - 1;
-			bfqd->bfq_class_idle_last_service = jiffies;
-			sd->next_in_service = entity;
-		}
-	}
-	for (; i < BFQ_IOPRIO_CLASSES; i++) {
-		entity = __bfq_lookup_next_entity(st + i, false);
-		if (entity) {
-			if (extract) {
-				bfq_check_next_in_service(sd, entity);
-				bfq_active_extract(st + i, entity);
-				sd->in_service_entity = entity;
-				sd->next_in_service = NULL;
-			}
-			break;
-		}
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+		st = bfqg->sched_data.service_tree + i;
+
+		/*
+		 * The idle tree may still contain bfq_queues belonging
+		 * to exited task because they never migrated to a different
+		 * cgroup from the one being destroyed now.  No one else
+		 * can access them so it's safe to act without any lock.
+		 */
+		bfq_flush_idle_tree(st);
+
+		/*
+		 * It may happen that some queues are still active
+		 * (busy) upon group destruction (if the corresponding
+		 * processes have been forced to terminate). We move
+		 * all the leaf entities corresponding to these queues
+		 * to the root_group.
+		 * Also, it may happen that the group has an entity
+		 * in service, which is disconnected from the active
+		 * tree: it must be moved, too.
+		 * There is no need to put the sync queues, as the
+		 * scheduler has taken no reference.
+		 */
+		bfq_reparent_active_entities(bfqd, bfqg, st);
 	}
 
-	return entity;
+	__bfq_deactivate_entity(entity, false);
+	bfq_put_async_queues(bfqd, bfqg);
+
+	spin_unlock_irqrestore(&bfqd->lock, flags);
+	/*
+	 * @blkg is going offline and will be ignored by
+	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
+	 * that they don't get lost.  If IOs complete after this point, the
+	 * stats for them will be lost.  Oh well...
+	 */
+	bfqg_stats_xfer_dead(bfqg);
 }
 
-static bool next_queue_may_preempt(struct bfq_data *bfqd)
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
 {
-	struct bfq_sched_data *sd = &bfqd->sched_data;
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	unsigned int val = 0;
 
-	return sd->next_in_service != sd->in_service_entity;
-}
+	if (bfqgd)
+		val = bfqgd->weight;
 
+	seq_printf(sf, "%u\n", val);
 
-/*
- * Get next queue for service.
- */
-static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+	return 0;
+}
+
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
+				    struct cftype *cftype,
+				    u64 val)
 {
-	struct bfq_entity *entity = NULL;
-	struct bfq_sched_data *sd;
-	struct bfq_queue *bfqq;
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	struct blkcg_gq *blkg;
+	int ret = -ERANGE;
 
-	if (bfqd->busy_queues == 0)
-		return NULL;
+	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+		return ret;
 
-	sd = &bfqd->sched_data;
-	for (; sd ; sd = entity->my_sched_data) {
-		entity = bfq_lookup_next_entity(sd, 1, bfqd);
-		entity->service = 0;
+	ret = 0;
+	spin_lock_irq(&blkcg->lock);
+	bfqgd->weight = (unsigned short)val;
+	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+		if (!bfqg)
+			continue;
+		/*
+		 * Setting the prio_changed flag of the entity
+		 * to 1 with new_weight == weight would re-set
+		 * the value of the weight to its ioprio mapping.
+		 * Set the flag only if necessary.
+		 */
+		if ((unsigned short)val != bfqg->entity.new_weight) {
+			bfqg->entity.new_weight = (unsigned short)val;
+			/*
+			 * Make sure that the above new value has been
+			 * stored in bfqg->entity.new_weight before
+			 * setting the prio_changed flag. In fact,
+			 * this flag may be read asynchronously (in
+			 * critical sections protected by a different
+			 * lock than that held here), and finding this
+			 * flag set may cause the execution of the code
+			 * for updating parameters whose value may
+			 * depend also on bfqg->entity.new_weight (in
+			 * __bfq_entity_update_weight_prio).
+			 * This barrier makes sure that the new value
+			 * of bfqg->entity.new_weight is correctly
+			 * seen in that code.
+			 */
+			smp_wmb();
+			bfqg->entity.prio_changed = 1;
+		}
 	}
+	spin_unlock_irq(&blkcg->lock);
 
-	bfqq = bfq_entity_to_bfqq(entity);
+	return ret;
+}
 
-	return bfqq;
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes,
+				 loff_t off)
+{
+	u64 weight;
+	/* First unsigned long found in the file is used */
+	int ret = kstrtoull(strim(buf), 0, &weight);
+
+	if (ret)
+		return ret;
+
+	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
 }
 
-static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+static int bfqg_print_stat(struct seq_file *sf, void *v)
 {
-	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
-	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
+	return 0;
+}
 
-	if (bfqd->in_service_bic) {
-		put_io_context(bfqd->in_service_bic->icq.ioc);
-		bfqd->in_service_bic = NULL;
-	}
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
+	return 0;
+}
 
-	bfq_clear_bfqq_wait_request(in_serv_bfqq);
-	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
-	bfqd->in_service_queue = NULL;
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+					  &blkcg_policy_bfq, off);
+	return __blkg_prfill_u64(sf, pd, sum);
+}
 
-	/*
-	 * in_serv_entity is no longer in service, so, if it is in no
-	 * service tree either, then release the service reference to
-	 * the queue it represents (taken with bfq_get_entity).
-	 */
-	if (!in_serv_entity->on_st)
-		bfq_put_queue(in_serv_bfqq);
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+					struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+							   &blkcg_policy_bfq,
+							   off);
+	return __blkg_prfill_rwstat(sf, pd, &sum);
 }
 
-static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-				int requeue)
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
 {
-	struct bfq_entity *entity = &bfqq->entity;
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, false);
+	return 0;
+}
 
-	bfq_deactivate_entity(entity, requeue);
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, true);
+	return 0;
 }
 
-static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
 {
-	struct bfq_entity *entity = &bfqq->entity;
+	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
 
-	bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq));
-	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+	return __blkg_prfill_u64(sf, pd, sum >> 9);
 }
 
-/*
- * Called when the bfqq no longer has requests pending, remove it from
- * the service tree.
- */
-static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
-			      int requeue)
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
 {
-	bfq_log_bfqq(bfqd, bfqq, "del from busy");
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
+	return 0;
+}
 
-	bfq_clear_bfqq_busy(bfqq);
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
+					 struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+					offsetof(struct blkcg_gq, stat_bytes));
+	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 
-	bfqd->busy_queues--;
+	return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
 
-	bfq_deactivate_bfqq(bfqd, bfqq, requeue);
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
+			  false);
+	return 0;
 }
 
-/*
- * Called when an inactive queue receives a new request.
- */
-static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
 {
-	bfq_log_bfqq(bfqd, bfqq, "add to busy");
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 v = 0;
 
-	bfq_activate_bfqq(bfqd, bfqq);
+	if (samples) {
+		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+		v = div64_u64(v, samples);
+	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
+}
 
-	bfq_mark_bfqq_busy(bfqq);
-	bfqd->busy_queues++;
+/* print avg_queue_size */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+			  0, false);
+	return 0;
+}
+
+static struct bfq_group *
+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	int ret;
+
+	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+	if (ret)
+		return NULL;
+
+	return blkg_to_bfqg(bfqd->queue->root_blkg);
 }
 
-static void bfq_init_entity(struct bfq_entity *entity)
+static struct cftype bfq_blkcg_legacy_files[] = {
+	{
+		.name = "bfq.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfq_io_show_weight,
+		.write_u64 = bfq_io_set_weight_legacy,
+	},
+
+	/* statistics, covers only the tasks in the bfqg */
+	{
+		.name = "bfq.time",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.sectors",
+		.seq_show = bfqg_print_stat_sectors,
+	},
+	{
+		.name = "bfq.io_service_bytes",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_bytes,
+	},
+	{
+		.name = "bfq.io_serviced",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_ios,
+	},
+	{
+		.name = "bfq.io_service_time",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_wait_time",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_merged",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_queued",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat,
+	},
+
+	/* the same statictics which cover the bfqg and its descendants */
+	{
+		.name = "bfq.time_recursive",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.sectors_recursive",
+		.seq_show = bfqg_print_stat_sectors_recursive,
+	},
+	{
+		.name = "bfq.io_service_bytes_recursive",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_bytes_recursive,
+	},
+	{
+		.name = "bfq.io_serviced_recursive",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_ios_recursive,
+	},
+	{
+		.name = "bfq.io_service_time_recursive",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_wait_time_recursive",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_merged_recursive",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_queued_recursive",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.avg_queue_size",
+		.seq_show = bfqg_print_avg_queue_size,
+	},
+	{
+		.name = "bfq.group_wait_time",
+		.private = offsetof(struct bfq_group, stats.group_wait_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.idle_time",
+		.private = offsetof(struct bfq_group, stats.idle_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.empty_time",
+		.private = offsetof(struct bfq_group, stats.empty_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.dequeue",
+		.private = offsetof(struct bfq_group, stats.dequeue),
+		.seq_show = bfqg_print_stat,
+	},
+	{ }	/* terminate */
+};
+
+static struct cftype bfq_blkg_files[] = {
+	{
+		.name = "bfq.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfq_io_show_weight,
+		.write = bfq_io_set_weight,
+	},
+	{} /* terminate */
+};
+
+#else	/* CONFIG_BFQ_GROUP_IOSCHED */
+
+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+			struct bfq_queue *bfqq, unsigned int op) { }
+static inline void
+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
+static inline void
+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time,
+			unsigned int op) { }
+static inline void
+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+				     struct bfq_group *curr_bfqg) { }
+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_group *bfqg) {}
+
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
 {
 	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
 
 	entity->weight = entity->new_weight;
 	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+	}
+	entity->sched_data = &bfqg->sched_data;
+}
+
+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
+
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+					    struct blkcg *blkcg)
+{
+	return bfqd->root_group;
+}
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	return bfqq->bfqd->root_group;
+}
 
-	bfqq->ioprio = bfqq->new_ioprio;
-	bfqq->ioprio_class = bfqq->new_ioprio_class;
+static struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd,
+						    int node)
+{
+	struct bfq_group *bfqg;
+	int i;
+
+	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
+	if (!bfqg)
+		return NULL;
 
-	entity->sched_data = &bfqq->bfqd->sched_data;
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+
+	return bfqg;
 }
+#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
 
 #define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
 
 #define bfq_sample_valid(samples)	((samples) > 80)
 
-/*
- * Scheduler run of queue, if there are requests pending and no one in the
- * driver that will restart queueing.
- */
-static void bfq_schedule_dispatch(struct bfq_data *bfqd)
-{
-	if (bfqd->queued != 0) {
-		bfq_log(bfqd, "schedule dispatch");
-		blk_mq_run_hw_queues(bfqd->queue, true);
-	}
-}
-
 /*
  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
  * We choose the request that is closesr to the head right now.  Distance
@@ -1905,7 +3593,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd,
 		entity->budget = new_budget;
 		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
 					 new_budget);
-		bfq_activate_bfqq(bfqd, bfqq);
+		bfq_requeue_bfqq(bfqd, bfqq);
 	}
 }
 
@@ -2076,6 +3764,8 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
 			bfqq->ttime.last_end_request +
 			bfqd->bfq_slice_idle * 3;
 
+	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
+
 	/*
 	 * Update budget and check whether bfqq may want to preempt
 	 * the in-service queue.
@@ -2195,7 +3885,7 @@ static void bfq_remove_request(struct request_queue *q,
 		bfqq->next_rq = NULL;
 
 		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
-			bfq_del_bfqq_busy(bfqd, bfqq, 1);
+			bfq_del_bfqq_busy(bfqd, bfqq, false);
 			/*
 			 * bfqq emptied. In normal operation, when
 			 * bfqq is empty, bfqq->entity.service and
@@ -2215,6 +3905,8 @@ static void bfq_remove_request(struct request_queue *q,
 
 	if (rq->cmd_flags & REQ_META)
 		bfqq->meta_pending--;
+
+	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
 }
 
 static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
@@ -2300,7 +3992,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
 
 	if (!RB_EMPTY_NODE(&rq->rb_node))
-		return;
+		goto end;
 	spin_lock_irq(&bfqq->bfqd->lock);
 
 	/*
@@ -2326,6 +4018,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
 	bfq_remove_request(q, next);
 
 	spin_unlock_irq(&bfqq->bfqd->lock);
+end:
+	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
 }
 
 static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
@@ -2355,6 +4049,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
 				       struct bfq_queue *bfqq)
 {
 	if (bfqq) {
+		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
 		bfq_mark_bfqq_budget_new(bfqq);
 		bfq_clear_bfqq_fifo_expire(bfqq);
 
@@ -2441,6 +4136,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
 	bfqd->last_idling_start = ktime_get();
 	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
 		      HRTIMER_MODE_REL);
+	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
 }
 
 /*
@@ -2490,12 +4186,17 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
 
 static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
 {
-	__bfq_bfqd_reset_in_service(bfqd);
-
 	if (RB_EMPTY_ROOT(&bfqq->sort_list))
-		bfq_del_bfqq_busy(bfqd, bfqq, 1);
+		bfq_del_bfqq_busy(bfqd, bfqq, true);
 	else
-		bfq_activate_bfqq(bfqd, bfqq);
+		bfq_requeue_bfqq(bfqd, bfqq);
+
+	/*
+	 * All in-service entities must have been properly deactivated
+	 * or requeued before executing the next function, which
+	 * resets all in-service entites as no more in service.
+	 */
+	__bfq_bfqd_reset_in_service(bfqd);
 }
 
 /**
@@ -2972,6 +4673,7 @@ check_queue:
 				 */
 				bfq_clear_bfqq_wait_request(bfqq);
 				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+				bfqg_stats_update_idle_time(bfqq_group(bfqq));
 			}
 			goto keep_queue;
 		}
@@ -3159,6 +4861,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
  */
 static void bfq_put_queue(struct bfq_queue *bfqq)
 {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_group *bfqg = bfqq_group(bfqq);
+#endif
+
 	if (bfqq->bfqd)
 		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
 			     bfqq, bfqq->ref);
@@ -3167,7 +4873,12 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
 	if (bfqq->ref)
 		return;
 
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
+
 	kmem_cache_free(bfq_pool, bfqq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_put(bfqg);
+#endif
 }
 
 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
@@ -3323,18 +5034,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 }
 
 static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
+					       struct bfq_group *bfqg,
 					       int ioprio_class, int ioprio)
 {
 	switch (ioprio_class) {
 	case IOPRIO_CLASS_RT:
-		return &async_bfqq[0][ioprio];
+		return &bfqg->async_bfqq[0][ioprio];
 	case IOPRIO_CLASS_NONE:
 		ioprio = IOPRIO_NORM;
 		/* fall through */
 	case IOPRIO_CLASS_BE:
-		return &async_bfqq[1][ioprio];
+		return &bfqg->async_bfqq[1][ioprio];
 	case IOPRIO_CLASS_IDLE:
-		return &async_idle_bfqq;
+		return &bfqg->async_idle_bfqq;
 	default:
 		return NULL;
 	}
@@ -3348,11 +5060,18 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
 	struct bfq_queue **async_bfqq = NULL;
 	struct bfq_queue *bfqq;
+	struct bfq_group *bfqg;
 
 	rcu_read_lock();
 
+	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+	if (!bfqg) {
+		bfqq = &bfqd->oom_bfqq;
+		goto out;
+	}
+
 	if (!is_sync) {
-		async_bfqq = bfq_async_queue_prio(bfqd, ioprio_class,
+		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
 						  ioprio);
 		bfqq = *async_bfqq;
 		if (bfqq)
@@ -3366,7 +5085,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 	if (bfqq) {
 		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
 			      is_sync);
-		bfq_init_entity(&bfqq->entity);
+		bfq_init_entity(&bfqq->entity, bfqg);
 		bfq_log_bfqq(bfqd, bfqq, "allocated");
 	} else {
 		bfqq = &bfqd->oom_bfqq;
@@ -3379,9 +5098,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 	 * prune it.
 	 */
 	if (async_bfqq) {
-		bfqq->ref++;
-		bfq_log_bfqq(bfqd, bfqq,
-			     "get_queue, bfqq not in async: %p, %d",
+		bfqq->ref++; /*
+			      * Extra group reference, w.r.t. sync
+			      * queue. This extra reference is removed
+			      * only if bfqq->bfqg disappears, to
+			      * guarantee that this queue is not freed
+			      * until its group goes away.
+			      */
+		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
 			     bfqq, bfqq->ref);
 		*async_bfqq = bfqq;
 	}
@@ -3516,6 +5240,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 		 */
 		bfq_clear_bfqq_wait_request(bfqq);
 		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+		bfqg_stats_update_idle_time(bfqq_group(bfqq));
 
 		/*
 		 * The queue is not empty, because a new request just
@@ -3657,6 +5382,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
 	struct bfq_queue *bfqq = RQ_BFQQ(rq);
 	struct bfq_data *bfqd = bfqq->bfqd;
 
+	if (rq->rq_flags & RQF_STARTED)
+		bfqg_stats_update_completion(bfqq_group(bfqq),
+					     rq_start_time_ns(rq),
+					     rq_io_start_time_ns(rq),
+					     rq->cmd_flags);
 
 	if (likely(rq->rq_flags & RQF_STARTED)) {
 		unsigned long flags;
@@ -3707,6 +5437,8 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
 	if (!bic)
 		goto queue_fail;
 
+	bfq_bic_update_cgroup(bic, bio);
+
 	bfqq = bic_to_bfqq(bic, is_sync);
 	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
 		if (bfqq)
@@ -3803,6 +5535,8 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
 
 	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
 	if (bfqq) {
+		bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
+
 		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
 			     bfqq, bfqq->ref);
 		bfq_put_queue(bfqq);
@@ -3811,18 +5545,20 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
 }
 
 /*
- * Release the extra reference of the async queues as the device
- * goes away.
+ * Release all the bfqg references to its async queues.  If we are
+ * deallocating the group these queues may still contain requests, so
+ * we reparent them to the root cgroup (i.e., the only one that will
+ * exist for sure until all the requests on a device are gone).
  */
-static void bfq_put_async_queues(struct bfq_data *bfqd)
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
 {
 	int i, j;
 
 	for (i = 0; i < 2; i++)
 		for (j = 0; j < IOPRIO_BE_NR; j++)
-			__bfq_put_async_bfqq(bfqd, &async_bfqq[i][j]);
+			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
 
-	__bfq_put_async_bfqq(bfqd, &async_idle_bfqq);
+	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
 }
 
 static void bfq_exit_queue(struct elevator_queue *e)
@@ -3834,20 +5570,42 @@ static void bfq_exit_queue(struct elevator_queue *e)
 
 	spin_lock_irq(&bfqd->lock);
 	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
-		bfq_deactivate_bfqq(bfqd, bfqq, false);
-	bfq_put_async_queues(bfqd);
+		bfq_deactivate_bfqq(bfqd, bfqq, false, false);
 	spin_unlock_irq(&bfqd->lock);
 
 	hrtimer_cancel(&bfqd->idle_slice_timer);
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
+#else
+	spin_lock_irq(&bfqd->lock);
+	bfq_put_async_queues(bfqd, bfqd->root_group);
+	kfree(bfqd->root_group);
+	spin_unlock_irq(&bfqd->lock);
+#endif
+
 	kfree(bfqd);
 }
 
+static void bfq_init_root_group(struct bfq_group *root_group,
+				struct bfq_data *bfqd)
+{
+	int i;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	root_group->entity.parent = NULL;
+	root_group->my_entity = NULL;
+	root_group->bfqd = bfqd;
+#endif
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+	root_group->sched_data.bfq_class_idle_last_service = jiffies;
+}
+
 static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 {
 	struct bfq_data *bfqd;
 	struct elevator_queue *eq;
-	int i;
 
 	eq = elevator_alloc(q, e);
 	if (!eq)
@@ -3860,6 +5618,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	}
 	eq->elevator_data = bfqd;
 
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
+
 	/*
 	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
 	 * Grab a permanent reference to it, so that the normal code flow
@@ -3880,8 +5642,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 
 	bfqd->queue = q;
 
-	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
-		bfqd->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+	INIT_LIST_HEAD(&bfqd->dispatch);
 
 	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_REL);
@@ -3899,17 +5660,40 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->bfq_back_max = bfq_back_max;
 	bfqd->bfq_back_penalty = bfq_back_penalty;
 	bfqd->bfq_slice_idle = bfq_slice_idle;
-	bfqd->bfq_class_idle_last_service = 0;
 	bfqd->bfq_timeout = bfq_timeout;
 
 	bfqd->bfq_requests_within_timer = 120;
 
 	spin_lock_init(&bfqd->lock);
-	INIT_LIST_HEAD(&bfqd->dispatch);
 
-	q->elevator = eq;
+	/*
+	 * The invocation of the next bfq_create_group_hierarchy
+	 * function is the head of a chain of function calls
+	 * (bfq_create_group_hierarchy->blkcg_activate_policy->
+	 * blk_mq_freeze_queue) that may lead to the invocation of the
+	 * has_work hook function. For this reason,
+	 * bfq_create_group_hierarchy is invoked only after all
+	 * scheduler data has been initialized, apart from the fields
+	 * that can be initialized only after invoking
+	 * bfq_create_group_hierarchy. This, in particular, enables
+	 * has_work to correctly return false. Of course, to avoid
+	 * other inconsistencies, the blk-mq stack must then refrain
+	 * from invoking further scheduler hooks before this init
+	 * function is finished.
+	 */
+	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
+	if (!bfqd->root_group)
+		goto out_free;
+	bfq_init_root_group(bfqd->root_group, bfqd);
+	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+
 
 	return 0;
+
+out_free:
+	kfree(bfqd);
+	kobject_put(&eq->kobj);
+	return -ENOMEM;
 }
 
 static void bfq_slab_kill(void)
@@ -4134,10 +5918,34 @@ static struct elevator_type iosched_bfq_mq = {
 	.elevator_owner =	THIS_MODULE,
 };
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct blkcg_policy blkcg_policy_bfq = {
+	.dfl_cftypes		= bfq_blkg_files,
+	.legacy_cftypes		= bfq_blkcg_legacy_files,
+
+	.cpd_alloc_fn		= bfq_cpd_alloc,
+	.cpd_init_fn		= bfq_cpd_init,
+	.cpd_bind_fn	        = bfq_cpd_init,
+	.cpd_free_fn		= bfq_cpd_free,
+
+	.pd_alloc_fn		= bfq_pd_alloc,
+	.pd_init_fn		= bfq_pd_init,
+	.pd_offline_fn		= bfq_pd_offline,
+	.pd_free_fn		= bfq_pd_free,
+	.pd_reset_stats_fn	= bfq_pd_reset_stats,
+};
+#endif
+
 static int __init bfq_init(void)
 {
 	int ret;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	ret = blkcg_policy_register(&blkcg_policy_bfq);
+	if (ret)
+		return ret;
+#endif
+
 	ret = -ENOMEM;
 	if (bfq_slab_setup())
 		goto err_pol_unreg;
@@ -4149,12 +5957,18 @@ static int __init bfq_init(void)
 	return 0;
 
 err_pol_unreg:
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
 	return ret;
 }
 
 static void __exit bfq_exit(void)
 {
 	elv_unregister(&iosched_bfq_mq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
 	bfq_slab_kill();
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ec993573e0a8..fe9c512cc6fa 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -50,7 +50,7 @@ struct blk_stat_callback;
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS		2
+#define BLKCG_MAX_POLS		3
 
 typedef void (rq_end_io_fn)(struct request *, int);
 
-- 
cgit v1.3-6-gb490


From 5f1ae4ebe578319a0cd5dae9591dd426070be106 Mon Sep 17 00:00:00 2001
From: Fu Wei <fu.wei@linaro.org>
Date: Sat, 1 Apr 2017 01:51:01 +0800
Subject: acpi/arm64: Add GTDT table parse driver

This patch adds support for parsing arch timer info in GTDT,
provides some kernel APIs to parse all the PPIs and
always-on info in GTDT and export them.

By this driver, we can simplify arm_arch_timer drivers, and
separate the ACPI GTDT knowledge from it.

Signed-off-by: Fu Wei <fu.wei@linaro.org>
Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: Hanjun Guo <hanjun.guo@linaro.org>
Tested-by: Hanjun Guo <hanjun.guo@linaro.org>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
---
 arch/arm64/Kconfig          |   1 +
 drivers/acpi/arm64/Kconfig  |   3 +
 drivers/acpi/arm64/Makefile |   1 +
 drivers/acpi/arm64/gtdt.c   | 157 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h        |   6 ++
 5 files changed, 168 insertions(+)
 create mode 100644 drivers/acpi/arm64/gtdt.c

(limited to 'include/linux')

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3741859765cf..7e2baec6f23a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2,6 +2,7 @@ config ARM64
 	def_bool y
 	select ACPI_CCA_REQUIRED if ACPI
 	select ACPI_GENERIC_GSI if ACPI
+	select ACPI_GTDT if ACPI
 	select ACPI_REDUCED_HARDWARE_ONLY if ACPI
 	select ACPI_MCFG if ACPI
 	select ACPI_SPCR_TABLE if ACPI
diff --git a/drivers/acpi/arm64/Kconfig b/drivers/acpi/arm64/Kconfig
index 4616da4c15be..5a6f80fce0d6 100644
--- a/drivers/acpi/arm64/Kconfig
+++ b/drivers/acpi/arm64/Kconfig
@@ -4,3 +4,6 @@
 
 config ACPI_IORT
 	bool
+
+config ACPI_GTDT
+	bool
diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index 72331f2ce0e9..1017def2ea12 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_ACPI_IORT) 	+= iort.o
+obj-$(CONFIG_ACPI_GTDT) 	+= gtdt.o
diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
new file mode 100644
index 000000000000..3d95af8f73c6
--- /dev/null
+++ b/drivers/acpi/arm64/gtdt.c
@@ -0,0 +1,157 @@
+/*
+ * ARM Specific GTDT table Support
+ *
+ * Copyright (C) 2016, Linaro Ltd.
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *         Fu Wei <fu.wei@linaro.org>
+ *         Hanjun Guo <hanjun.guo@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/acpi.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+#include <clocksource/arm_arch_timer.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "ACPI GTDT: " fmt
+
+/**
+ * struct acpi_gtdt_descriptor - Store the key info of GTDT for all functions
+ * @gtdt:	The pointer to the struct acpi_table_gtdt of GTDT table.
+ * @gtdt_end:	The pointer to the end of GTDT table.
+ * @platform_timer:	The pointer to the start of Platform Timer Structure
+ *
+ * The struct store the key info of GTDT table, it should be initialized by
+ * acpi_gtdt_init.
+ */
+struct acpi_gtdt_descriptor {
+	struct acpi_table_gtdt *gtdt;
+	void *gtdt_end;
+	void *platform_timer;
+};
+
+static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata;
+
+static int __init map_gt_gsi(u32 interrupt, u32 flags)
+{
+	int trigger, polarity;
+
+	trigger = (flags & ACPI_GTDT_INTERRUPT_MODE) ? ACPI_EDGE_SENSITIVE
+			: ACPI_LEVEL_SENSITIVE;
+
+	polarity = (flags & ACPI_GTDT_INTERRUPT_POLARITY) ? ACPI_ACTIVE_LOW
+			: ACPI_ACTIVE_HIGH;
+
+	return acpi_register_gsi(NULL, interrupt, trigger, polarity);
+}
+
+/**
+ * acpi_gtdt_map_ppi() - Map the PPIs of per-cpu arch_timer.
+ * @type:	the type of PPI.
+ *
+ * Note: Secure state is not managed by the kernel on ARM64 systems.
+ * So we only handle the non-secure timer PPIs,
+ * ARCH_TIMER_PHYS_SECURE_PPI is treated as invalid type.
+ *
+ * Return: the mapped PPI value, 0 if error.
+ */
+int __init acpi_gtdt_map_ppi(int type)
+{
+	struct acpi_table_gtdt *gtdt = acpi_gtdt_desc.gtdt;
+
+	switch (type) {
+	case ARCH_TIMER_PHYS_NONSECURE_PPI:
+		return map_gt_gsi(gtdt->non_secure_el1_interrupt,
+				  gtdt->non_secure_el1_flags);
+	case ARCH_TIMER_VIRT_PPI:
+		return map_gt_gsi(gtdt->virtual_timer_interrupt,
+				  gtdt->virtual_timer_flags);
+
+	case ARCH_TIMER_HYP_PPI:
+		return map_gt_gsi(gtdt->non_secure_el2_interrupt,
+				  gtdt->non_secure_el2_flags);
+	default:
+		pr_err("Failed to map timer interrupt: invalid type.\n");
+	}
+
+	return 0;
+}
+
+/**
+ * acpi_gtdt_c3stop() - Got c3stop info from GTDT according to the type of PPI.
+ * @type:	the type of PPI.
+ *
+ * Return: true if the timer HW state is lost when a CPU enters an idle state,
+ * false otherwise
+ */
+bool __init acpi_gtdt_c3stop(int type)
+{
+	struct acpi_table_gtdt *gtdt = acpi_gtdt_desc.gtdt;
+
+	switch (type) {
+	case ARCH_TIMER_PHYS_NONSECURE_PPI:
+		return !(gtdt->non_secure_el1_flags & ACPI_GTDT_ALWAYS_ON);
+
+	case ARCH_TIMER_VIRT_PPI:
+		return !(gtdt->virtual_timer_flags & ACPI_GTDT_ALWAYS_ON);
+
+	case ARCH_TIMER_HYP_PPI:
+		return !(gtdt->non_secure_el2_flags & ACPI_GTDT_ALWAYS_ON);
+
+	default:
+		pr_err("Failed to get c3stop info: invalid type.\n");
+	}
+
+	return false;
+}
+
+/**
+ * acpi_gtdt_init() - Get the info of GTDT table to prepare for further init.
+ * @table:			The pointer to GTDT table.
+ * @platform_timer_count:	It points to a integer variable which is used
+ *				for storing the number of platform timers.
+ *				This pointer could be NULL, if the caller
+ *				doesn't need this info.
+ *
+ * Return: 0 if success, -EINVAL if error.
+ */
+int __init acpi_gtdt_init(struct acpi_table_header *table,
+			  int *platform_timer_count)
+{
+	void *platform_timer;
+	struct acpi_table_gtdt *gtdt;
+
+	gtdt = container_of(table, struct acpi_table_gtdt, header);
+	acpi_gtdt_desc.gtdt = gtdt;
+	acpi_gtdt_desc.gtdt_end = (void *)table + table->length;
+	acpi_gtdt_desc.platform_timer = NULL;
+	if (platform_timer_count)
+		*platform_timer_count = 0;
+
+	if (table->revision < 2) {
+		pr_warn("Revision:%d doesn't support Platform Timers.\n",
+			table->revision);
+		return 0;
+	}
+
+	if (!gtdt->platform_timer_count) {
+		pr_debug("No Platform Timer.\n");
+		return 0;
+	}
+
+	platform_timer = (void *)gtdt + gtdt->platform_timer_offset;
+	if (platform_timer < (void *)table + sizeof(struct acpi_table_gtdt)) {
+		pr_err(FW_BUG "invalid timer data.\n");
+		return -EINVAL;
+	}
+	acpi_gtdt_desc.platform_timer = platform_timer;
+	if (platform_timer_count)
+		*platform_timer_count = gtdt->platform_timer_count;
+
+	return 0;
+}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9b05886f9773..4b5c146fba97 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -595,6 +595,12 @@ enum acpi_reconfig_event  {
 int acpi_reconfig_notifier_register(struct notifier_block *nb);
 int acpi_reconfig_notifier_unregister(struct notifier_block *nb);
 
+#ifdef CONFIG_ACPI_GTDT
+int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count);
+int acpi_gtdt_map_ppi(int type);
+bool acpi_gtdt_c3stop(int type);
+#endif
+
 #else	/* !CONFIG_ACPI */
 
 #define acpi_disabled 1
-- 
cgit v1.3-6-gb490


From a712c3ed9b8a4565a200f3e8e09c42079b1666b3 Mon Sep 17 00:00:00 2001
From: Fu Wei <fu.wei@linaro.org>
Date: Sat, 1 Apr 2017 01:51:03 +0800
Subject: acpi/arm64: Add memory-mapped timer support in GTDT driver

On platforms booting with ACPI, architected memory-mapped timers'
configuration data is provided by firmware through the ACPI GTDT
static table.

The clocksource architected timer kernel driver requires a firmware
interface to collect timer configuration and configure its driver.
this infrastructure is present for device tree systems, but it is
missing on systems booting with ACPI.

Implement the kernel infrastructure required to parse the static
ACPI GTDT table so that the architected timer clocksource driver can
make use of it on systems booting with ACPI, therefore enabling
the corresponding timers configuration.

Signed-off-by: Fu Wei <fu.wei@linaro.org>
Signed-off-by: Hanjun Guo <hanjun.guo@linaro.org>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
[Mark: restructure error handling]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
---
 drivers/acpi/arm64/gtdt.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h      |   1 +
 2 files changed, 158 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/arm64/gtdt.c b/drivers/acpi/arm64/gtdt.c
index 3d95af8f73c6..b9046678fb43 100644
--- a/drivers/acpi/arm64/gtdt.c
+++ b/drivers/acpi/arm64/gtdt.c
@@ -13,6 +13,7 @@
 
 #include <linux/acpi.h>
 #include <linux/init.h>
+#include <linux/irqdomain.h>
 #include <linux/kernel.h>
 
 #include <clocksource/arm_arch_timer.h>
@@ -37,6 +38,28 @@ struct acpi_gtdt_descriptor {
 
 static struct acpi_gtdt_descriptor acpi_gtdt_desc __initdata;
 
+static inline void *next_platform_timer(void *platform_timer)
+{
+	struct acpi_gtdt_header *gh = platform_timer;
+
+	platform_timer += gh->length;
+	if (platform_timer < acpi_gtdt_desc.gtdt_end)
+		return platform_timer;
+
+	return NULL;
+}
+
+#define for_each_platform_timer(_g)				\
+	for (_g = acpi_gtdt_desc.platform_timer; _g;	\
+	     _g = next_platform_timer(_g))
+
+static inline bool is_timer_block(void *platform_timer)
+{
+	struct acpi_gtdt_header *gh = platform_timer;
+
+	return gh->type == ACPI_GTDT_TYPE_TIMER_BLOCK;
+}
+
 static int __init map_gt_gsi(u32 interrupt, u32 flags)
 {
 	int trigger, polarity;
@@ -155,3 +178,137 @@ int __init acpi_gtdt_init(struct acpi_table_header *table,
 
 	return 0;
 }
+
+static int __init gtdt_parse_timer_block(struct acpi_gtdt_timer_block *block,
+					 struct arch_timer_mem *timer_mem)
+{
+	int i;
+	struct arch_timer_mem_frame *frame;
+	struct acpi_gtdt_timer_entry *gtdt_frame;
+
+	if (!block->timer_count) {
+		pr_err(FW_BUG "GT block present, but frame count is zero.");
+		return -ENODEV;
+	}
+
+	if (block->timer_count > ARCH_TIMER_MEM_MAX_FRAMES) {
+		pr_err(FW_BUG "GT block lists %d frames, ACPI spec only allows 8\n",
+		       block->timer_count);
+		return -EINVAL;
+	}
+
+	timer_mem->cntctlbase = (phys_addr_t)block->block_address;
+	/*
+	 * The CNTCTLBase frame is 4KB (register offsets 0x000 - 0xFFC).
+	 * See ARM DDI 0487A.k_iss10775, page I1-5129, Table I1-3
+	 * "CNTCTLBase memory map".
+	 */
+	timer_mem->size = SZ_4K;
+
+	gtdt_frame = (void *)block + block->timer_offset;
+	if (gtdt_frame + block->timer_count != (void *)block + block->header.length)
+		return -EINVAL;
+
+	/*
+	 * Get the GT timer Frame data for every GT Block Timer
+	 */
+	for (i = 0; i < block->timer_count; i++, gtdt_frame++) {
+		if (gtdt_frame->common_flags & ACPI_GTDT_GT_IS_SECURE_TIMER)
+			continue;
+		if (gtdt_frame->frame_number >= ARCH_TIMER_MEM_MAX_FRAMES ||
+		    !gtdt_frame->base_address || !gtdt_frame->timer_interrupt)
+			goto error;
+
+		frame = &timer_mem->frame[gtdt_frame->frame_number];
+
+		/* duplicate frame */
+		if (frame->valid)
+			goto error;
+
+		frame->phys_irq = map_gt_gsi(gtdt_frame->timer_interrupt,
+					     gtdt_frame->timer_flags);
+		if (frame->phys_irq <= 0) {
+			pr_warn("failed to map physical timer irq in frame %d.\n",
+				gtdt_frame->frame_number);
+			goto error;
+		}
+
+		if (gtdt_frame->virtual_timer_interrupt) {
+			frame->virt_irq =
+				map_gt_gsi(gtdt_frame->virtual_timer_interrupt,
+					   gtdt_frame->virtual_timer_flags);
+			if (frame->virt_irq <= 0) {
+				pr_warn("failed to map virtual timer irq in frame %d.\n",
+					gtdt_frame->frame_number);
+				goto error;
+			}
+		} else {
+			pr_debug("virtual timer in frame %d not implemented.\n",
+				 gtdt_frame->frame_number);
+		}
+
+		frame->cntbase = gtdt_frame->base_address;
+		/*
+		 * The CNTBaseN frame is 4KB (register offsets 0x000 - 0xFFC).
+		 * See ARM DDI 0487A.k_iss10775, page I1-5130, Table I1-4
+		 * "CNTBaseN memory map".
+		 */
+		frame->size = SZ_4K;
+		frame->valid = true;
+	}
+
+	return 0;
+
+error:
+	do {
+		if (gtdt_frame->common_flags & ACPI_GTDT_GT_IS_SECURE_TIMER ||
+		    gtdt_frame->frame_number >= ARCH_TIMER_MEM_MAX_FRAMES)
+			continue;
+
+		frame = &timer_mem->frame[gtdt_frame->frame_number];
+
+		if (frame->phys_irq > 0)
+			acpi_unregister_gsi(gtdt_frame->timer_interrupt);
+		frame->phys_irq = 0;
+
+		if (frame->virt_irq > 0)
+			acpi_unregister_gsi(gtdt_frame->virtual_timer_interrupt);
+		frame->virt_irq = 0;
+	} while (i-- >= 0 && gtdt_frame--);
+
+	return -EINVAL;
+}
+
+/**
+ * acpi_arch_timer_mem_init() - Get the info of all GT blocks in GTDT table.
+ * @timer_mem:	The pointer to the array of struct arch_timer_mem for returning
+ *		the result of parsing. The element number of this array should
+ *		be platform_timer_count(the total number of platform timers).
+ * @timer_count: It points to a integer variable which is used for storing the
+ *		number of GT blocks we have parsed.
+ *
+ * Return: 0 if success, -EINVAL/-ENODEV if error.
+ */
+int __init acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem,
+				    int *timer_count)
+{
+	int ret;
+	void *platform_timer;
+
+	*timer_count = 0;
+	for_each_platform_timer(platform_timer) {
+		if (is_timer_block(platform_timer)) {
+			ret = gtdt_parse_timer_block(platform_timer, timer_mem);
+			if (ret)
+				return ret;
+			timer_mem++;
+			(*timer_count)++;
+		}
+	}
+
+	if (*timer_count)
+		pr_info("found %d memory-mapped timer block(s).\n",
+			*timer_count);
+
+	return 0;
+}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 4b5c146fba97..31937249f8cc 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -599,6 +599,7 @@ int acpi_reconfig_notifier_unregister(struct notifier_block *nb);
 int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count);
 int acpi_gtdt_map_ppi(int type);
 bool acpi_gtdt_c3stop(int type);
+int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count);
 #endif
 
 #else	/* !CONFIG_ACPI */
-- 
cgit v1.3-6-gb490


From 314fe91b4a99949bb720501ba74d2228093bbf47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 12 Apr 2017 12:13:57 +0200
Subject: block: remove blk_end_request_err and __blk_end_request_err

Both functions are entirely unused.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 39 ---------------------------------------
 include/linux/blkdev.h |  2 --
 2 files changed, 41 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 8654aa0cef6d..f6e18ab551e7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2875,25 +2875,6 @@ bool blk_end_request_cur(struct request *rq, int error)
 }
 EXPORT_SYMBOL(blk_end_request_cur);
 
-/**
- * blk_end_request_err - Finish a request till the next failure boundary.
- * @rq: the request to finish till the next failure boundary for
- * @error: must be negative errno
- *
- * Description:
- *     Complete @rq till the next failure boundary.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-bool blk_end_request_err(struct request *rq, int error)
-{
-	WARN_ON(error >= 0);
-	return blk_end_request(rq, error, blk_rq_err_bytes(rq));
-}
-EXPORT_SYMBOL_GPL(blk_end_request_err);
-
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
@@ -2953,26 +2934,6 @@ bool __blk_end_request_cur(struct request *rq, int error)
 }
 EXPORT_SYMBOL(__blk_end_request_cur);
 
-/**
- * __blk_end_request_err - Finish a request till the next failure boundary.
- * @rq: the request to finish till the next failure boundary for
- * @error: must be negative errno
- *
- * Description:
- *     Complete @rq till the next failure boundary.  Must be called
- *     with queue lock held.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-bool __blk_end_request_err(struct request *rq, int error)
-{
-	WARN_ON(error >= 0);
-	return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
-}
-EXPORT_SYMBOL_GPL(__blk_end_request_err);
-
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fe9c512cc6fa..cca704c80b01 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1127,12 +1127,10 @@ extern bool blk_end_request(struct request *rq, int error,
 			    unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, int error);
 extern bool blk_end_request_cur(struct request *rq, int error);
-extern bool blk_end_request_err(struct request *rq, int error);
 extern bool __blk_end_request(struct request *rq, int error,
 			      unsigned int nr_bytes);
 extern void __blk_end_request_all(struct request *rq, int error);
 extern bool __blk_end_request_cur(struct request *rq, int error);
-extern bool __blk_end_request_err(struct request *rq, int error);
 
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
-- 
cgit v1.3-6-gb490


From fa1a15c08e23cb89c5837915b1989909bce47456 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 12 Apr 2017 12:13:58 +0200
Subject: block: remove blk_end_request_cur

This function is not used anywhere in the kernel.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 18 ------------------
 include/linux/blkdev.h |  1 -
 2 files changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index f6e18ab551e7..728299323f65 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2857,24 +2857,6 @@ void blk_end_request_all(struct request *rq, int error)
 }
 EXPORT_SYMBOL(blk_end_request_all);
 
-/**
- * blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @error: %0 for success, < %0 for error
- *
- * Description:
- *     Complete the current consecutively mapped chunk from @rq.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-bool blk_end_request_cur(struct request *rq, int error)
-{
-	return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
-EXPORT_SYMBOL(blk_end_request_cur);
-
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cca704c80b01..5b52b3d7818c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1126,7 +1126,6 @@ extern void blk_finish_request(struct request *rq, int error);
 extern bool blk_end_request(struct request *rq, int error,
 			    unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, int error);
-extern bool blk_end_request_cur(struct request *rq, int error);
 extern bool __blk_end_request(struct request *rq, int error,
 			      unsigned int nr_bytes);
 extern void __blk_end_request_all(struct request *rq, int error);
-- 
cgit v1.3-6-gb490


From 28c5fe99016d28f15d1b825df8acb1558a3a63a1 Mon Sep 17 00:00:00 2001
From: Felix Brack <fb@ltec.ch>
Date: Thu, 13 Apr 2017 09:51:38 +0200
Subject: leds: pca9532: Extend pca9532 device tree support

This patch extends the device tree support for the pca9532 by adding
the leds 'default-state' property.

Signed-off-by: Felix Brack <fb@ltec.ch>
Signed-off-by: Jacek Anaszewski <jacek.anaszewski@gmail.com>
---
 .../devicetree/bindings/leds/leds-pca9532.txt      | 10 ++++++++
 drivers/leds/leds-pca9532.c                        | 27 +++++++++++++++++++++-
 include/linux/leds-pca9532.h                       |  4 ++--
 3 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/leds/leds-pca9532.txt b/Documentation/devicetree/bindings/leds/leds-pca9532.txt
index 198f3ba0e01f..f769c52e3643 100644
--- a/Documentation/devicetree/bindings/leds/leds-pca9532.txt
+++ b/Documentation/devicetree/bindings/leds/leds-pca9532.txt
@@ -17,6 +17,8 @@ Optional sub-node properties:
 	- label: see Documentation/devicetree/bindings/leds/common.txt
 	- type: Output configuration, see dt-bindings/leds/leds-pca9532.h (default NONE)
 	- linux,default-trigger: see Documentation/devicetree/bindings/leds/common.txt
+	- default-state: see Documentation/devicetree/bindings/leds/common.txt
+	  This property is only valid for sub-nodes of type <PCA9532_TYPE_LED>.
 
 Example:
   #include <dt-bindings/leds/leds-pca9532.h>
@@ -33,6 +35,14 @@ Example:
       label = "pca:green:power";
       type = <PCA9532_TYPE_LED>;
     };
+    kernel-booting {
+      type = <PCA9532_TYPE_LED>;
+      default-state = "on";
+    };
+    sys-stat {
+      type = <PCA9532_TYPE_LED>;
+      default-state = "keep"; // don't touch, was set by U-Boot
+    };
   };
 
 For more product information please see the link below:
diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c
index 06e63106ae1e..7fea18b0c15d 100644
--- a/drivers/leds/leds-pca9532.c
+++ b/drivers/leds/leds-pca9532.c
@@ -254,6 +254,21 @@ static void pca9532_input_work(struct work_struct *work)
 	mutex_unlock(&data->update_lock);
 }
 
+static enum pca9532_state pca9532_getled(struct pca9532_led *led)
+{
+	struct i2c_client *client = led->client;
+	struct pca9532_data *data = i2c_get_clientdata(client);
+	u8 maxleds = data->chip_info->num_leds;
+	char reg;
+	enum pca9532_state ret;
+
+	mutex_lock(&data->update_lock);
+	reg = i2c_smbus_read_byte_data(client, LED_REG(maxleds, led->id));
+	ret = reg >> LED_NUM(led->id)/2;
+	mutex_unlock(&data->update_lock);
+	return ret;
+}
+
 #ifdef CONFIG_LEDS_PCA9532_GPIO
 static int pca9532_gpio_request_pin(struct gpio_chip *gc, unsigned offset)
 {
@@ -366,7 +381,10 @@ static int pca9532_configure(struct i2c_client *client,
 			gpios++;
 			break;
 		case PCA9532_TYPE_LED:
-			led->state = pled->state;
+			if (pled->state == PCA9532_KEEP)
+				led->state = pca9532_getled(led);
+			else
+				led->state = pled->state;
 			led->name = pled->name;
 			led->ldev.name = led->name;
 			led->ldev.default_trigger = pled->default_trigger;
@@ -456,6 +474,7 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np)
 	const struct of_device_id *match;
 	int devid, maxleds;
 	int i = 0;
+	const char *state;
 
 	match = of_match_device(of_pca9532_leds_match, dev);
 	if (!match)
@@ -475,6 +494,12 @@ pca9532_of_populate_pdata(struct device *dev, struct device_node *np)
 		of_property_read_u32(child, "type", &pdata->leds[i].type);
 		of_property_read_string(child, "linux,default-trigger",
 					&pdata->leds[i].default_trigger);
+		if (!of_property_read_string(child, "default-state", &state)) {
+			if (!strcmp(state, "on"))
+				pdata->leds[i].state = PCA9532_ON;
+			else if (!strcmp(state, "keep"))
+				pdata->leds[i].state = PCA9532_KEEP;
+		}
 		if (++i >= maxleds) {
 			of_node_put(child);
 			break;
diff --git a/include/linux/leds-pca9532.h b/include/linux/leds-pca9532.h
index d215b4561180..5e240b2b4d58 100644
--- a/include/linux/leds-pca9532.h
+++ b/include/linux/leds-pca9532.h
@@ -22,7 +22,8 @@ enum pca9532_state {
 	PCA9532_OFF  = 0x0,
 	PCA9532_ON   = 0x1,
 	PCA9532_PWM0 = 0x2,
-	PCA9532_PWM1 = 0x3
+	PCA9532_PWM1 = 0x3,
+	PCA9532_KEEP = 0xff,
 };
 
 struct pca9532_led {
@@ -44,4 +45,3 @@ struct pca9532_platform_data {
 };
 
 #endif /* __LINUX_PCA9532_H */
-
-- 
cgit v1.3-6-gb490


From 8661423eea1a1b58417014716e3f1ba286072379 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 19 Apr 2017 14:02:08 +0200
Subject: ACPI / utils: Add new acpi_dev_present helper

acpi_dev_found just iterates over all ACPI-ids and sees if one matches.
This means that it will return true for devices which are in the DSDT
but disabled (their _STA method returns 0).

For some drivers it is useful to be able to check if a certain HID
is not only present in the namespace, but also actually present as in
acpi_device_is_present() will return true for the device. For example
because if a certain device is present then the driver will want to use
an extcon or IIO ADC channel provided by that device.

This commit adds a new acpi_dev_present helper which drivers can use
to this end.

Like acpi_dev_found, acpi_dev_present take a HID as argument, but
it also has 2 extra optional arguments to only check for an ACPI
device with a specific UID and/or HRV value. This makes it more
generic and allows it to replace custom code doing similar checks
in several places.

Arguably acpi_dev_present is what acpi_dev_found should have been, but
there are too many users to just change acpi_dev_found without the risk
of breaking something.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/utils.c    | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/acpi/acpi_bus.h |  1 +
 include/linux/acpi.h    |  5 ++++
 3 files changed, 72 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c
index 22c09952e177..27d0dcfcf47d 100644
--- a/drivers/acpi/utils.c
+++ b/drivers/acpi/utils.c
@@ -736,6 +736,72 @@ bool acpi_dev_found(const char *hid)
 }
 EXPORT_SYMBOL(acpi_dev_found);
 
+struct acpi_dev_present_info {
+	struct acpi_device_id hid[2];
+	const char *uid;
+	s64 hrv;
+};
+
+static int acpi_dev_present_cb(struct device *dev, void *data)
+{
+	struct acpi_device *adev = to_acpi_device(dev);
+	struct acpi_dev_present_info *match = data;
+	unsigned long long hrv;
+	acpi_status status;
+
+	if (acpi_match_device_ids(adev, match->hid))
+		return 0;
+
+	if (match->uid && (!adev->pnp.unique_id ||
+	    strcmp(adev->pnp.unique_id, match->uid)))
+		return 0;
+
+	if (match->hrv == -1)
+		return 1;
+
+	status = acpi_evaluate_integer(adev->handle, "_HRV", NULL, &hrv);
+	if (ACPI_FAILURE(status))
+		return 0;
+
+	return hrv == match->hrv;
+}
+
+/**
+ * acpi_dev_present - Detect that a given ACPI device is present
+ * @hid: Hardware ID of the device.
+ * @uid: Unique ID of the device, pass NULL to not check _UID
+ * @hrv: Hardware Revision of the device, pass -1 to not check _HRV
+ *
+ * Return %true if a matching device was present at the moment of invocation.
+ * Note that if the device is pluggable, it may since have disappeared.
+ *
+ * Note that unlike acpi_dev_found() this function checks the status
+ * of the device. So for devices which are present in the dsdt, but
+ * which are disabled (their _STA callback returns 0) this function
+ * will return false.
+ *
+ * For this function to work, acpi_bus_scan() must have been executed
+ * which happens in the subsys_initcall() subsection. Hence, do not
+ * call from a subsys_initcall() or earlier (use acpi_get_devices()
+ * instead). Calling from module_init() is fine (which is synonymous
+ * with device_initcall()).
+ */
+bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
+{
+	struct acpi_dev_present_info match = {};
+	struct device *dev;
+
+	strlcpy(match.hid[0].id, hid, sizeof(match.hid[0].id));
+	match.uid = uid;
+	match.hrv = hrv;
+
+	dev = bus_find_device(&acpi_bus_type, NULL, &match,
+			      acpi_dev_present_cb);
+
+	return !!dev;
+}
+EXPORT_SYMBOL(acpi_dev_present);
+
 /*
  * acpi_backlight= handling, this is done here rather then in video_detect.c
  * because __setup cannot be used in modules.
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index ef0ae8aaa567..b53c058e7009 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -88,6 +88,7 @@ acpi_evaluate_dsm_typed(acpi_handle handle, const u8 *uuid, u64 rev, u64 func,
 	}
 
 bool acpi_dev_found(const char *hid);
+bool acpi_dev_present(const char *hid, const char *uid, s64 hrv);
 
 #ifdef CONFIG_ACPI
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9b05886f9773..841a8dc55ade 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -611,6 +611,11 @@ static inline bool acpi_dev_found(const char *hid)
 	return false;
 }
 
+static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv)
+{
+	return false;
+}
+
 static inline bool is_acpi_node(struct fwnode_handle *fwnode)
 {
 	return false;
-- 
cgit v1.3-6-gb490


From da8d7f079b868ceab830309f80efc69d350576f3 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 19 Apr 2017 14:01:24 -0700
Subject: block: Export blk_init_request_from_bio()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export this function such that it becomes available to block
drivers.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Matias Bjørling <m@bjorling.me>
Cc: Adam Manzanares <adam.manzanares@wdc.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 5 +++--
 block/blk-mq.c         | 2 +-
 block/blk.h            | 1 -
 include/linux/blkdev.h | 1 +
 4 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 9697b789408f..72544b462657 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1628,7 +1628,7 @@ out:
 	return ret;
 }
 
-void init_request_from_bio(struct request *req, struct bio *bio)
+void blk_init_request_from_bio(struct request *req, struct bio *bio)
 {
 	if (bio->bi_opf & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
@@ -1640,6 +1640,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 		req->ioprio = bio_prio(bio);
 	blk_rq_bio_prep(req->q, req, bio);
 }
+EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
 
 static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 {
@@ -1730,7 +1731,7 @@ get_rq:
 	 * We don't worry about that case for efficiency. It won't happen
 	 * often, and the elevators are able to handle it.
 	 */
-	init_request_from_bio(req, bio);
+	blk_init_request_from_bio(req, bio);
 
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
 		req->cpu = raw_smp_processor_id();
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e2ef7b460924..c496692ecc5b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1424,7 +1424,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
-	init_request_from_bio(rq, bio);
+	blk_init_request_from_bio(rq, bio);
 
 	blk_account_io_start(rq, true);
 }
diff --git a/block/blk.h b/block/blk.h
index 35b3041eec1a..2ed70228e44f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -60,7 +60,6 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
 int blk_init_rl(struct request_list *rl, struct request_queue *q,
 		gfp_t gfp_mask);
 void blk_exit_rl(struct request_list *rl);
-void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
 void blk_queue_bypass_start(struct request_queue *q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5b52b3d7818c..3470375952a1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -924,6 +924,7 @@ extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern blk_qc_t generic_make_request(struct bio *bio);
 extern void blk_rq_init(struct request_queue *q, struct request *rq);
+extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
-- 
cgit v1.3-6-gb490


From 0be0dee64eacd950f8e4b6c45adb5a92392eaaaf Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 19 Apr 2017 14:01:27 -0700
Subject: block: Inline blk_rq_set_prio()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since only a single caller remains, inline blk_rq_set_prio(). Initialize
req->ioprio even if no I/O priority has been set in the bio nor in the
I/O context.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Adam Manzanares <adam.manzanares@wdc.com>
Tested-by: Adam Manzanares <adam.manzanares@wdc.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Matias Bjørling <m@bjorling.me>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  7 ++++++-
 include/linux/blkdev.h | 14 --------------
 2 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 72544b462657..25aea293ee98 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1630,14 +1630,19 @@ out:
 
 void blk_init_request_from_bio(struct request *req, struct bio *bio)
 {
+	struct io_context *ioc = rq_ioc(bio);
+
 	if (bio->bi_opf & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 
 	req->errors = 0;
 	req->__sector = bio->bi_iter.bi_sector;
-	blk_rq_set_prio(req, rq_ioc(bio));
 	if (ioprio_valid(bio_prio(bio)))
 		req->ioprio = bio_prio(bio);
+	else if (ioc)
+		req->ioprio = ioc->ioprio;
+	else
+		req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
 	blk_rq_bio_prep(req->q, req, bio);
 }
 EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3470375952a1..51c9e391798e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1087,20 +1087,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq)
 	return nr_bios;
 }
 
-/*
- * blk_rq_set_prio - associate a request with prio from ioc
- * @rq: request of interest
- * @ioc: target iocontext
- *
- * Assocate request prio with ioc prio so request based drivers
- * can leverage priority information.
- */
-static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc)
-{
-	if (ioc)
-		rq->ioprio = ioc->ioprio;
-}
-
 /*
  * Request issue related functions.
  */
-- 
cgit v1.3-6-gb490


From 0773cea37470f8e080c510fe720fc356cf35df3a Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <mka@chromium.org>
Date: Tue, 18 Apr 2017 16:30:37 -0700
Subject: clocksource: Use GENMASK_ULL in definition of CLOCKSOURCE_MASK

Besides reusing existing code this removes the special case handling
for 64-bit masks, which causes clang to raise a shift count overflow
warning due to https://bugs.llvm.org//show_bug.cgi?id=10030.

Suggested-by: Dmitry Torokhov <dtor@chromium.org>
Signed-off-by: Matthias Kaehlcke <mka@chromium.org>
Cc: Grant Grundler <grundler@chromium.org>
Cc: Greg Hackmann <ghackmann@google.com>
Cc: Michael Davidson <md@google.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20170418233037.70990-1-mka@chromium.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index cfc75848a35d..f2b10d9ebd04 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -120,7 +120,7 @@ struct clocksource {
 #define CLOCK_SOURCE_RESELECT			0x100
 
 /* simplify initialization of mask field */
-#define CLOCKSOURCE_MASK(bits) (u64)((bits) < 64 ? ((1ULL<<(bits))-1) : -1)
+#define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0)
 
 static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from)
 {
-- 
cgit v1.3-6-gb490


From 58bb100a9de10329ca0d63484e76f27c257e9a2e Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Wed, 19 Apr 2017 13:26:46 +0100
Subject: Clocksource/mips-gic: Remove redundant non devicetree init

Malta was the only platform probing this driver from platform code
without using device tree. With that code removed, gic_clocksource_init
is redundant so remove it.

Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Cc: linux-mips@linux-mips.org
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Link: http://lkml.kernel.org/r/1492604806-23420-2-git-send-email-matt.redfearn@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/clocksource/mips-gic-timer.c | 13 -------------
 include/linux/irqchip/mips-gic.h     |  1 -
 2 files changed, 14 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c
index b6ad9c0a7c55..3f52ee219923 100644
--- a/drivers/clocksource/mips-gic-timer.c
+++ b/drivers/clocksource/mips-gic-timer.c
@@ -154,19 +154,6 @@ static int __init __gic_clocksource_init(void)
 	return ret;
 }
 
-void __init gic_clocksource_init(unsigned int frequency)
-{
-	gic_frequency = frequency;
-	gic_timer_irq = MIPS_GIC_IRQ_BASE +
-		GIC_LOCAL_TO_HWIRQ(GIC_LOCAL_INT_COMPARE);
-
-	__gic_clocksource_init();
-	gic_clockevent_init();
-
-	/* And finally start the counter */
-	gic_start_count();
-}
-
 static int __init gic_clocksource_of_init(struct device_node *node)
 {
 	struct clk *clk;
diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h
index 7b49c71c968b..2b0e56619e53 100644
--- a/include/linux/irqchip/mips-gic.h
+++ b/include/linux/irqchip/mips-gic.h
@@ -258,7 +258,6 @@ extern unsigned int gic_present;
 extern void gic_init(unsigned long gic_base_addr,
 	unsigned long gic_addrspace_size, unsigned int cpu_vec,
 	unsigned int irqbase);
-extern void gic_clocksource_init(unsigned int);
 extern u64 gic_read_count(void);
 extern unsigned int gic_get_count_width(void);
 extern u64 gic_read_compare(void);
-- 
cgit v1.3-6-gb490


From baf7a616d537f577d33b7d9986f40532e2bd9f66 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Apr 2017 12:24:25 +0200
Subject: bdi: Provide bdi_register_va() and bdi_alloc()

Add function that registers bdi and takes va_list instead of variable
number of arguments.

Add bdi_alloc() as simple wrapper for NUMA-unaware users allocating BDI.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev.h |  6 ++++++
 mm/backing-dev.c            | 20 +++++++++++++++-----
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c52a48cb9a66..47a98e6e2a65 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -30,6 +30,8 @@ void bdi_put(struct backing_dev_info *bdi);
 __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
+int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, va_list args);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
 void bdi_unregister(struct backing_dev_info *bdi);
@@ -37,6 +39,10 @@ void bdi_unregister(struct backing_dev_info *bdi);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void bdi_destroy(struct backing_dev_info *bdi);
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
+static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
+{
+	return bdi_alloc_node(gfp_mask, NUMA_NO_NODE);
+}
 
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
 			bool range_cyclic, enum wb_reason reason);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3ea3bbd921d6..e5e0972bdd6f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -856,18 +856,15 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
 	return bdi;
 }
 
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-		const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, va_list args)
 {
-	va_list args;
 	struct device *dev;
 
 	if (bdi->dev)	/* The driver needs to use separate queues per device */
 		return 0;
 
-	va_start(args, fmt);
 	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
-	va_end(args);
 	if (IS_ERR(dev))
 		return PTR_ERR(dev);
 
@@ -884,6 +881,19 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 	trace_writeback_bdi_register(bdi);
 	return 0;
 }
+EXPORT_SYMBOL(bdi_register_va);
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+		const char *fmt, ...)
+{
+	va_list args;
+	int ret;
+
+	va_start(args, fmt);
+	ret = bdi_register_va(bdi, parent, fmt, args);
+	va_end(args);
+	return ret;
+}
 EXPORT_SYMBOL(bdi_register);
 
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-- 
cgit v1.3-6-gb490


From fca39346a55bb7196888ffc77d9e3557340d1d0b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Apr 2017 12:24:28 +0200
Subject: fs: Provide infrastructure for dynamic BDIs in filesystems

Provide helper functions for setting up dynamically allocated
backing_dev_info structures for filesystems and cleaning them up on
superblock destruction.

CC: linux-mtd@lists.infradead.org
CC: linux-nfs@vger.kernel.org
CC: Petr Vandrovec <petr@vandrovec.name>
CC: linux-nilfs@vger.kernel.org
CC: cluster-devel@redhat.com
CC: osd-dev@open-osd.org
CC: codalist@coda.cs.cmu.edu
CC: linux-afs@lists.infradead.org
CC: ecryptfs@vger.kernel.org
CC: linux-cifs@vger.kernel.org
CC: ceph-devel@vger.kernel.org
CC: linux-btrfs@vger.kernel.org
CC: v9fs-developer@lists.sourceforge.net
CC: lustre-devel@lists.lustre.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/super.c                       | 49 ++++++++++++++++++++++++++++++++++++++++
 include/linux/backing-dev-defs.h |  2 +-
 include/linux/fs.h               |  6 +++++
 3 files changed, 56 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/super.c b/fs/super.c
index b8b6a086c03b..0f51a437c269 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -446,6 +446,11 @@ void generic_shutdown_super(struct super_block *sb)
 	hlist_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
 	up_write(&sb->s_umount);
+	if (sb->s_iflags & SB_I_DYNBDI) {
+		bdi_put(sb->s_bdi);
+		sb->s_bdi = &noop_backing_dev_info;
+		sb->s_iflags &= ~SB_I_DYNBDI;
+	}
 }
 
 EXPORT_SYMBOL(generic_shutdown_super);
@@ -1255,6 +1260,50 @@ out:
 	return ERR_PTR(error);
 }
 
+/*
+ * Setup private BDI for given superblock. It gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
+{
+	struct backing_dev_info *bdi;
+	int err;
+	va_list args;
+
+	bdi = bdi_alloc(GFP_KERNEL);
+	if (!bdi)
+		return -ENOMEM;
+
+	bdi->name = sb->s_type->name;
+
+	va_start(args, fmt);
+	err = bdi_register_va(bdi, NULL, fmt, args);
+	va_end(args);
+	if (err) {
+		bdi_put(bdi);
+		return err;
+	}
+	WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+	sb->s_bdi = bdi;
+	sb->s_iflags |= SB_I_DYNBDI;
+
+	return 0;
+}
+EXPORT_SYMBOL(super_setup_bdi_name);
+
+/*
+ * Setup private BDI for given superblock. I gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi(struct super_block *sb)
+{
+	static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+	return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
+				    atomic_long_inc_return(&bdi_seq));
+}
+EXPORT_SYMBOL(super_setup_bdi);
+
 /*
  * This is an internal function, please use sb_end_{write,pagefault,intwrite}
  * instead.
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e66d4722db8e..866c433e7d32 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -146,7 +146,7 @@ struct backing_dev_info {
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
 
-	char *name;
+	const char *name;
 
 	struct kref refcnt;	/* Reference counter for the structure */
 	unsigned int capabilities; /* Device capabilities */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7251f7bb45e8..98cf14ea78c0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1272,6 +1272,9 @@ struct mm_struct;
 /* sb->s_iflags to limit user namespace mounts */
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
 
+/* Temporary flag until all filesystems are converted to dynamic bdis */
+#define SB_I_DYNBDI	0x00000100
+
 /* Possible states of 'frozen' field */
 enum {
 	SB_UNFROZEN = 0,		/* FS is unfrozen */
@@ -2121,6 +2124,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
 extern bool our_mnt(struct vfsmount *mnt);
+extern __printf(2, 3)
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
+extern int super_setup_bdi(struct super_block *sb);
 
 extern int current_umask(void);
 
-- 
cgit v1.3-6-gb490


From fa06052d637bf3a76f18cd2304048b866af4096e Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Apr 2017 12:24:37 +0200
Subject: mtd: Convert to dynamically allocated bdi infrastructure

MTD already allocates backing_dev_info dynamically. Convert it to use
generic infrastructure for this including proper refcounting. We drop
mtd->backing_dev_info as its only use was to pass mtd_bdi pointer from
one file into another and if we wanted to keep that in a clean way, we'd
have to make mtd hold and drop bdi reference as needed which seems
pointless for passing one global pointer...

CC: David Woodhouse <dwmw2@infradead.org>
CC: Brian Norris <computersforpeace@gmail.com>
CC: linux-mtd@lists.infradead.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/mtd/mtdcore.c   | 23 ++++++++++++-----------
 drivers/mtd/mtdsuper.c  |  7 ++++++-
 include/linux/mtd/mtd.h |  5 -----
 3 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 66a9dedd1062..23e2e56ca54e 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -46,7 +46,7 @@
 
 #include "mtdcore.h"
 
-static struct backing_dev_info *mtd_bdi;
+struct backing_dev_info *mtd_bdi;
 
 #ifdef CONFIG_PM_SLEEP
 
@@ -496,11 +496,9 @@ int add_mtd_device(struct mtd_info *mtd)
 	 * mtd_device_parse_register() multiple times on the same master MTD,
 	 * especially with CONFIG_MTD_PARTITIONED_MASTER=y.
 	 */
-	if (WARN_ONCE(mtd->backing_dev_info, "MTD already registered\n"))
+	if (WARN_ONCE(mtd->dev.type, "MTD already registered\n"))
 		return -EEXIST;
 
-	mtd->backing_dev_info = mtd_bdi;
-
 	BUG_ON(mtd->writesize == 0);
 	mutex_lock(&mtd_table_mutex);
 
@@ -1775,13 +1773,18 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name)
 	struct backing_dev_info *bdi;
 	int ret;
 
-	bdi = kzalloc(sizeof(*bdi), GFP_KERNEL);
+	bdi = bdi_alloc(GFP_KERNEL);
 	if (!bdi)
 		return ERR_PTR(-ENOMEM);
 
-	ret = bdi_setup_and_register(bdi, name);
+	bdi->name = name;
+	/*
+	 * We put '-0' suffix to the name to get the same name format as we
+	 * used to get. Since this is called only once, we get a unique name. 
+	 */
+	ret = bdi_register(bdi, NULL, "%.28s-0", name);
 	if (ret)
-		kfree(bdi);
+		bdi_put(bdi);
 
 	return ret ? ERR_PTR(ret) : bdi;
 }
@@ -1813,8 +1816,7 @@ static int __init init_mtd(void)
 out_procfs:
 	if (proc_mtd)
 		remove_proc_entry("mtd", NULL);
-	bdi_destroy(mtd_bdi);
-	kfree(mtd_bdi);
+	bdi_put(mtd_bdi);
 err_bdi:
 	class_unregister(&mtd_class);
 err_reg:
@@ -1828,8 +1830,7 @@ static void __exit cleanup_mtd(void)
 	if (proc_mtd)
 		remove_proc_entry("mtd", NULL);
 	class_unregister(&mtd_class);
-	bdi_destroy(mtd_bdi);
-	kfree(mtd_bdi);
+	bdi_put(mtd_bdi);
 	idr_destroy(&mtd_idr);
 }
 
diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 20c02a3b7417..e69e7855e31f 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -18,6 +18,7 @@
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/major.h>
+#include <linux/backing-dev.h>
 
 /*
  * compare superblocks to see if they're equivalent
@@ -38,6 +39,8 @@ static int get_sb_mtd_compare(struct super_block *sb, void *_mtd)
 	return 0;
 }
 
+extern struct backing_dev_info *mtd_bdi;
+
 /*
  * mark the superblock by the MTD device it is using
  * - set the device number to be the correct MTD block device for pesuperstence
@@ -49,7 +52,9 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd)
 
 	sb->s_mtd = mtd;
 	sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
-	sb->s_bdi = mtd->backing_dev_info;
+	sb->s_bdi = bdi_get(mtd_bdi);
+	sb->s_iflags |= SB_I_DYNBDI;
+
 	return 0;
 }
 
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index eebdc63cf6af..79b176eca04a 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -334,11 +334,6 @@ struct mtd_info {
 	int (*_get_device) (struct mtd_info *mtd);
 	void (*_put_device) (struct mtd_info *mtd);
 
-	/* Backing device capabilities for this device
-	 * - provides mmap capabilities
-	 */
-	struct backing_dev_info *backing_dev_info;
-
 	struct notifier_block reboot_notifier;  /* default mode before reboot */
 
 	/* ECC status information */
-- 
cgit v1.3-6-gb490


From a5695a79088653c73c92ae8d48658cbc49f31884 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Apr 2017 12:24:38 +0200
Subject: coda: Convert to separately allocated bdi

Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: Jan Harkes <jaharkes@cs.cmu.edu>
CC: coda@cs.cmu.edu
CC: codalist@coda.cs.cmu.edu
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/coda/inode.c            | 11 ++++-------
 include/linux/coda_psdev.h |  1 -
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2dea594da199..6058df380cc0 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 		goto unlock_out;
 	}
 
-	error = bdi_setup_and_register(&vc->bdi, "coda");
-	if (error)
-		goto unlock_out;
-
 	vc->vc_sb = sb;
 	mutex_unlock(&vc->vc_mutex);
 
@@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = CODA_SUPER_MAGIC;
 	sb->s_op = &coda_super_operations;
 	sb->s_d_op = &coda_dentry_operations;
-	sb->s_bdi = &vc->bdi;
+
+	error = super_setup_bdi(sb);
+	if (error)
+		goto error;
 
 	/* get root fid from Venus: this needs the root inode */
 	error = venus_rootfid(sb, &fid);
@@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 
 error:
 	mutex_lock(&vc->vc_mutex);
-	bdi_destroy(&vc->bdi);
 	vc->vc_sb = NULL;
 	sb->s_fs_info = NULL;
 unlock_out:
@@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb)
 {
 	struct venus_comm *vcp = coda_vcp(sb);
 	mutex_lock(&vcp->vc_mutex);
-	bdi_destroy(&vcp->bdi);
 	vcp->vc_sb = NULL;
 	sb->s_fs_info = NULL;
 	mutex_unlock(&vcp->vc_mutex);
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 5b8721efa948..31e4e1f1547c 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -15,7 +15,6 @@ struct venus_comm {
 	struct list_head    vc_processing;
 	int                 vc_inuse;
 	struct super_block *vc_sb;
-	struct backing_dev_info bdi;
 	struct mutex	    vc_mutex;
 };
 
-- 
cgit v1.3-6-gb490


From 0db10944a76ba09f37d43b99d0fe085a18307f22 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Apr 2017 12:24:45 +0200
Subject: nfs: Convert to separately allocated bdi

Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: Anna Schumaker <anna.schumaker@netapp.com>
CC: linux-nfs@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Acked-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/nfs/client.c           | 10 ----------
 fs/nfs/internal.h         |  6 +++---
 fs/nfs/super.c            | 34 +++++++++++++++++++---------------
 fs/nfs/write.c            | 13 ++++++-------
 include/linux/nfs_fs_sb.h |  1 -
 5 files changed, 28 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 390ada8741bc..04d15a0045e3 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -761,9 +761,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
 	server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-	server->backing_dev_info.name = "nfs";
-	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
 	if (server->wsize > max_rpc_payload)
 		server->wsize = max_rpc_payload;
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
@@ -917,12 +914,6 @@ struct nfs_server *nfs_alloc_server(void)
 		return NULL;
 	}
 
-	if (bdi_init(&server->backing_dev_info)) {
-		nfs_free_iostats(server->io_stats);
-		kfree(server);
-		return NULL;
-	}
-
 	ida_init(&server->openowner_id);
 	ida_init(&server->lockowner_id);
 	pnfs_init_server(server);
@@ -953,7 +944,6 @@ void nfs_free_server(struct nfs_server *server)
 	ida_destroy(&server->lockowner_id);
 	ida_destroy(&server->openowner_id);
 	nfs_free_iostats(server->io_stats);
-	bdi_destroy(&server->backing_dev_info);
 	kfree(server);
 	nfs_release_automount_timer();
 	dprintk("<-- nfs_free_server()\n");
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7b38fedb7e03..9dc65d7ae754 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -139,7 +139,7 @@ struct nfs_mount_request {
 };
 
 struct nfs_mount_info {
-	void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+	int (*fill_super)(struct super_block *, struct nfs_mount_info *);
 	int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
 	struct nfs_parsed_mount_data *parsed;
 	struct nfs_clone_mount *cloned;
@@ -407,7 +407,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *
 struct dentry * nfs_xdev_mount_common(struct file_system_type *, int,
 		const char *, struct nfs_mount_info *);
 void nfs_kill_super(struct super_block *);
-void nfs_fill_super(struct super_block *, struct nfs_mount_info *);
+int nfs_fill_super(struct super_block *, struct nfs_mount_info *);
 
 extern struct rpc_stat nfs_rpcstat;
 
@@ -458,7 +458,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 
 /* super.c */
-void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
+int nfs_clone_super(struct super_block *, struct nfs_mount_info *);
 void nfs_umount_begin(struct super_block *);
 int  nfs_statfs(struct dentry *, struct kstatfs *);
 int  nfs_show_options(struct seq_file *, struct dentry *);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 54e0f9f2dd94..8d97aa70407e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2315,18 +2315,17 @@ inline void nfs_initialise_sb(struct super_block *sb)
 		sb->s_blocksize = nfs_block_bits(server->wsize,
 						 &sb->s_blocksize_bits);
 
-	sb->s_bdi = &server->backing_dev_info;
-
 	nfs_super_set_maxbytes(sb, server->maxfilesize);
 }
 
 /*
  * Finish setting up an NFS2/3 superblock
  */
-void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 {
 	struct nfs_parsed_mount_data *data = mount_info->parsed;
 	struct nfs_server *server = NFS_SB(sb);
+	int ret;
 
 	sb->s_blocksize_bits = 0;
 	sb->s_blocksize = 0;
@@ -2344,13 +2343,21 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 	}
 
  	nfs_initialise_sb(sb);
+
+	ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev),
+				   MINOR(server->s_dev));
+	if (ret)
+		return ret;
+	sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+	return 0;
+
 }
 EXPORT_SYMBOL_GPL(nfs_fill_super);
 
 /*
  * Finish setting up a cloned NFS2/3/4 superblock
  */
-void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 {
 	const struct super_block *old_sb = mount_info->cloned->sb;
 	struct nfs_server *server = NFS_SB(sb);
@@ -2370,6 +2377,11 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 	}
 
  	nfs_initialise_sb(sb);
+
+	sb->s_bdi = bdi_get(old_sb->s_bdi);
+	sb->s_iflags |= SB_I_DYNBDI;
+
+	return 0;
 }
 
 static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
@@ -2522,11 +2534,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
 }
 #endif
 
-static int nfs_bdi_register(struct nfs_server *server)
-{
-	return bdi_register_dev(&server->backing_dev_info, server->s_dev);
-}
-
 int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
 			struct nfs_mount_info *mount_info)
 {
@@ -2594,17 +2601,14 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
 		nfs_free_server(server);
 		server = NULL;
 	} else {
-		error = nfs_bdi_register(server);
-		if (error) {
-			mntroot = ERR_PTR(error);
-			goto error_splat_super;
-		}
 		server->super = s;
 	}
 
 	if (!s->s_root) {
 		/* initial superblock/root creation */
-		mount_info->fill_super(s, mount_info);
+		error = mount_info->fill_super(s, mount_info);
+		if (error)
+			goto error_splat_super;
 		nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
 	}
 
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index abb2c8a3be42..cc341fc7fd44 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -263,16 +263,15 @@ int nfs_congestion_kb;
 
 static void nfs_set_page_writeback(struct page *page)
 {
-	struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
+	struct inode *inode = page_file_mapping(page)->host;
+	struct nfs_server *nfss = NFS_SERVER(inode);
 	int ret = test_set_page_writeback(page);
 
 	WARN_ON_ONCE(ret != 0);
 
 	if (atomic_long_inc_return(&nfss->writeback) >
-			NFS_CONGESTION_ON_THRESH) {
-		set_bdi_congested(&nfss->backing_dev_info,
-					BLK_RW_ASYNC);
-	}
+			NFS_CONGESTION_ON_THRESH)
+		set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 }
 
 static void nfs_end_page_writeback(struct nfs_page *req)
@@ -285,7 +284,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
 
 	end_page_writeback(req->wb_page);
 	if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
-		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+		clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 }
 
 
@@ -1808,7 +1807,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 	}
 	nfss = NFS_SERVER(data->inode);
 	if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
-		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+		clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
 
 	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
 	nfs_commit_end(cinfo.mds);
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b34097c67848..e1502c55741e 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -133,7 +133,6 @@ struct nfs_server {
 	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
 	struct nlm_host		*nlm_host;	/* NLM client handle */
 	struct nfs_iostats __percpu *io_stats;	/* I/O statistics */
-	struct backing_dev_info	backing_dev_info;
 	atomic_long_t		writeback;	/* number of writeback pages */
 	int			flags;		/* various flags */
 	unsigned int		caps;		/* server capabilities */
-- 
cgit v1.3-6-gb490


From c1844d536dafa5f2cddf4b4841a3634f80a27666 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Apr 2017 12:24:47 +0200
Subject: fs: Remove SB_I_DYNBDI flag

Now that all bdi structures filesystems use are properly refcounted, we
can remove the SB_I_DYNBDI flag.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/mtd/mtdsuper.c | 1 -
 fs/gfs2/ops_fstype.c   | 1 -
 fs/nfs/super.c         | 1 -
 fs/nilfs2/super.c      | 1 -
 fs/super.c             | 5 +----
 include/linux/fs.h     | 3 ---
 6 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index e69e7855e31f..e43fea896d1e 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -53,7 +53,6 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd)
 	sb->s_mtd = mtd;
 	sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
 	sb->s_bdi = bdi_get(mtd_bdi);
-	sb->s_iflags |= SB_I_DYNBDI;
 
 	return 0;
 }
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e6b6f97d0fc1..ed67548b286c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1224,7 +1224,6 @@ static int set_gfs2_super(struct super_block *s, void *data)
 	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
 	s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
-	s->s_iflags |= SB_I_DYNBDI;
 	return 0;
 }
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 8d97aa70407e..dc69314d455e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2379,7 +2379,6 @@ int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
  	nfs_initialise_sb(sb);
 
 	sb->s_bdi = bdi_get(old_sb->s_bdi);
-	sb->s_iflags |= SB_I_DYNBDI;
 
 	return 0;
 }
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index feb796a38b8d..926682981d61 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1069,7 +1069,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_max_links = NILFS_LINK_MAX;
 
 	sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi);
-	sb->s_iflags |= SB_I_DYNBDI;
 
 	err = load_nilfs(nilfs, sb);
 	if (err)
diff --git a/fs/super.c b/fs/super.c
index e267d3a00144..8444d26926ef 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -446,10 +446,9 @@ void generic_shutdown_super(struct super_block *sb)
 	hlist_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
 	up_write(&sb->s_umount);
-	if (sb->s_iflags & SB_I_DYNBDI) {
+	if (sb->s_bdi != &noop_backing_dev_info) {
 		bdi_put(sb->s_bdi);
 		sb->s_bdi = &noop_backing_dev_info;
-		sb->s_iflags &= ~SB_I_DYNBDI;
 	}
 }
 
@@ -1055,7 +1054,6 @@ static int set_bdev_super(struct super_block *s, void *data)
 	s->s_bdev = data;
 	s->s_dev = s->s_bdev->bd_dev;
 	s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
-	s->s_iflags |= SB_I_DYNBDI;
 
 	return 0;
 }
@@ -1282,7 +1280,6 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
 	}
 	WARN_ON(sb->s_bdi != &noop_backing_dev_info);
 	sb->s_bdi = bdi;
-	sb->s_iflags |= SB_I_DYNBDI;
 
 	return 0;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98cf14ea78c0..30e5c14bd743 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1272,9 +1272,6 @@ struct mm_struct;
 /* sb->s_iflags to limit user namespace mounts */
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
 
-/* Temporary flag until all filesystems are converted to dynamic bdis */
-#define SB_I_DYNBDI	0x00000100
-
 /* Possible states of 'frozen' field */
 enum {
 	SB_UNFROZEN = 0,		/* FS is unfrozen */
-- 
cgit v1.3-6-gb490


From 2e82b84c01d9438d86079980e22e036eee71e754 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Apr 2017 12:24:48 +0200
Subject: block: Remove unused functions

Now that all backing_dev_info structure are allocated separately, we can
drop some unused functions.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/backing-dev.h |  5 ----
 mm/backing-dev.c            | 56 +++++----------------------------------------
 2 files changed, 6 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 47a98e6e2a65..aaeb2ec5d33c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -17,8 +17,6 @@
 #include <linux/backing-dev-defs.h>
 #include <linux/slab.h>
 
-int __must_check bdi_init(struct backing_dev_info *bdi);
-
 static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 {
 	kref_get(&bdi->refcnt);
@@ -32,12 +30,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, va_list args);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
 void bdi_unregister(struct backing_dev_info *bdi);
 
-int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_destroy(struct backing_dev_info *bdi);
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
 static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
 {
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3dd175986390..4dcd56947f2a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,8 +12,6 @@
 #include <linux/device.h>
 #include <trace/events/writeback.h>
 
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
 struct backing_dev_info noop_backing_dev_info = {
 	.name		= "noop",
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void)
 }
 postcore_initcall(bdi_class_init);
 
+static int bdi_init(struct backing_dev_info *bdi);
+
 static int __init default_bdi_init(void)
 {
 	int err;
@@ -820,7 +820,7 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
-int bdi_init(struct backing_dev_info *bdi)
+static int bdi_init(struct backing_dev_info *bdi)
 {
 	int ret;
 
@@ -838,7 +838,6 @@ int bdi_init(struct backing_dev_info *bdi)
 
 	return ret;
 }
-EXPORT_SYMBOL(bdi_init);
 
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
 {
@@ -897,12 +896,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 }
 EXPORT_SYMBOL(bdi_register);
 
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-{
-	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
-}
-EXPORT_SYMBOL(bdi_register_dev);
-
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
 {
 	int rc;
@@ -950,13 +943,6 @@ void bdi_unregister(struct backing_dev_info *bdi)
 	}
 }
 
-static void bdi_exit(struct backing_dev_info *bdi)
-{
-	WARN_ON_ONCE(bdi->dev);
-	wb_exit(&bdi->wb);
-	cgwb_bdi_exit(bdi);
-}
-
 static void release_bdi(struct kref *ref)
 {
 	struct backing_dev_info *bdi =
@@ -964,7 +950,9 @@ static void release_bdi(struct kref *ref)
 
 	if (test_bit(WB_registered, &bdi->wb.state))
 		bdi_unregister(bdi);
-	bdi_exit(bdi);
+	WARN_ON_ONCE(bdi->dev);
+	wb_exit(&bdi->wb);
+	cgwb_bdi_exit(bdi);
 	kfree(bdi);
 }
 
@@ -974,38 +962,6 @@ void bdi_put(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_put);
 
-void bdi_destroy(struct backing_dev_info *bdi)
-{
-	bdi_unregister(bdi);
-	bdi_exit(bdi);
-}
-EXPORT_SYMBOL(bdi_destroy);
-
-/*
- * For use from filesystems to quickly init and register a bdi associated
- * with dirty writeback
- */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
-{
-	int err;
-
-	bdi->name = name;
-	bdi->capabilities = 0;
-	err = bdi_init(bdi);
-	if (err)
-		return err;
-
-	err = bdi_register(bdi, NULL, "%.28s-%ld", name,
-			   atomic_long_inc_return(&bdi_seq));
-	if (err) {
-		bdi_destroy(bdi);
-		return err;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(bdi_setup_and_register);
-
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
-- 
cgit v1.3-6-gb490


From 7c4cc30024946dae9530cd6dc0d8d4eb40fca173 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 12 Apr 2017 12:24:49 +0200
Subject: bdi: Drop 'parent' argument from bdi_register[_va]()

Drop 'parent' argument of bdi_register() and bdi_register_va().  It is
always NULL.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/mtd/mtdcore.c       |  2 +-
 fs/super.c                  |  2 +-
 include/linux/backing-dev.h |  9 ++++-----
 mm/backing-dev.c            | 13 +++++--------
 4 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 23e2e56ca54e..1517da3ddd7d 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1782,7 +1782,7 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name)
 	 * We put '-0' suffix to the name to get the same name format as we
 	 * used to get. Since this is called only once, we get a unique name. 
 	 */
-	ret = bdi_register(bdi, NULL, "%.28s-0", name);
+	ret = bdi_register(bdi, "%.28s-0", name);
 	if (ret)
 		bdi_put(bdi);
 
diff --git a/fs/super.c b/fs/super.c
index 8444d26926ef..adb0c0de428c 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1272,7 +1272,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
 	bdi->name = sb->s_type->name;
 
 	va_start(args, fmt);
-	err = bdi_register_va(bdi, NULL, fmt, args);
+	err = bdi_register_va(bdi, fmt, args);
 	va_end(args);
 	if (err) {
 		bdi_put(bdi);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index aaeb2ec5d33c..557d84063934 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -25,11 +25,10 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 
 void bdi_put(struct backing_dev_info *bdi);
 
-__printf(3, 4)
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-		const char *fmt, ...);
-int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
-		const char *fmt, va_list args);
+__printf(2, 3)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
+		    va_list args);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
 void bdi_unregister(struct backing_dev_info *bdi);
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 4dcd56947f2a..f028a9a472fd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -856,15 +856,14 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
 }
 EXPORT_SYMBOL(bdi_alloc_node);
 
-int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
-		const char *fmt, va_list args)
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
 {
 	struct device *dev;
 
 	if (bdi->dev)	/* The driver needs to use separate queues per device */
 		return 0;
 
-	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+	dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
 	if (IS_ERR(dev))
 		return PTR_ERR(dev);
 
@@ -883,14 +882,13 @@ int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
 }
 EXPORT_SYMBOL(bdi_register_va);
 
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-		const char *fmt, ...)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
 {
 	va_list args;
 	int ret;
 
 	va_start(args, fmt);
-	ret = bdi_register_va(bdi, parent, fmt, args);
+	ret = bdi_register_va(bdi, fmt, args);
 	va_end(args);
 	return ret;
 }
@@ -900,8 +898,7 @@ int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
 {
 	int rc;
 
-	rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
-			MINOR(owner->devt));
+	rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
 	if (rc)
 		return rc;
 	/* Leaking owner reference... */
-- 
cgit v1.3-6-gb490


From b7819b9259185dcdcc81eb32182a4dc13d695738 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Apr 2017 16:02:55 +0200
Subject: block: remove the blk_execute_rq return value

The function only returns -EIO if rq->errors is non-zero, which is not
very useful and lets a large number of callers ignore the return value.

Just let the callers figure out their error themselves.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-exec.c                 | 8 +-------
 block/scsi_ioctl.c               | 3 ++-
 drivers/block/virtio_blk.c       | 3 ++-
 drivers/cdrom/cdrom.c            | 3 ++-
 drivers/ide/ide-atapi.c          | 3 ++-
 drivers/ide/ide-cd.c             | 3 ++-
 drivers/ide/ide-cd_ioctl.c       | 3 ++-
 drivers/ide/ide-devsets.c        | 4 ++--
 drivers/ide/ide-disk.c           | 3 +--
 drivers/ide/ide-ioctls.c         | 7 ++++---
 drivers/ide/ide-park.c           | 3 ++-
 drivers/ide/ide-pm.c             | 3 ++-
 drivers/ide/ide-taskfile.c       | 4 ++--
 drivers/scsi/osd/osd_initiator.c | 5 ++++-
 fs/nfsd/blocklayout.c            | 5 +++--
 include/linux/blkdev.h           | 2 +-
 16 files changed, 34 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-exec.c b/block/blk-exec.c
index 8cd0e9bc8dc8..afa383248c7c 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -92,11 +92,10 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
  *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution and wait for completion.
  */
-int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
+void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 		   struct request *rq, int at_head)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
-	int err = 0;
 	unsigned long hang_check;
 
 	rq->end_io_data = &wait;
@@ -108,10 +107,5 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
 		while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
 	else
 		wait_for_completion_io(&wait);
-
-	if (rq->errors)
-		err = -EIO;
-
-	return err;
 }
 EXPORT_SYMBOL(blk_execute_rq);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 82a43bb19967..b1352143f12f 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -547,7 +547,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	scsi_req(rq)->cmd[0] = cmd;
 	scsi_req(rq)->cmd[4] = data;
 	scsi_req(rq)->cmd_len = 6;
-	err = blk_execute_rq(q, bd_disk, rq, 0);
+	blk_execute_rq(q, bd_disk, rq, 0);
+	err = rq->errors ? -EIO : 0;
 	blk_put_request(rq);
 
 	return err;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2d8290169271..eaf99022bdc6 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -310,7 +310,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
 	if (err)
 		goto out;
 
-	err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+	blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+	err = req->errors ? -EIO : 0;
 out:
 	blk_put_request(req);
 	return err;
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 87739649eac2..308501730ab3 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2218,7 +2218,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		rq->timeout = 60 * HZ;
 		bio = rq->bio;
 
-		if (blk_execute_rq(q, cdi->disk, rq, 0)) {
+		blk_execute_rq(q, cdi->disk, rq, 0);
+		if (rq->errors) {
 			struct request_sense *s = req->sense;
 			ret = -EIO;
 			cdi->last_sense = s->sense_key;
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index feb30061123b..1524797e1776 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -107,7 +107,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	memcpy(scsi_req(rq)->cmd, pc->c, 12);
 	if (drive->media == ide_tape)
 		scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
-	error = blk_execute_rq(drive->queue, disk, rq, 0);
+	blk_execute_rq(drive->queue, disk, rq, 0);
+	error = rq->errors ? -EIO : 0;
 put_req:
 	blk_put_request(rq);
 	return error;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 74f1b7dc03f7..95c40afa9120 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -452,7 +452,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 			}
 		}
 
-		error = blk_execute_rq(drive->queue, info->disk, rq, 0);
+		blk_execute_rq(drive->queue, info->disk, rq, 0);
+		error = rq->errors ? -EIO : 0;
 
 		if (buffer)
 			*bufflen = scsi_req(rq)->resid_len;
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 9fcefbc8425e..f1ab726bd430 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -307,7 +307,8 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	scsi_req_init(rq);
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->rq_flags = RQF_QUIET;
-	ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
+	blk_execute_rq(drive->queue, cd->disk, rq, 0);
+	ret = rq->errors ? -EIO : 0;
 	blk_put_request(rq);
 	/*
 	 * A reset will unlock the door. If it was previously locked,
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index a45dda5386e4..eea6a7cb80b5 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -173,8 +173,8 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	*(int *)&scsi_req(rq)->cmd[1] = arg;
 	rq->special = setting->set;
 
-	if (blk_execute_rq(q, NULL, rq, 0))
-		ret = rq->errors;
+	blk_execute_rq(q, NULL, rq, 0);
+	ret = rq->errors;
 	blk_put_request(rq);
 
 	return ret;
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 186159715b71..7c06237f3479 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -470,7 +470,6 @@ ide_devset_get(multcount, mult_count);
 static int set_multcount(ide_drive_t *drive, int arg)
 {
 	struct request *rq;
-	int error;
 
 	if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff))
 		return -EINVAL;
@@ -484,7 +483,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
 
 	drive->mult_req = arg;
 	drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
-	error = blk_execute_rq(drive->queue, NULL, rq, 0);
+	blk_execute_rq(drive->queue, NULL, rq, 0);
 	blk_put_request(rq);
 
 	return (drive->mult_count == arg) ? 0 : -EIO;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 248a3e0ceb46..3e96e531b367 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -128,7 +128,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 		rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
 		scsi_req_init(rq);
 		ide_req(rq)->type = ATA_PRIV_TASKFILE;
-		err = blk_execute_rq(drive->queue, NULL, rq, 0);
+		blk_execute_rq(drive->queue, NULL, rq, 0);
+		err = rq->errors ? -EIO : 0;
 		blk_put_request(rq);
 
 		return err;
@@ -227,8 +228,8 @@ static int generic_drive_reset(ide_drive_t *drive)
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	scsi_req(rq)->cmd_len = 1;
 	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
-	if (blk_execute_rq(drive->queue, NULL, rq, 1))
-		ret = rq->errors;
+	blk_execute_rq(drive->queue, NULL, rq, 1);
+	ret = rq->errors;
 	blk_put_request(rq);
 	return ret;
 }
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 101aed9a61ca..b4f577016f5a 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -37,7 +37,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	scsi_req(rq)->cmd_len = 1;
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->special = &timeout;
-	rc = blk_execute_rq(q, NULL, rq, 1);
+	blk_execute_rq(q, NULL, rq, 1);
+	rc = rq->errors ? -EIO : 0;
 	blk_put_request(rq);
 	if (rc)
 		goto out;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index ec951be4b0c8..bf513f886f3c 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -27,7 +27,8 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 		mesg.event = PM_EVENT_FREEZE;
 	rqpm.pm_state = mesg.event;
 
-	ret = blk_execute_rq(drive->queue, NULL, rq, 0);
+	blk_execute_rq(drive->queue, NULL, rq, 0);
+	ret = rq->errors ? -EIO : 0;
 	blk_put_request(rq);
 
 	if (ret == 0 && ide_port_acpi(hwif)) {
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 4c0007cb74e3..78924c7c9478 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -452,8 +452,8 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	rq->special = cmd;
 	cmd->rq = rq;
 
-	error = blk_execute_rq(drive->queue, NULL, rq, 0);
-
+	blk_execute_rq(drive->queue, NULL, rq, 0);
+	error = rq->errors ? -EIO : 0;
 put_req:
 	blk_put_request(rq);
 	return error;
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 9d0727b2bdec..5eeab7047d1e 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -489,7 +489,10 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
 
 int osd_execute_request(struct osd_request *or)
 {
-	int error = blk_execute_rq(or->request->q, NULL, or->request, 0);
+	int error;
+
+	blk_execute_rq(or->request->q, NULL, or->request, 0);
+	error = or->request->errors ? -EIO : 0;
 
 	_set_error_resid(or, or->request, error);
 	return error;
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 92b4b41d19d2..9f618b77ffee 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 	req->cmd[4] = bufflen & 0xff;
 	req->cmd_len = COMMAND_SIZE(INQUIRY);
 
-	error = blk_execute_rq(rq->q, NULL, rq, 1);
-	if (error) {
+	blk_execute_rq(rq->q, NULL, rq, 1);
+	if (rq->errors) {
 		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
 			rq->errors);
+		error = -EIO;
 		goto out_put_request;
 	}
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 51c9e391798e..e2064ed3c703 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -970,7 +970,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
 			       struct rq_map_data *, const struct iov_iter *,
 			       gfp_t);
-extern int blk_execute_rq(struct request_queue *, struct gendisk *,
+extern void blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
-- 
cgit v1.3-6-gb490


From 17d5363b83f8c73ef9109f75a4a9b578f31d842f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Apr 2017 16:03:01 +0200
Subject: scsi: introduce a result field in struct scsi_request

This passes on the scsi_cmnd result field to users of passthrough
requests.  Currently we abuse req->errors for this purpose, but that
field will go away in its current form.

Note that the old IDE code abuses the errors field in very creative
ways and stores all kinds of different values in it.  I didn't dare
to touch this magic, so the abuses are brought forward 1:1.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bsg-lib.c                    |  8 ++++----
 block/bsg.c                        | 12 +++++------
 block/scsi_ioctl.c                 | 14 ++++++-------
 drivers/block/cciss.c              | 42 +++++++++++++++++++-------------------
 drivers/block/pktcdvd.c            |  2 +-
 drivers/block/virtio_blk.c         |  2 +-
 drivers/cdrom/cdrom.c              |  2 +-
 drivers/ide/ide-atapi.c            | 10 ++++-----
 drivers/ide/ide-cd.c               | 20 +++++++++---------
 drivers/ide/ide-cd_ioctl.c         |  2 +-
 drivers/ide/ide-devsets.c          |  4 ++--
 drivers/ide/ide-dma.c              |  2 +-
 drivers/ide/ide-eh.c               | 36 ++++++++++++++++----------------
 drivers/ide/ide-floppy.c           | 10 ++++-----
 drivers/ide/ide-io.c               | 10 ++++-----
 drivers/ide/ide-ioctls.c           |  4 ++--
 drivers/ide/ide-park.c             |  2 +-
 drivers/ide/ide-pm.c               |  8 ++++----
 drivers/ide/ide-tape.c             |  4 ++--
 drivers/ide/ide-taskfile.c         |  6 +++---
 drivers/scsi/osd/osd_initiator.c   |  4 ++--
 drivers/scsi/osst.c                |  2 +-
 drivers/scsi/qla2xxx/qla_bsg.c     |  6 +++---
 drivers/scsi/scsi_lib.c            | 15 +++++++-------
 drivers/scsi/scsi_transport_sas.c  |  2 +-
 drivers/scsi/sg.c                  |  2 +-
 drivers/scsi/st.c                  |  6 +++---
 drivers/target/target_core_pscsi.c |  2 +-
 fs/nfsd/blocklayout.c              |  4 ++--
 include/linux/ide.h                |  2 +-
 include/scsi/scsi_request.h        |  1 +
 31 files changed, 123 insertions(+), 123 deletions(-)

(limited to 'include/linux')

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index cd15f9dbb147..0a23dbba2d30 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref)
 	struct bsg_job *job = container_of(kref, struct bsg_job, kref);
 	struct request *rq = job->req;
 
-	blk_end_request_all(rq, rq->errors);
+	blk_end_request_all(rq, scsi_req(rq)->result);
 
 	put_device(job->dev);	/* release reference for the request */
 
@@ -74,7 +74,7 @@ void bsg_job_done(struct bsg_job *job, int result,
 	struct scsi_request *rq = scsi_req(req);
 	int err;
 
-	err = job->req->errors = result;
+	err = scsi_req(job->req)->result = result;
 	if (err < 0)
 		/* we're only returning the result field in the reply */
 		rq->sense_len = sizeof(u32);
@@ -177,7 +177,7 @@ failjob_rls_job:
  * @q: request queue to manage
  *
  * On error the create_bsg_job function should return a -Exyz error value
- * that will be set to the req->errors.
+ * that will be set to ->result.
  *
  * Drivers/subsys should pass this to the queue init function.
  */
@@ -201,7 +201,7 @@ static void bsg_request_fn(struct request_queue *q)
 
 		ret = bsg_create_job(dev, req);
 		if (ret) {
-			req->errors = ret;
+			scsi_req(req)->result = ret;
 			blk_end_request_all(req, ret);
 			spin_lock_irq(q->queue_lock);
 			continue;
diff --git a/block/bsg.c b/block/bsg.c
index 74835dbf0c47..d9da1b613ced 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -391,13 +391,13 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	struct scsi_request *req = scsi_req(rq);
 	int ret = 0;
 
-	dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors);
+	dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
 	/*
 	 * fill in all the output members
 	 */
-	hdr->device_status = rq->errors & 0xff;
-	hdr->transport_status = host_byte(rq->errors);
-	hdr->driver_status = driver_byte(rq->errors);
+	hdr->device_status = req->result & 0xff;
+	hdr->transport_status = host_byte(req->result);
+	hdr->driver_status = driver_byte(req->result);
 	hdr->info = 0;
 	if (hdr->device_status || hdr->transport_status || hdr->driver_status)
 		hdr->info |= SG_INFO_CHECK;
@@ -431,8 +431,8 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	 * just a protocol response (i.e. non negative), that gets
 	 * processed above.
 	 */
-	if (!ret && rq->errors < 0)
-		ret = rq->errors;
+	if (!ret && req->result < 0)
+		ret = req->result;
 
 	blk_rq_unmap_user(bio);
 	scsi_req_free_cmd(req);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index b1352143f12f..4a294a5f7fab 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -262,11 +262,11 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
 	/*
 	 * fill in all the output members
 	 */
-	hdr->status = rq->errors & 0xff;
-	hdr->masked_status = status_byte(rq->errors);
-	hdr->msg_status = msg_byte(rq->errors);
-	hdr->host_status = host_byte(rq->errors);
-	hdr->driver_status = driver_byte(rq->errors);
+	hdr->status = req->result & 0xff;
+	hdr->masked_status = status_byte(req->result);
+	hdr->msg_status = msg_byte(req->result);
+	hdr->host_status = host_byte(req->result);
+	hdr->driver_status = driver_byte(req->result);
 	hdr->info = 0;
 	if (hdr->masked_status || hdr->host_status || hdr->driver_status)
 		hdr->info |= SG_INFO_CHECK;
@@ -509,7 +509,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
 
 	blk_execute_rq(q, disk, rq, 0);
 
-	err = rq->errors & 0xff;	/* only 8 bit SCSI status */
+	err = req->result & 0xff;	/* only 8 bit SCSI status */
 	if (err) {
 		if (req->sense_len && req->sense) {
 			bytes = (OMAX_SB_LEN > req->sense_len) ?
@@ -548,7 +548,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 	scsi_req(rq)->cmd[4] = data;
 	scsi_req(rq)->cmd_len = 6;
 	blk_execute_rq(q, bd_disk, rq, 0);
-	err = rq->errors ? -EIO : 0;
+	err = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 
 	return err;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8e1a4554951c..cd375503f7b0 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1864,8 +1864,7 @@ static void cciss_softirq_done(struct request *rq)
 	/* set the residual count for pc requests */
 	if (blk_rq_is_passthrough(rq))
 		scsi_req(rq)->resid_len = c->err_info->ResidualCnt;
-
-	blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
+	blk_end_request_all(rq, scsi_req(rq)->result ? -EIO : 0);
 
 	spin_lock_irqsave(&h->lock, flags);
 	cmd_free(h, c);
@@ -3140,18 +3139,19 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 {
 	int retry_cmd = 0;
 	struct request *rq = cmd->rq;
+	struct scsi_request *sreq = scsi_req(rq);
 
-	rq->errors = 0;
+	sreq->result = 0;
 
 	if (timeout)
-		rq->errors = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
+		sreq->result = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
 
 	if (cmd->err_info->CommandStatus == 0)	/* no error has occurred */
 		goto after_error_processing;
 
 	switch (cmd->err_info->CommandStatus) {
 	case CMD_TARGET_STATUS:
-		rq->errors = evaluate_target_status(h, cmd, &retry_cmd);
+		sreq->result = evaluate_target_status(h, cmd, &retry_cmd);
 		break;
 	case CMD_DATA_UNDERRUN:
 		if (!blk_rq_is_passthrough(cmd->rq)) {
@@ -3169,7 +3169,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 	case CMD_INVALID:
 		dev_warn(&h->pdev->dev, "cciss: cmd %p is "
 		       "reported invalid\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
@@ -3177,7 +3177,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 	case CMD_PROTOCOL_ERR:
 		dev_warn(&h->pdev->dev, "cciss: cmd %p has "
 		       "protocol error\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
@@ -3185,7 +3185,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 	case CMD_HARDWARE_ERR:
 		dev_warn(&h->pdev->dev, "cciss: cmd %p had "
 		       " hardware error\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
@@ -3193,7 +3193,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 	case CMD_CONNECTION_LOST:
 		dev_warn(&h->pdev->dev, "cciss: cmd %p had "
 		       "connection lost\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
@@ -3201,7 +3201,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 	case CMD_ABORTED:
 		dev_warn(&h->pdev->dev, "cciss: cmd %p was "
 		       "aborted\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ABORT);
@@ -3209,7 +3209,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 	case CMD_ABORT_FAILED:
 		dev_warn(&h->pdev->dev, "cciss: cmd %p reports "
 		       "abort failed\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
@@ -3224,21 +3224,21 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		} else
 			dev_warn(&h->pdev->dev,
 				"%p retried too many times\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ABORT);
 		break;
 	case CMD_TIMEOUT:
 		dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
 		break;
 	case CMD_UNABORTABLE:
 		dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
@@ -3247,7 +3247,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 		dev_warn(&h->pdev->dev, "cmd %p returned "
 		       "unknown status %x\n", cmd,
 		       cmd->err_info->CommandStatus);
-		rq->errors = make_status_bytes(SAM_STAT_GOOD,
+		sreq->result = make_status_bytes(SAM_STAT_GOOD,
 			cmd->err_info->CommandStatus, DRIVER_OK,
 			blk_rq_is_passthrough(cmd->rq) ?
 				DID_PASSTHROUGH : DID_ERROR);
@@ -3380,9 +3380,9 @@ static void do_cciss_request(struct request_queue *q)
 		if (dma_mapping_error(&h->pdev->dev, temp64.val)) {
 			dev_warn(&h->pdev->dev,
 				"%s: error mapping page for DMA\n", __func__);
-			creq->errors = make_status_bytes(SAM_STAT_GOOD,
-							0, DRIVER_OK,
-							DID_SOFT_ERROR);
+			scsi_req(creq)->result =
+				make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK,
+						  DID_SOFT_ERROR);
 			cmd_free(h, c);
 			return;
 		}
@@ -3395,9 +3395,9 @@ static void do_cciss_request(struct request_queue *q)
 		if (cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex],
 			(seg - (h->max_cmd_sgentries - 1)) *
 				sizeof(SGDescriptor_struct))) {
-			creq->errors = make_status_bytes(SAM_STAT_GOOD,
-							0, DRIVER_OK,
-							DID_SOFT_ERROR);
+			scsi_req(creq)->result =
+				make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK,
+						  DID_SOFT_ERROR);
 			cmd_free(h, c);
 			return;
 		}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 66d846ba85a9..205b865ebeb9 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -724,7 +724,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
 		rq->rq_flags |= RQF_QUIET;
 
 	blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
-	if (rq->errors)
+	if (scsi_req(rq)->result)
 		ret = -EIO;
 out:
 	blk_put_request(rq);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8378ad480f77..dea2a58d6734 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -119,7 +119,7 @@ static inline void virtblk_scsi_request_done(struct request *req)
 
 	sreq->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
 	sreq->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
-	req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
+	sreq->result = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
 }
 
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 308501730ab3..76c952fd9ab9 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2219,7 +2219,7 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		bio = rq->bio;
 
 		blk_execute_rq(q, cdi->disk, rq, 0);
-		if (rq->errors) {
+		if (scsi_req(rq)->result) {
 			struct request_sense *s = req->sense;
 			ret = -EIO;
 			cdi->last_sense = s->sense_key;
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 1524797e1776..5901937284e7 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -108,7 +108,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	if (drive->media == ide_tape)
 		scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
 	blk_execute_rq(drive->queue, disk, rq, 0);
-	error = rq->errors ? -EIO : 0;
+	error = scsi_req(rq)->result ? -EIO : 0;
 put_req:
 	blk_put_request(rq);
 	return error;
@@ -455,7 +455,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			debug_log("%s: I/O error\n", drive->name);
 
 			if (drive->media != ide_tape)
-				pc->rq->errors++;
+				scsi_req(pc->rq)->result++;
 
 			if (scsi_req(rq)->cmd[0] == REQUEST_SENSE) {
 				printk(KERN_ERR PFX "%s: I/O error in request "
@@ -489,13 +489,13 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			drive->failed_pc = NULL;
 
 		if (ata_misc_request(rq)) {
-			rq->errors = 0;
+			scsi_req(rq)->result = 0;
 			error = 0;
 		} else {
 
 			if (blk_rq_is_passthrough(rq) && uptodate <= 0) {
-				if (rq->errors == 0)
-					rq->errors = -EIO;
+				if (scsi_req(rq)->result == 0)
+					scsi_req(rq)->result = -EIO;
 			}
 
 			error = uptodate ? 0 : -EIO;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 95c40afa9120..07e5ff3a64c3 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -247,10 +247,10 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
 
 	struct cdrom_info *info = drive->driver_data;
 
-	if (!rq->errors)
+	if (!scsi_req(rq)->result)
 		info->write_timeout = jiffies +	ATAPI_WAIT_WRITE_BUSY;
 
-	rq->errors = 1;
+	scsi_req(rq)->result = 1;
 
 	if (time_after(jiffies, info->write_timeout))
 		return 0;
@@ -294,8 +294,8 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 	}
 
 	/* if we have an error, pass CHECK_CONDITION as the SCSI status byte */
-	if (blk_rq_is_scsi(rq) && !rq->errors)
-		rq->errors = SAM_STAT_CHECK_CONDITION;
+	if (blk_rq_is_scsi(rq) && !scsi_req(rq)->result)
+		scsi_req(rq)->result = SAM_STAT_CHECK_CONDITION;
 
 	if (blk_noretry_request(rq))
 		do_end_request = 1;
@@ -325,7 +325,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 		 * Arrange to retry the request but be sure to give up if we've
 		 * retried too many times.
 		 */
-		if (++rq->errors > ERROR_MAX)
+		if (++scsi_req(rq)->result > ERROR_MAX)
 			do_end_request = 1;
 		break;
 	case ILLEGAL_REQUEST:
@@ -372,7 +372,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 			/* go to the default handler for other errors */
 			ide_error(drive, "cdrom_decode_status", stat);
 			return 1;
-		} else if (++rq->errors > ERROR_MAX)
+		} else if (++scsi_req(rq)->result > ERROR_MAX)
 			/* we've racked up too many retries, abort */
 			do_end_request = 1;
 	}
@@ -453,7 +453,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		}
 
 		blk_execute_rq(drive->queue, info->disk, rq, 0);
-		error = rq->errors ? -EIO : 0;
+		error = scsi_req(rq)->result ? -EIO : 0;
 
 		if (buffer)
 			*bufflen = scsi_req(rq)->resid_len;
@@ -684,8 +684,8 @@ out_end:
 			if (cmd->nleft == 0)
 				uptodate = 1;
 		} else {
-			if (uptodate <= 0 && rq->errors == 0)
-				rq->errors = -EIO;
+			if (uptodate <= 0 && scsi_req(rq)->result == 0)
+				scsi_req(rq)->result = -EIO;
 		}
 
 		if (uptodate == 0 && rq->bio)
@@ -1380,7 +1380,7 @@ static int ide_cdrom_prep_pc(struct request *rq)
 	 * appropriate action
 	 */
 	if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
-		rq->errors = ILLEGAL_REQUEST;
+		scsi_req(rq)->result = ILLEGAL_REQUEST;
 		return BLKPREP_KILL;
 	}
 
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index f1ab726bd430..55cd736c39c6 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -308,7 +308,7 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->rq_flags = RQF_QUIET;
 	blk_execute_rq(drive->queue, cd->disk, rq, 0);
-	ret = rq->errors ? -EIO : 0;
+	ret = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 	/*
 	 * A reset will unlock the door. If it was previously locked,
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index eea6a7cb80b5..b1223234037d 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -174,7 +174,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	rq->special = setting->set;
 
 	blk_execute_rq(q, NULL, rq, 0);
-	ret = rq->errors;
+	ret = scsi_req(rq)->result;
 	blk_put_request(rq);
 
 	return ret;
@@ -186,7 +186,7 @@ ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
 
 	err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
 	if (err)
-		rq->errors = err;
+		scsi_req(rq)->result = err;
 	ide_complete_rq(drive, err, blk_rq_bytes(rq));
 	return ide_stopped;
 }
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 17a65ac56491..51c81223e56d 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -490,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 	 * make sure request is sane
 	 */
 	if (hwif->rq)
-		hwif->rq->errors = 0;
+		scsi_req(hwif->rq)->result = 0;
 	return ret;
 }
 
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index cf3af6840368..4b7ffd7d158d 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -12,7 +12,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
 	if ((stat & ATA_BUSY) ||
 	    ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
 		/* other bits are useless when BUSY */
-		rq->errors |= ERROR_RESET;
+		scsi_req(rq)->result |= ERROR_RESET;
 	} else if (stat & ATA_ERR) {
 		/* err has different meaning on cdrom and tape */
 		if (err == ATA_ABORTED) {
@@ -25,10 +25,10 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
 			drive->crc_count++;
 		} else if (err & (ATA_BBK | ATA_UNC)) {
 			/* retries won't help these */
-			rq->errors = ERROR_MAX;
+			scsi_req(rq)->result = ERROR_MAX;
 		} else if (err & ATA_TRK0NF) {
 			/* help it find track zero */
-			rq->errors |= ERROR_RECAL;
+			scsi_req(rq)->result |= ERROR_RECAL;
 		}
 	}
 
@@ -39,23 +39,23 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
 		ide_pad_transfer(drive, READ, nsect * SECTOR_SIZE);
 	}
 
-	if (rq->errors >= ERROR_MAX || blk_noretry_request(rq)) {
+	if (scsi_req(rq)->result >= ERROR_MAX || blk_noretry_request(rq)) {
 		ide_kill_rq(drive, rq);
 		return ide_stopped;
 	}
 
 	if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
-		rq->errors |= ERROR_RESET;
+		scsi_req(rq)->result |= ERROR_RESET;
 
-	if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
-		++rq->errors;
+	if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
+		++scsi_req(rq)->result;
 		return ide_do_reset(drive);
 	}
 
-	if ((rq->errors & ERROR_RECAL) == ERROR_RECAL)
+	if ((scsi_req(rq)->result & ERROR_RECAL) == ERROR_RECAL)
 		drive->special_flags |= IDE_SFLAG_RECALIBRATE;
 
-	++rq->errors;
+	++scsi_req(rq)->result;
 
 	return ide_stopped;
 }
@@ -68,7 +68,7 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
 	if ((stat & ATA_BUSY) ||
 	    ((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
 		/* other bits are useless when BUSY */
-		rq->errors |= ERROR_RESET;
+		scsi_req(rq)->result |= ERROR_RESET;
 	} else {
 		/* add decoding error stuff */
 	}
@@ -77,14 +77,14 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
 		/* force an abort */
 		hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE);
 
-	if (rq->errors >= ERROR_MAX) {
+	if (scsi_req(rq)->result >= ERROR_MAX) {
 		ide_kill_rq(drive, rq);
 	} else {
-		if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
-			++rq->errors;
+		if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
+			++scsi_req(rq)->result;
 			return ide_do_reset(drive);
 		}
-		++rq->errors;
+		++scsi_req(rq)->result;
 	}
 
 	return ide_stopped;
@@ -130,11 +130,11 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
 			if (cmd)
 				ide_complete_cmd(drive, cmd, stat, err);
 		} else if (ata_pm_request(rq)) {
-			rq->errors = 1;
+			scsi_req(rq)->result = 1;
 			ide_complete_pm_rq(drive, rq);
 			return ide_stopped;
 		}
-		rq->errors = err;
+		scsi_req(rq)->result = err;
 		ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq));
 		return ide_stopped;
 	}
@@ -149,8 +149,8 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
 
 	if (rq && ata_misc_request(rq) &&
 	    scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) {
-		if (err <= 0 && rq->errors == 0)
-			rq->errors = -EIO;
+		if (err <= 0 && scsi_req(rq)->result == 0)
+			scsi_req(rq)->result = -EIO;
 		ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq));
 	}
 }
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index a69e8013f1df..8ac6048cd2df 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -98,7 +98,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
 	}
 
 	if (ata_misc_request(rq))
-		rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
+		scsi_req(rq)->result = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
 
 	return uptodate;
 }
@@ -239,7 +239,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 					? rq->rq_disk->disk_name
 					: "dev?"));
 
-	if (rq->errors >= ERROR_MAX) {
+	if (scsi_req(rq)->result >= ERROR_MAX) {
 		if (drive->failed_pc) {
 			ide_floppy_report_error(floppy, drive->failed_pc);
 			drive->failed_pc = NULL;
@@ -247,7 +247,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 			printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
 
 		if (ata_misc_request(rq)) {
-			rq->errors = 0;
+			scsi_req(rq)->result = 0;
 			ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 			return ide_stopped;
 		} else
@@ -301,8 +301,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 	return ide_floppy_issue_pc(drive, &cmd, pc);
 out_end:
 	drive->failed_pc = NULL;
-	if (blk_rq_is_passthrough(rq) && rq->errors == 0)
-		rq->errors = -EIO;
+	if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
+		scsi_req(rq)->result = -EIO;
 	ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
 	return ide_stopped;
 }
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 043b1fb963cb..45b3f41a43d4 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -141,12 +141,12 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq)
 	drive->failed_pc = NULL;
 
 	if ((media == ide_floppy || media == ide_tape) && drv_req) {
-		rq->errors = 0;
+		scsi_req(rq)->result = 0;
 	} else {
 		if (media == ide_tape)
-			rq->errors = IDE_DRV_ERROR_GENERAL;
-		else if (blk_rq_is_passthrough(rq) && rq->errors == 0)
-			rq->errors = -EIO;
+			scsi_req(rq)->result = IDE_DRV_ERROR_GENERAL;
+		else if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
+			scsi_req(rq)->result = -EIO;
 	}
 
 	ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
@@ -271,7 +271,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
 #ifdef DEBUG
  	printk("%s: DRIVE_CMD (null)\n", drive->name);
 #endif
-	rq->errors = 0;
+	scsi_req(rq)->result = 0;
 	ide_complete_rq(drive, 0, blk_rq_bytes(rq));
 
  	return ide_stopped;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 3e96e531b367..8c0d17297a7a 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -129,7 +129,7 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
 		scsi_req_init(rq);
 		ide_req(rq)->type = ATA_PRIV_TASKFILE;
 		blk_execute_rq(drive->queue, NULL, rq, 0);
-		err = rq->errors ? -EIO : 0;
+		err = scsi_req(rq)->result ? -EIO : 0;
 		blk_put_request(rq);
 
 		return err;
@@ -229,7 +229,7 @@ static int generic_drive_reset(ide_drive_t *drive)
 	scsi_req(rq)->cmd_len = 1;
 	scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
 	blk_execute_rq(drive->queue, NULL, rq, 1);
-	ret = rq->errors;
+	ret = scsi_req(rq)->result;
 	blk_put_request(rq);
 	return ret;
 }
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index b4f577016f5a..94e3107f59b9 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -38,7 +38,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	ide_req(rq)->type = ATA_PRIV_MISC;
 	rq->special = &timeout;
 	blk_execute_rq(q, NULL, rq, 1);
-	rc = rq->errors ? -EIO : 0;
+	rc = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 	if (rc)
 		goto out;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index bf513f886f3c..277c2bb7616f 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -28,7 +28,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	rqpm.pm_state = mesg.event;
 
 	blk_execute_rq(drive->queue, NULL, rq, 0);
-	ret = rq->errors ? -EIO : 0;
+	ret = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
 
 	if (ret == 0 && ide_port_acpi(hwif)) {
@@ -56,8 +56,8 @@ static int ide_pm_execute_rq(struct request *rq)
 	spin_lock_irq(q->queue_lock);
 	if (unlikely(blk_queue_dying(q))) {
 		rq->rq_flags |= RQF_QUIET;
-		rq->errors = -ENXIO;
-		__blk_end_request_all(rq, rq->errors);
+		scsi_req(rq)->result = -ENXIO;
+		__blk_end_request_all(rq, scsi_req(rq)->result);
 		spin_unlock_irq(q->queue_lock);
 		return -ENXIO;
 	}
@@ -67,7 +67,7 @@ static int ide_pm_execute_rq(struct request *rq)
 
 	wait_for_completion_io(&wait);
 
-	return rq->errors ? -EIO : 0;
+	return scsi_req(rq)->result ? -EIO : 0;
 }
 
 int generic_ide_resume(struct device *dev)
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d8a552b47718..a0651f948b76 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -366,7 +366,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 			err = pc->error;
 		}
 	}
-	rq->errors = err;
+	scsi_req(rq)->result = err;
 
 	return uptodate;
 }
@@ -879,7 +879,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 		tape->valid = 0;
 
 	ret = size;
-	if (rq->errors == IDE_DRV_ERROR_GENERAL)
+	if (scsi_req(rq)->result == IDE_DRV_ERROR_GENERAL)
 		ret = -EIO;
 out_put:
 	blk_put_request(rq);
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 78924c7c9478..d71199d23c9e 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -287,7 +287,7 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd,
 	u8 saved_io_32bit = drive->io_32bit;
 
 	if (cmd->tf_flags & IDE_TFLAG_FS)
-		cmd->rq->errors = 0;
+		scsi_req(cmd->rq)->result = 0;
 
 	if (cmd->tf_flags & IDE_TFLAG_IO_16BIT)
 		drive->io_32bit = 0;
@@ -329,7 +329,7 @@ void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat)
 	u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER);
 
 	ide_complete_cmd(drive, cmd, stat, err);
-	rq->errors = err;
+	scsi_req(rq)->result = err;
 
 	if (err == 0 && set_xfer) {
 		ide_set_xfer_rate(drive, nsect);
@@ -453,7 +453,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	cmd->rq = rq;
 
 	blk_execute_rq(drive->queue, NULL, rq, 0);
-	error = rq->errors ? -EIO : 0;
+	error = scsi_req(rq)->result ? -EIO : 0;
 put_req:
 	blk_put_request(rq);
 	return error;
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 5eeab7047d1e..8a1b94816419 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -477,7 +477,7 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
 			     int error)
 {
 	or->async_error = error;
-	or->req_errors = req->errors ? : error;
+	or->req_errors = scsi_req(req)->result ? : error;
 	or->sense_len = scsi_req(req)->sense_len;
 	if (or->sense_len)
 		memcpy(or->sense, scsi_req(req)->sense, or->sense_len);
@@ -492,7 +492,7 @@ int osd_execute_request(struct osd_request *or)
 	int error;
 
 	blk_execute_rq(or->request->q, NULL, or->request, 0);
-	error = or->request->errors ? -EIO : 0;
+	error = scsi_req(or->request)->result ? -EIO : 0;
 
 	_set_error_resid(or, or->request, error);
 	return error;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 41bc1d64bf86..67cbed92f07d 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -327,7 +327,7 @@ static void osst_end_async(struct request *req, int update)
 	struct osst_tape *STp = SRpnt->stp;
 	struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data;
 
-	STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
+	STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result;
 #if DEBUG
 	STp->write_pending = 0;
 #endif
diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c
index 84c9098cc089..b6e40fd4c3c1 100644
--- a/drivers/scsi/qla2xxx/qla_bsg.c
+++ b/drivers/scsi/qla2xxx/qla_bsg.c
@@ -2553,13 +2553,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
 						ql_log(ql_log_warn, vha, 0x7089,
 						    "mbx abort_command "
 						    "failed.\n");
-						bsg_job->req->errors =
+						scsi_req(bsg_job->req)->result =
 						bsg_reply->result = -EIO;
 					} else {
 						ql_dbg(ql_dbg_user, vha, 0x708a,
 						    "mbx abort_command "
 						    "success.\n");
-						bsg_job->req->errors =
+						scsi_req(bsg_job->req)->result =
 						bsg_reply->result = 0;
 					}
 					spin_lock_irqsave(&ha->hardware_lock, flags);
@@ -2570,7 +2570,7 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
 	}
 	spin_unlock_irqrestore(&ha->hardware_lock, flags);
 	ql_log(ql_log_info, vha, 0x708b, "SRB not found to abort.\n");
-	bsg_job->req->errors = bsg_reply->result = -ENXIO;
+	scsi_req(bsg_job->req)->result = bsg_reply->result = -ENXIO;
 	return 0;
 
 done:
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7bc4513bf4e4..b9298a499e19 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -229,8 +229,8 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
  * @rq_flags:	flags for ->rq_flags
  * @resid:	optional residual length
  *
- * returns the req->errors value which is the scsi_cmnd result
- * field.
+ * Returns the scsi_cmnd result field if a command was executed, or a negative
+ * Linux error code if we didn't get that far.
  */
 int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 		 int data_direction, void *buffer, unsigned bufflen,
@@ -281,7 +281,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 		memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
 	if (sshdr)
 		scsi_normalize_sense(rq->sense, rq->sense_len, sshdr);
-	ret = req->errors;
+	ret = rq->result;
  out:
 	blk_put_request(req);
 
@@ -797,8 +797,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		/*
 		 * __scsi_error_from_host_byte may have reset the host_byte
 		 */
-		req->errors = cmd->result;
-
+		scsi_req(req)->result = cmd->result;
 		scsi_req(req)->resid_len = scsi_get_resid(cmd);
 
 		if (scsi_bidi_cmnd(cmd)) {
@@ -835,7 +834,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 	/*
 	 * Recovered errors need reporting, but they're always treated as
 	 * success, so fiddle the result code here.  For passthrough requests
-	 * we already took a copy of the original into rq->errors which
+	 * we already took a copy of the original into sreq->result which
 	 * is what gets returned to the user
 	 */
 	if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) {
@@ -1281,7 +1280,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret)
 	switch (ret) {
 	case BLKPREP_KILL:
 	case BLKPREP_INVALID:
-		req->errors = DID_NO_CONNECT << 16;
+		scsi_req(req)->result = DID_NO_CONNECT << 16;
 		/* release the command and kill it */
 		if (req->special) {
 			struct scsi_cmnd *cmd = req->special;
@@ -1905,7 +1904,7 @@ static int scsi_mq_prep_fn(struct request *req)
 static void scsi_mq_done(struct scsi_cmnd *cmd)
 {
 	trace_scsi_dispatch_cmd_done(cmd);
-	blk_mq_complete_request(cmd->request, cmd->request->errors);
+	blk_mq_complete_request(cmd->request, 0);
 }
 
 static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index cdbb293aca08..a2b279737a4b 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -184,7 +184,7 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
 				blk_rq_bytes(req->next_rq);
 		handler = to_sas_internal(shost->transportt)->f->smp_handler;
 		ret = handler(shost, rphy, req);
-		req->errors = ret;
+		scsi_req(req)->result = ret;
 
 		blk_end_request_all(req, ret);
 
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index b61cc3c512d3..90ee9d926deb 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1298,7 +1298,7 @@ sg_rq_end_io(struct request *rq, int uptodate)
 		pr_info("%s: device detaching\n", __func__);
 
 	sense = req->sense;
-	result = rq->errors;
+	result = req->result;
 	resid = req->resid_len;
 
 	SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp,
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 5408643431bb..1ea34d6f5437 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -480,7 +480,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
 		atomic64_add(ktime_to_ns(now), &STp->stats->tot_write_time);
 		atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
 		atomic64_inc(&STp->stats->write_cnt);
-		if (req->errors) {
+		if (scsi_req(req)->result) {
 			atomic64_add(atomic_read(&STp->stats->last_write_size)
 				- STp->buffer->cmdstat.residual,
 				&STp->stats->write_byte_cnt);
@@ -494,7 +494,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
 		atomic64_add(ktime_to_ns(now), &STp->stats->tot_read_time);
 		atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
 		atomic64_inc(&STp->stats->read_cnt);
-		if (req->errors) {
+		if (scsi_req(req)->result) {
 			atomic64_add(atomic_read(&STp->stats->last_read_size)
 				- STp->buffer->cmdstat.residual,
 				&STp->stats->read_byte_cnt);
@@ -518,7 +518,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate)
 	struct scsi_tape *STp = SRpnt->stp;
 	struct bio *tmp;
 
-	STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
+	STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result;
 	STp->buffer->cmdstat.residual = rq->resid_len;
 
 	st_do_stats(STp, req);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index c7fa372c527a..a93d94e68ab5 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1050,7 +1050,7 @@ static void pscsi_req_done(struct request *req, int uptodate)
 	struct se_cmd *cmd = req->end_io_data;
 	struct pscsi_plugin_task *pt = cmd->priv;
 
-	pt->pscsi_result = req->errors;
+	pt->pscsi_result = scsi_req(req)->result;
 	pt->pscsi_resid = scsi_req(req)->resid_len;
 
 	cmd->scsi_status = status_byte(pt->pscsi_result) << 1;
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 9f618b77ffee..fb5213afc854 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -243,9 +243,9 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 	req->cmd_len = COMMAND_SIZE(INQUIRY);
 
 	blk_execute_rq(rq->q, NULL, rq, 1);
-	if (rq->errors) {
+	if (req->result) {
 		pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
-			rq->errors);
+			req->result);
 		error = -EIO;
 		goto out_put_request;
 	}
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 2f51c1724b5a..6980ca322074 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -88,7 +88,7 @@ static inline bool ata_pm_request(struct request *rq)
 		 ide_req(rq)->type == ATA_PRIV_PM_RESUME);
 }
 
-/* Error codes returned in rq->errors to the higher part of the driver. */
+/* Error codes returned in result to the higher part of the driver. */
 enum {
 	IDE_DRV_ERROR_GENERAL	= 101,
 	IDE_DRV_ERROR_FILEMARK	= 102,
diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h
index 7c583a0f363a..f0c76f9dc285 100644
--- a/include/scsi/scsi_request.h
+++ b/include/scsi/scsi_request.h
@@ -9,6 +9,7 @@ struct scsi_request {
 	unsigned char	__cmd[BLK_MAX_CDB];
 	unsigned char	*cmd;
 	unsigned short	cmd_len;
+	int		result;
 	unsigned int	sense_len;
 	unsigned int	resid_len;	/* residual count */
 	int		retries;
-- 
cgit v1.3-6-gb490


From 08e0029aa2a4acdd365613ce88a1184e5351a8a1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Apr 2017 16:03:09 +0200
Subject: blk-mq: remove the error argument to blk_mq_complete_request

Now that all drivers that call blk_mq_complete_requests have a
->complete callback we can remove the direct call to blk_mq_end_request,
as well as the error argument to blk_mq_complete_request.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c                    | 15 +++------------
 drivers/block/loop.c              |  4 ++--
 drivers/block/mtip32xx/mtip32xx.c |  4 ++--
 drivers/block/nbd.c               |  4 ++--
 drivers/block/null_blk.c          |  2 +-
 drivers/block/virtio_blk.c        |  2 +-
 drivers/block/xen-blkfront.c      |  2 +-
 drivers/md/dm-rq.c                |  2 +-
 drivers/nvme/host/core.c          |  2 +-
 drivers/nvme/host/nvme.h          |  2 +-
 drivers/scsi/scsi_lib.c           |  2 +-
 include/linux/blk-mq.h            |  2 +-
 12 files changed, 17 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c496692ecc5b..3a2d179d49d6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -442,17 +442,10 @@ static void blk_mq_stat_add(struct request *rq)
 
 static void __blk_mq_complete_request(struct request *rq)
 {
-	struct request_queue *q = rq->q;
-
 	if (rq->internal_tag != -1)
 		blk_mq_sched_completed_request(rq);
-
 	blk_mq_stat_add(rq);
-
-	if (!q->softirq_done_fn)
-		blk_mq_end_request(rq, rq->errors);
-	else
-		blk_mq_ipi_complete_request(rq);
+	blk_mq_ipi_complete_request(rq);
 }
 
 /**
@@ -463,16 +456,14 @@ static void __blk_mq_complete_request(struct request *rq)
  *	Ends all I/O on a request. It does not handle partial completions.
  *	The actual completion happens out-of-order, through a IPI handler.
  **/
-void blk_mq_complete_request(struct request *rq, int error)
+void blk_mq_complete_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
 	if (unlikely(blk_should_fake_timeout(q)))
 		return;
-	if (!blk_mark_rq_complete(rq)) {
-		rq->errors = error;
+	if (!blk_mark_rq_complete(rq))
 		__blk_mq_complete_request(rq);
-	}
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 86351b3f7350..994403efee19 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -465,7 +465,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
 
 	cmd->ret = ret;
-	blk_mq_complete_request(cmd->rq, 0);
+	blk_mq_complete_request(cmd->rq);
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -1685,7 +1685,7 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
 	/* complete non-aio request */
 	if (!cmd->use_aio || ret) {
 		cmd->ret = ret ? -EIO : 0;
-		blk_mq_complete_request(cmd->rq, 0);
+		blk_mq_complete_request(cmd->rq);
 	}
 }
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 7406de29db58..66a6bd83faae 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -242,7 +242,7 @@ static void mtip_async_complete(struct mtip_port *port,
 	rq = mtip_rq_from_tag(dd, tag);
 
 	cmd->status = status;
-	blk_mq_complete_request(rq, 0);
+	blk_mq_complete_request(rq);
 }
 
 /*
@@ -4109,7 +4109,7 @@ static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
 
 	if (likely(!reserv)) {
 		cmd->status = -ENODEV;
-		blk_mq_complete_request(rq, 0);
+		blk_mq_complete_request(rq);
 	} else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) {
 
 		cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 09a74a66beb1..d387bef07fcc 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -635,7 +635,7 @@ static void recv_work(struct work_struct *work)
 			break;
 		}
 
-		blk_mq_complete_request(blk_mq_rq_from_pdu(cmd), 0);
+		blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
 	}
 	atomic_dec(&config->recv_threads);
 	wake_up(&config->recv_wq);
@@ -651,7 +651,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
 		return;
 	cmd = blk_mq_rq_to_pdu(req);
 	cmd->status = -EIO;
-	blk_mq_complete_request(req, 0);
+	blk_mq_complete_request(req);
 }
 
 static void nbd_clear_que(struct nbd_device *nbd)
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 0ca4aa34edb9..d946e1eeac8e 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -281,7 +281,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
 	case NULL_IRQ_SOFTIRQ:
 		switch (queue_mode)  {
 		case NULL_Q_MQ:
-			blk_mq_complete_request(cmd->rq, 0);
+			blk_mq_complete_request(cmd->rq);
 			break;
 		case NULL_Q_RQ:
 			blk_complete_request(cmd->rq);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index dea2a58d6734..f94614257462 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -201,7 +201,7 @@ static void virtblk_done(struct virtqueue *vq)
 		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
 			struct request *req = blk_mq_rq_from_pdu(vbr);
 
-			blk_mq_complete_request(req, 0);
+			blk_mq_complete_request(req);
 			req_done = true;
 		}
 		if (unlikely(virtqueue_is_broken(vq)))
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 57866355c060..39459631667c 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1647,7 +1647,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 			BUG();
 		}
 
-		blk_mq_complete_request(req, 0);
+		blk_mq_complete_request(req);
 	}
 
 	rinfo->ring.rsp_cons = i;
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 1173be21f6f6..bff7e3bdb4ed 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -363,7 +363,7 @@ static void dm_complete_request(struct request *rq, int error)
 	if (!rq->q->mq_ops)
 		blk_complete_request(rq);
 	else
-		blk_mq_complete_request(rq, 0);
+		blk_mq_complete_request(rq);
 }
 
 /*
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 805f250315ec..8dc664798293 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -117,7 +117,7 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
 	if (blk_queue_dying(req->q))
 		status |= NVME_SC_DNR;
 	nvme_req(req)->status = status;
-	blk_mq_complete_request(req, 0);
+	blk_mq_complete_request(req);
 
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 550037f5efea..c6ef6c30e2f0 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -251,7 +251,7 @@ static inline void nvme_end_request(struct request *req, __le16 status,
 
 	rq->status = le16_to_cpu(status) >> 1;
 	rq->result = result;
-	blk_mq_complete_request(req, 0);
+	blk_mq_complete_request(req);
 }
 
 void nvme_complete_rq(struct request *req);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b9298a499e19..4a20e6098f7c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1904,7 +1904,7 @@ static int scsi_mq_prep_fn(struct request *req)
 static void scsi_mq_done(struct scsi_cmnd *cmd)
 {
 	trace_scsi_dispatch_cmd_done(cmd);
-	blk_mq_complete_request(cmd->request, 0);
+	blk_mq_complete_request(cmd->request);
 }
 
 static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d75de612845d..0c4dadb85f62 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -228,7 +228,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
 void blk_mq_abort_requeue_list(struct request_queue *q);
-void blk_mq_complete_request(struct request *rq, int error);
+void blk_mq_complete_request(struct request *rq);
 
 bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
-- 
cgit v1.3-6-gb490


From e26738e037f34aedfe05e412f442833f44f4a6e5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Apr 2017 16:03:11 +0200
Subject: block: add a error_count field to struct request

This is for the legacy floppy and ataflop drivers that currently abuse
->errors for this purpose.  It's stashed away in a union to not grow
the struct size, the other fields are either used by modern drivers
for different purposes or the I/O scheduler before queing the I/O
to drivers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e2064ed3c703..a3dcee624de3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -175,6 +175,7 @@ struct request {
 		struct rb_node rb_node;	/* sort/lookup */
 		struct bio_vec special_vec;
 		void *completion_data;
+		int error_count; /* for legacy drivers, don't use */
 	};
 
 	/*
-- 
cgit v1.3-6-gb490


From caf7df12272118e0274c8353bcfeaf60c7743a47 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Apr 2017 16:03:16 +0200
Subject: block: remove the errors field from struct request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c             | 14 +-------------
 block/blk-exec.c             |  3 +--
 block/blk-mq.c               | 10 +++-------
 block/blk-timeout.c          |  1 -
 include/linux/blkdev.h       |  2 --
 include/trace/events/block.h | 17 +++++++----------
 kernel/trace/blktrace.c      | 26 ++++++++++++--------------
 7 files changed, 24 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 25aea293ee98..a49b0830aaaf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1635,7 +1635,6 @@ void blk_init_request_from_bio(struct request *req, struct bio *bio)
 	if (bio->bi_opf & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 
-	req->errors = 0;
 	req->__sector = bio->bi_iter.bi_sector;
 	if (ioprio_valid(bio_prio(bio)))
 		req->ioprio = bio_prio(bio);
@@ -2573,22 +2572,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 {
 	int total_bytes;
 
-	trace_block_rq_complete(req->q, req, nr_bytes);
+	trace_block_rq_complete(req, error, nr_bytes);
 
 	if (!req->bio)
 		return false;
 
-	/*
-	 * For fs requests, rq is just carrier of independent bio's
-	 * and each partial completion should be handled separately.
-	 * Reset per-request error on each partial completion.
-	 *
-	 * TODO: tj: This is too subtle.  It would be better to let
-	 * low level drivers do what they see fit.
-	 */
-	if (!blk_rq_is_passthrough(req))
-		req->errors = 0;
-
 	if (error && !blk_rq_is_passthrough(req) &&
 	    !(req->rq_flags & RQF_QUIET)) {
 		char *error_type;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index afa383248c7c..a9451e3b8587 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -69,8 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 
 	if (unlikely(blk_queue_dying(q))) {
 		rq->rq_flags |= RQF_QUIET;
-		rq->errors = -ENXIO;
-		__blk_end_request_all(rq, rq->errors);
+		__blk_end_request_all(rq, -ENXIO);
 		spin_unlock_irq(q->queue_lock);
 		return;
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3a21948c867a..f2f6c59c5b05 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -213,7 +213,6 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 #endif
 	rq->special = NULL;
 	/* tag was already set */
-	rq->errors = 0;
 	rq->extra_len = 0;
 
 	INIT_LIST_HEAD(&rq->timeout_list);
@@ -624,8 +623,7 @@ void blk_mq_abort_requeue_list(struct request_queue *q)
 
 		rq = list_first_entry(&rq_list, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		rq->errors = -EIO;
-		blk_mq_end_request(rq, rq->errors);
+		blk_mq_end_request(rq, -EIO);
 	}
 }
 EXPORT_SYMBOL(blk_mq_abort_requeue_list);
@@ -1032,8 +1030,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 			pr_err("blk-mq: bad return on queue: %d\n", ret);
 		case BLK_MQ_RQ_QUEUE_ERROR:
 			errors++;
-			rq->errors = -EIO;
-			blk_mq_end_request(rq, rq->errors);
+			blk_mq_end_request(rq, -EIO);
 			break;
 		}
 
@@ -1484,8 +1481,7 @@ static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
 
 	if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
 		*cookie = BLK_QC_T_NONE;
-		rq->errors = -EIO;
-		blk_mq_end_request(rq, rq->errors);
+		blk_mq_end_request(rq, -EIO);
 		return;
 	}
 
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index a30441a200c0..cbff183f3d9f 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -89,7 +89,6 @@ static void blk_rq_timed_out(struct request *req)
 		ret = q->rq_timed_out_fn(req);
 	switch (ret) {
 	case BLK_EH_HANDLED:
-		/* Can we use req->errors here? */
 		__blk_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a3dcee624de3..6c4ab0d4a160 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -220,8 +220,6 @@ struct request {
 
 	void *special;		/* opaque pointer available for LLD use */
 
-	int errors;
-
 	unsigned int extra_len;	/* length of alignment and padding */
 
 	unsigned long deadline;
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 99ed69fad041..d0dbe60d8a6d 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -80,7 +80,6 @@ TRACE_EVENT(block_rq_requeue,
 		__field(  dev_t,	dev			)
 		__field(  sector_t,	sector			)
 		__field(  unsigned int,	nr_sector		)
-		__field(  int,		errors			)
 		__array(  char,		rwbs,	RWBS_LEN	)
 		__dynamic_array( char,	cmd,	1		)
 	),
@@ -89,7 +88,6 @@ TRACE_EVENT(block_rq_requeue,
 		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
 		__entry->sector    = blk_rq_trace_sector(rq);
 		__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
-		__entry->errors    = rq->errors;
 
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 		__get_str(cmd)[0] = '\0';
@@ -99,13 +97,13 @@ TRACE_EVENT(block_rq_requeue,
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __get_str(cmd),
 		  (unsigned long long)__entry->sector,
-		  __entry->nr_sector, __entry->errors)
+		  __entry->nr_sector, 0)
 );
 
 /**
  * block_rq_complete - block IO operation completed by device driver
- * @q: queue containing the block operation request
  * @rq: block operations request
+ * @error: status code
  * @nr_bytes: number of completed bytes
  *
  * The block_rq_complete tracepoint event indicates that some portion
@@ -116,16 +114,15 @@ TRACE_EVENT(block_rq_requeue,
  */
 TRACE_EVENT(block_rq_complete,
 
-	TP_PROTO(struct request_queue *q, struct request *rq,
-		 unsigned int nr_bytes),
+	TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),
 
-	TP_ARGS(q, rq, nr_bytes),
+	TP_ARGS(rq, error, nr_bytes),
 
 	TP_STRUCT__entry(
 		__field(  dev_t,	dev			)
 		__field(  sector_t,	sector			)
 		__field(  unsigned int,	nr_sector		)
-		__field(  int,		errors			)
+		__field(  int,		error			)
 		__array(  char,		rwbs,	RWBS_LEN	)
 		__dynamic_array( char,	cmd,	1		)
 	),
@@ -134,7 +131,7 @@ TRACE_EVENT(block_rq_complete,
 		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
 		__entry->sector    = blk_rq_pos(rq);
 		__entry->nr_sector = nr_bytes >> 9;
-		__entry->errors    = rq->errors;
+		__entry->error     = error;
 
 		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
 		__get_str(cmd)[0] = '\0';
@@ -144,7 +141,7 @@ TRACE_EVENT(block_rq_complete,
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __get_str(cmd),
 		  (unsigned long long)__entry->sector,
-		  __entry->nr_sector, __entry->errors)
+		  __entry->nr_sector, __entry->error)
 );
 
 DECLARE_EVENT_CLASS(block_rq,
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9f3624dadb09..bd8ae8d5ae9c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
 
 /**
  * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
  * @rq:		the source request
+ * @error:	return status to log
  * @nr_bytes:	number of completed bytes
  * @what:	the action
  *
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
  *     Records an action against a request. Will log the bio offset + size.
  *
  **/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+static void blk_add_trace_rq(struct request *rq, int error,
 			     unsigned int nr_bytes, u32 what)
 {
-	struct blk_trace *bt = q->blk_trace;
+	struct blk_trace *bt = rq->q->blk_trace;
 
 	if (likely(!bt))
 		return;
@@ -713,34 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 		what |= BLK_TC_ACT(BLK_TC_FS);
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
-			rq->cmd_flags, what, rq->errors, 0, NULL);
+			rq->cmd_flags, what, error, 0, NULL);
 }
 
 static void blk_add_trace_rq_insert(void *ignore,
 				    struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
 }
 
 static void blk_add_trace_rq_issue(void *ignore,
 				   struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
 }
 
 static void blk_add_trace_rq_requeue(void *ignore,
 				     struct request_queue *q,
 				     struct request *rq)
 {
-	blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+	blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
 }
 
-static void blk_add_trace_rq_complete(void *ignore,
-				      struct request_queue *q,
-				      struct request *rq,
-				      unsigned int nr_bytes)
+static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
+			int error, unsigned int nr_bytes)
 {
-	blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
+	blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
 }
 
 /**
@@ -935,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
 	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
-			rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
+			rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
 			sizeof(r), &r);
 }
 
@@ -960,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
-				BLK_TA_DRV_DATA, rq->errors, len, data);
+				BLK_TA_DRV_DATA, 0, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
-- 
cgit v1.3-6-gb490


From 0206319fdfee7c36b97aa6c0561bab206132f813 Mon Sep 17 00:00:00 2001
From: Stephen Bates <sbates@raithlin.com>
Date: Thu, 20 Apr 2017 16:59:11 -0600
Subject: blk-mq: Fix poll_stat for new size-based bucketing.

Fixes an issue where the size of the poll_stat array in request_queue
does not match the size expected by the new size based bucketing for
IO completion polling.

Fixes: 720b8ccc4500 ("blk-mq: Add a polling specific stats function")
Signed-off-by: Stephen Bates <sbates@raithlin.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-debugfs.c | 15 +++++++++------
 block/blk-mq.c         |  2 --
 include/linux/blkdev.h |  5 ++++-
 3 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index df9b688b877c..3057641d5d15 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -159,14 +159,17 @@ static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
 static int queue_poll_stat_show(struct seq_file *m, void *v)
 {
 	struct request_queue *q = m->private;
+	int bucket;
 
-	seq_puts(m, "read: ");
-	print_stat(m, &q->poll_stat[READ]);
-	seq_puts(m, "\n");
+	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
+		seq_printf(m, "read  (%d Bytes): ", 1 << (9+bucket));
+		print_stat(m, &q->poll_stat[2*bucket]);
+		seq_puts(m, "\n");
 
-	seq_puts(m, "write: ");
-	print_stat(m, &q->poll_stat[WRITE]);
-	seq_puts(m, "\n");
+		seq_printf(m, "write (%d Bytes): ",  1 << (9+bucket));
+		print_stat(m, &q->poll_stat[2*bucket+1]);
+		seq_puts(m, "\n");
+	}
 	return 0;
 }
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 47b810638729..b6dc9ba38e35 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -42,8 +42,6 @@ static LIST_HEAD(all_q_list);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
-/* Must be consisitent with function below */
-#define BLK_MQ_POLL_STATS_BKTS 16
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
 	int ddir, bytes, bucket;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6c4ab0d4a160..6c247861cb66 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -46,6 +46,9 @@ struct blk_stat_callback;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 
+/* Must be consisitent with blk_mq_poll_stats_bkt() */
+#define BLK_MQ_POLL_STATS_BKTS 16
+
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
@@ -517,7 +520,7 @@ struct request_queue {
 	int			poll_nsec;
 
 	struct blk_stat_callback	*poll_cb;
-	struct blk_rq_stat	poll_stat[2];
+	struct blk_rq_stat	poll_stat[BLK_MQ_POLL_STATS_BKTS];
 
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
-- 
cgit v1.3-6-gb490


From f9f38e33389c019ec880f6825119c94867c1fde0 Mon Sep 17 00:00:00 2001
From: Helen Koike <helen.koike@collabora.co.uk>
Date: Mon, 10 Apr 2017 12:51:07 -0300
Subject: nvme: improve performance for virtual NVMe devices

This change provides a mechanism to reduce the number of MMIO doorbell
writes for the NVMe driver. When running in a virtualized environment
like QEMU, the cost of an MMIO is quite hefy here. The main idea for
the patch is provide the device two memory location locations:
 1) to store the doorbell values so they can be lookup without the doorbell
    MMIO write
 2) to store an event index.
I believe the doorbell value is obvious, the event index not so much.
Similar to the virtio specification, the virtual device can tell the
driver (guest OS) not to write MMIO unless you are writing past this
value.

FYI: doorbell values are written by the nvme driver (guest OS) and the
event index is written by the virtual device (host OS).

The patch implements a new admin command that will communicate where
these two memory locations reside. If the command fails, the nvme
driver will work as before without any optimizations.

Contributions:
  Eric Northup <digitaleric@google.com>
  Frank Swiderski <fes@google.com>
  Ted Tso <tytso@mit.edu>
  Keith Busch <keith.busch@intel.com>

Just to give an idea on the performance boost with the vendor
extension: Running fio [1], a stock NVMe driver I get about 200K read
IOPs with my vendor patch I get about 1000K read IOPs. This was
running with a null device i.e. the backing device simply returned
success on every read IO request.

[1] Running on a 4 core machine:
  fio --time_based --name=benchmark --runtime=30
  --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32
  --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4
  --rw=randread --blocksize=4k --randrepeat=false

Signed-off-by: Rob Nelson <rlnelson@google.com>
[mlin: port for upstream]
Signed-off-by: Ming Lin <mlin@kernel.org>
[koike: updated for upstream]
Signed-off-by: Helen Koike <helen.koike@collabora.co.uk>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/pci.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nvme.h    |  13 +++++
 2 files changed, 154 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index af783a33e93a..a363fecb8d82 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -103,8 +103,22 @@ struct nvme_dev {
 	u32 cmbloc;
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
+	u32 *dbbuf_dbs;
+	dma_addr_t dbbuf_dbs_dma_addr;
+	u32 *dbbuf_eis;
+	dma_addr_t dbbuf_eis_dma_addr;
 };
 
+static inline unsigned int sq_idx(unsigned int qid, u32 stride)
+{
+	return qid * 2 * stride;
+}
+
+static inline unsigned int cq_idx(unsigned int qid, u32 stride)
+{
+	return (qid * 2 + 1) * stride;
+}
+
 static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
 {
 	return container_of(ctrl, struct nvme_dev, ctrl);
@@ -133,6 +147,10 @@ struct nvme_queue {
 	u16 qid;
 	u8 cq_phase;
 	u8 cqe_seen;
+	u32 *dbbuf_sq_db;
+	u32 *dbbuf_cq_db;
+	u32 *dbbuf_sq_ei;
+	u32 *dbbuf_cq_ei;
 };
 
 /*
@@ -171,6 +189,112 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
 	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
 	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+}
+
+static inline unsigned int nvme_dbbuf_size(u32 stride)
+{
+	return ((num_possible_cpus() + 1) * 8 * stride);
+}
+
+static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+{
+	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+	if (dev->dbbuf_dbs)
+		return 0;
+
+	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
+					    &dev->dbbuf_dbs_dma_addr,
+					    GFP_KERNEL);
+	if (!dev->dbbuf_dbs)
+		return -ENOMEM;
+	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
+					    &dev->dbbuf_eis_dma_addr,
+					    GFP_KERNEL);
+	if (!dev->dbbuf_eis) {
+		dma_free_coherent(dev->dev, mem_size,
+				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+		dev->dbbuf_dbs = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
+{
+	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+	if (dev->dbbuf_dbs) {
+		dma_free_coherent(dev->dev, mem_size,
+				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+		dev->dbbuf_dbs = NULL;
+	}
+	if (dev->dbbuf_eis) {
+		dma_free_coherent(dev->dev, mem_size,
+				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
+		dev->dbbuf_eis = NULL;
+	}
+}
+
+static void nvme_dbbuf_init(struct nvme_dev *dev,
+			    struct nvme_queue *nvmeq, int qid)
+{
+	if (!dev->dbbuf_dbs || !qid)
+		return;
+
+	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
+	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
+	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
+	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
+}
+
+static void nvme_dbbuf_set(struct nvme_dev *dev)
+{
+	struct nvme_command c;
+
+	if (!dev->dbbuf_dbs)
+		return;
+
+	memset(&c, 0, sizeof(c));
+	c.dbbuf.opcode = nvme_admin_dbbuf;
+	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
+	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
+
+	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
+		dev_warn(dev->dev, "unable to set dbbuf\n");
+		/* Free memory and continue on */
+		nvme_dbbuf_dma_free(dev);
+	}
+}
+
+static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
+{
+	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
+}
+
+/* Update dbbuf and return true if an MMIO is required */
+static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
+					      volatile u32 *dbbuf_ei)
+{
+	if (dbbuf_db) {
+		u16 old_value;
+
+		/*
+		 * Ensure that the queue is written before updating
+		 * the doorbell in memory
+		 */
+		wmb();
+
+		old_value = *dbbuf_db;
+		*dbbuf_db = value;
+
+		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
+			return false;
+	}
+
+	return true;
 }
 
 /*
@@ -297,7 +421,9 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
 
 	if (++tail == nvmeq->q_depth)
 		tail = 0;
-	writel(tail, nvmeq->q_db);
+	if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
+					      nvmeq->dbbuf_sq_ei))
+		writel(tail, nvmeq->q_db);
 	nvmeq->sq_tail = tail;
 }
 
@@ -686,7 +812,9 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
 		return;
 
 	if (likely(nvmeq->cq_vector >= 0))
-		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+		if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
+						      nvmeq->dbbuf_cq_ei))
+			writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
 	nvmeq->cq_head = head;
 	nvmeq->cq_phase = phase;
 
@@ -1070,6 +1198,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+	nvme_dbbuf_init(dev, nvmeq, qid);
 	dev->online_queues++;
 	spin_unlock_irq(&nvmeq->q_lock);
 }
@@ -1542,6 +1671,8 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		if (blk_mq_alloc_tag_set(&dev->tagset))
 			return 0;
 		dev->ctrl.tagset = &dev->tagset;
+
+		nvme_dbbuf_set(dev);
 	} else {
 		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
 
@@ -1728,6 +1859,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
 {
 	struct nvme_dev *dev = to_nvme_dev(ctrl);
 
+	nvme_dbbuf_dma_free(dev);
 	put_device(dev->dev);
 	if (dev->tagset.tags)
 		blk_mq_free_tag_set(&dev->tagset);
@@ -1795,6 +1927,13 @@ static void nvme_reset_work(struct work_struct *work)
 		dev->ctrl.opal_dev = NULL;
 	}
 
+	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
+		result = nvme_dbbuf_dma_alloc(dev);
+		if (result)
+			dev_warn(dev->dev,
+				 "unable to allocate dma for dbbuf\n");
+	}
+
 	result = nvme_setup_io_queues(dev);
 	if (result)
 		goto out;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 9061780b141f..b625bacf37ef 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -245,6 +245,7 @@ enum {
 	NVME_CTRL_ONCS_WRITE_ZEROES		= 1 << 3,
 	NVME_CTRL_VWC_PRESENT			= 1 << 0,
 	NVME_CTRL_OACS_SEC_SUPP                 = 1 << 0,
+	NVME_CTRL_OACS_DBBUF_SUPP		= 1 << 7,
 };
 
 struct nvme_lbaf {
@@ -603,6 +604,7 @@ enum nvme_admin_opcode {
 	nvme_admin_download_fw		= 0x11,
 	nvme_admin_ns_attach		= 0x15,
 	nvme_admin_keep_alive		= 0x18,
+	nvme_admin_dbbuf		= 0x7C,
 	nvme_admin_format_nvm		= 0x80,
 	nvme_admin_security_send	= 0x81,
 	nvme_admin_security_recv	= 0x82,
@@ -874,6 +876,16 @@ struct nvmf_property_get_command {
 	__u8		resv4[16];
 };
 
+struct nvme_dbbuf {
+	__u8			opcode;
+	__u8			flags;
+	__u16			command_id;
+	__u32			rsvd1[5];
+	__le64			prp1;
+	__le64			prp2;
+	__u32			rsvd12[6];
+};
+
 struct nvme_command {
 	union {
 		struct nvme_common_command common;
@@ -893,6 +905,7 @@ struct nvme_command {
 		struct nvmf_connect_command connect;
 		struct nvmf_property_set_command prop_set;
 		struct nvmf_property_get_command prop_get;
+		struct nvme_dbbuf dbbuf;
 	};
 };
 
-- 
cgit v1.3-6-gb490


From 39498faef7c02f9f6de4060ccdc7e8975a6e690b Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Tue, 11 Apr 2017 11:32:28 -0700
Subject: nvmet_fc: add target feature flags for upcall isr contexts

Two new feature flags were added to control whether upcalls to the
transport result in context switches or stay in the calling context.

NVMET_FCTGTFEAT_CMD_IN_ISR:
  By default, if the flag is not set, the transport assumes the
  lldd is in a non-isr context and in the cpu context it should be
  for the io queue. As such, the cmd handler is called directly in the
  calling context.
  If the flag is set, indicating the upcall is an isr context, the
  transport mandates a transition to a workqueue. The workqueue assigned
  to the queue is used for the context.
NVMET_FCTGTFEAT_OPDONE_IN_ISR
  By default, if the flag is not set, the transport assumes the
  lldd is in a non-isr context and in the cpu context it should be
  for the io queue. As such, the fcp operation done callback is called
  directly in the calling context.
  If the flag is set, indicating the upcall is an isr context, the
  transport mandates a transition to a workqueue. The workqueue assigned
  to the queue is used for the context.

Updated lpfc for flags

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/target/fc.c       | 37 ++++++++++++++++++++++++++++++++++---
 drivers/nvme/target/fcloop.c   |  5 +++--
 drivers/scsi/lpfc/lpfc_nvmet.c |  4 +++-
 include/linux/nvme-fc-driver.h | 16 ++++++++++++++++
 4 files changed, 56 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 2c0709f0a7d3..a4fad3d0b660 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -86,6 +86,7 @@ struct nvmet_fc_fcp_iod {
 
 	struct nvmet_req		req;
 	struct work_struct		work;
+	struct work_struct		done_work;
 
 	struct nvmet_fc_tgtport		*tgtport;
 	struct nvmet_fc_tgt_queue	*queue;
@@ -213,6 +214,7 @@ static DEFINE_IDA(nvmet_fc_tgtport_cnt);
 
 static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work);
 static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work);
+static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work);
 static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc);
 static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
 static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue);
@@ -414,6 +416,7 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
 
 	for (i = 0; i < queue->sqsize; fod++, i++) {
 		INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work);
+		INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work);
 		fod->tgtport = tgtport;
 		fod->queue = queue;
 		fod->active = false;
@@ -1807,10 +1810,13 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
 	}
 }
 
+/*
+ * actual done handler for FCP operations when completed by the lldd
+ */
 static void
-nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
 {
-	struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+	struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
 	struct nvmet_fc_tgtport *tgtport = fod->tgtport;
 	unsigned long flags;
 	bool abort;
@@ -1905,6 +1911,28 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
 	}
 }
 
+static void
+nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work)
+{
+	struct nvmet_fc_fcp_iod *fod =
+		container_of(work, struct nvmet_fc_fcp_iod, done_work);
+
+	nvmet_fc_fod_op_done(fod);
+}
+
+static void
+nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+{
+	struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+	struct nvmet_fc_tgt_queue *queue = fod->queue;
+
+	if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR)
+		/* context switch so completion is not in ISR context */
+		queue_work_on(queue->cpu, queue->work_q, &fod->done_work);
+	else
+		nvmet_fc_fod_op_done(fod);
+}
+
 /*
  * actual completion handler after execution by the nvmet layer
  */
@@ -2149,7 +2177,10 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
 			((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0;
 	memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len);
 
-	queue_work_on(queue->cpu, queue->work_q, &fod->work);
+	if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR)
+		queue_work_on(queue->cpu, queue->work_q, &fod->work);
+	else
+		nvmet_fc_handle_fcp_rqst(tgtport, fod);
 
 	return 0;
 }
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 4e8e6a22bce1..a5cdeca39013 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -575,8 +575,9 @@ struct nvmet_fc_target_template tgttemplate = {
 	.max_dif_sgl_segments	= FCLOOP_SGL_SEGS,
 	.dma_boundary		= FCLOOP_DMABOUND_4G,
 	/* optional features */
-	.target_features	= NVMET_FCTGTFEAT_READDATA_RSP |
-				  NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED,
+	.target_features	= NVMET_FCTGTFEAT_CMD_IN_ISR |
+				  NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED |
+				  NVMET_FCTGTFEAT_OPDONE_IN_ISR,
 	/* sizes of additional private data for data structures */
 	.target_priv_sz		= sizeof(struct fcloop_tport),
 };
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c
index 7ca868f394da..176b4e035bcf 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -669,7 +669,9 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba)
 	lpfc_tgttemplate.max_hw_queues = phba->cfg_nvme_io_channel;
 	lpfc_tgttemplate.max_sgl_segments = phba->cfg_sg_seg_cnt;
 	lpfc_tgttemplate.target_features = NVMET_FCTGTFEAT_READDATA_RSP |
-					   NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED;
+					   NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED |
+					   NVMET_FCTGTFEAT_CMD_IN_ISR |
+					   NVMET_FCTGTFEAT_OPDONE_IN_ISR;
 
 #if (IS_ENABLED(CONFIG_NVME_TARGET_FC))
 	error = nvmet_fc_register_targetport(&pinfo, &lpfc_tgttemplate,
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 16eb264980c2..d70a9c98bc23 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -655,6 +655,22 @@ enum {
 		 * on. The transport should pick a cpu to schedule the work
 		 * on.
 		 */
+	NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 2),
+		/* Bit 2: When 0, the LLDD is calling the cmd rcv handler
+		 * in a non-isr context, allowing the transport to finish
+		 * op completion in the calling context. When 1, the LLDD
+		 * is calling the cmd rcv handler in an ISR context,
+		 * requiring the transport to transition to a workqueue
+		 * for op completion.
+		 */
+	NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 3),
+		/* Bit 3: When 0, the LLDD is calling the op done handler
+		 * in a non-isr context, allowing the transport to finish
+		 * op completion in the calling context. When 1, the LLDD
+		 * is calling the op done handler in an ISR context,
+		 * requiring the transport to transition to a workqueue
+		 * for op completion.
+		 */
 };
 
 
-- 
cgit v1.3-6-gb490


From 19b58d9473e8e3d38e7f3602a07c8febfbd07bc1 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Tue, 11 Apr 2017 11:32:29 -0700
Subject: nvmet_fc: add req_release to lldd api

With the advent of the opdone calls changing context, the lldd can no
longer assume that once the op->done call returns for RSP operations
that the request struct is no longer being accessed.

As such, revise the lldd api for a req_release callback that the
transport will call when the job is complete. This will also be used
with abort cases.

Fixed text in api header for change in io complete semantics.

Revised lpfc to support the new req_release api.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/target/fc.c       |  8 +++--
 drivers/nvme/target/fcloop.c   | 15 ++++++---
 drivers/scsi/lpfc/lpfc_nvmet.c | 73 +++++++++++++++++++++++++++++++++++++++---
 drivers/scsi/lpfc/lpfc_nvmet.h |  7 ++--
 include/linux/nvme-fc-driver.h | 57 ++++++++++++++++++++-------------
 5 files changed, 124 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index a4fad3d0b660..d7068f039904 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -482,6 +482,8 @@ static void
 nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
 			struct nvmet_fc_fcp_iod *fod)
 {
+	struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+	struct nvmet_fc_tgtport *tgtport = fod->tgtport;
 	unsigned long flags;
 
 	spin_lock_irqsave(&queue->qlock, flags);
@@ -493,6 +495,8 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
 	 * release the reference taken at queue lookup and fod allocation
 	 */
 	nvmet_fc_tgt_q_put(queue);
+
+	tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq);
 }
 
 static int
@@ -849,7 +853,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo,
 	int ret, idx;
 
 	if (!template->xmt_ls_rsp || !template->fcp_op ||
-	    !template->targetport_delete ||
+	    !template->fcp_req_release || !template->targetport_delete ||
 	    !template->max_hw_queues || !template->max_sgl_segments ||
 	    !template->max_dif_sgl_segments || !template->dma_boundary) {
 		ret = -EINVAL;
@@ -2124,7 +2128,7 @@ nvmet_fc_handle_fcp_rqst_work(struct work_struct *work)
  * If this routine returns error, the lldd should abort the exchange.
  *
  * @target_port: pointer to the (registered) target port the FCP CMD IU
- *              was receive on.
+ *              was received on.
  * @fcpreq:     pointer to a fcpreq request structure to be used to reference
  *              the exchange corresponding to the FCP Exchange.
  * @cmdiubuf:   pointer to the buffer containing the FCP CMD IU
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index a5cdeca39013..2c4d68201ba7 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -492,14 +492,18 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
 	tgt_fcpreq->fcp_error = fcp_err;
 	tgt_fcpreq->done(tgt_fcpreq);
 
-	if ((!fcp_err) && (op == NVMET_FCOP_RSP ||
-			op == NVMET_FCOP_READDATA_RSP ||
-			op == NVMET_FCOP_ABORT))
-		schedule_work(&tfcp_req->work);
-
 	return 0;
 }
 
+static void
+fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
+			struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+	struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+
+	schedule_work(&tfcp_req->work);
+}
+
 static void
 fcloop_ls_abort(struct nvme_fc_local_port *localport,
 			struct nvme_fc_remote_port *remoteport,
@@ -570,6 +574,7 @@ struct nvmet_fc_target_template tgttemplate = {
 	.targetport_delete	= fcloop_targetport_delete,
 	.xmt_ls_rsp		= fcloop_xmt_ls_rsp,
 	.fcp_op			= fcloop_fcp_op,
+	.fcp_req_release	= fcloop_fcp_req_release,
 	.max_hw_queues		= FCLOOP_HW_QUEUES,
 	.max_sgl_segments	= FCLOOP_SGL_SEGS,
 	.max_dif_sgl_segments	= FCLOOP_SGL_SEGS,
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c
index 176b4e035bcf..1846c7eb086a 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -408,9 +408,7 @@ out:
 		if (phba->ktime_on)
 			lpfc_nvmet_ktime(phba, ctxp);
 #endif
-		/* Let Abort cmpl repost the context */
-		if (!(ctxp->flag & LPFC_NVMET_ABORT_OP))
-			lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+		/* lpfc_nvmet_xmt_fcp_release() will recycle the context */
 	} else {
 		ctxp->entry_cnt++;
 		start_clean = offsetof(struct lpfc_iocbq, wqe);
@@ -634,10 +632,47 @@ lpfc_nvmet_targetport_delete(struct nvmet_fc_target_port *targetport)
 	complete(&tport->tport_unreg_done);
 }
 
+static void
+lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport,
+			   struct nvmefc_tgt_fcp_req *rsp)
+{
+	struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private;
+	struct lpfc_nvmet_rcv_ctx *ctxp =
+		container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);
+	struct lpfc_hba *phba = ctxp->phba;
+	unsigned long flags;
+	bool aborting = false;
+
+	spin_lock_irqsave(&ctxp->ctxlock, flags);
+	if (ctxp->flag & LPFC_NVMET_ABORT_OP) {
+		aborting = true;
+		ctxp->flag |= LPFC_NVMET_CTX_RLS;
+	}
+	spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
+	if (aborting)
+		/* let the abort path do the real release */
+		return;
+
+	/* Sanity check */
+	if (ctxp->state != LPFC_NVMET_STE_DONE) {
+		atomic_inc(&lpfc_nvmep->xmt_fcp_drop);
+		lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
+				"6117 Bad state IO x%x aborted\n",
+				ctxp->oxid);
+	}
+
+	lpfc_nvmeio_data(phba, "NVMET FCP FREE: xri x%x ste %d\n", ctxp->oxid,
+			 ctxp->state, 0);
+
+	lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+}
+
 static struct nvmet_fc_target_template lpfc_tgttemplate = {
 	.targetport_delete = lpfc_nvmet_targetport_delete,
 	.xmt_ls_rsp     = lpfc_nvmet_xmt_ls_rsp,
 	.fcp_op         = lpfc_nvmet_xmt_fcp_op,
+	.fcp_req_release = lpfc_nvmet_xmt_fcp_release,
 
 	.max_hw_queues  = 1,
 	.max_sgl_segments = LPFC_NVMET_DEFAULT_SEGS,
@@ -834,6 +869,7 @@ dropit:
 	ctxp->wqeq = NULL;
 	ctxp->state = LPFC_NVMET_STE_RCV;
 	ctxp->rqb_buffer = (void *)nvmebuf;
+	spin_lock_init(&ctxp->ctxlock);
 
 	lpfc_nvmeio_data(phba, "NVMET LS   RCV: xri x%x sz %d from %06x\n",
 			 oxid, size, sid);
@@ -1595,6 +1631,8 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
 	struct lpfc_nvmet_rcv_ctx *ctxp;
 	struct lpfc_nvmet_tgtport *tgtp;
 	uint32_t status, result;
+	unsigned long flags;
+	bool released = false;
 
 	ctxp = cmdwqe->context2;
 	status = bf_get(lpfc_wcqe_c_status, wcqe);
@@ -1609,7 +1647,18 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
 			result, wcqe->word3);
 
 	ctxp->state = LPFC_NVMET_STE_DONE;
-	lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+	spin_lock_irqsave(&ctxp->ctxlock, flags);
+	if (ctxp->flag & LPFC_NVMET_CTX_RLS)
+		released = true;
+	ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
+	spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
+	/*
+	 * if transport has released ctx, then can reuse it. Otherwise,
+	 * will be recycled by transport release call.
+	 */
+	if (released)
+		lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
 
 	cmdwqe->context2 = NULL;
 	cmdwqe->context3 = NULL;
@@ -1632,7 +1681,9 @@ lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
 {
 	struct lpfc_nvmet_rcv_ctx *ctxp;
 	struct lpfc_nvmet_tgtport *tgtp;
+	unsigned long flags;
 	uint32_t status, result;
+	bool released = false;
 
 	ctxp = cmdwqe->context2;
 	status = bf_get(lpfc_wcqe_c_status, wcqe);
@@ -1654,7 +1705,19 @@ lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
 					ctxp->state, ctxp->oxid);
 		}
 		ctxp->state = LPFC_NVMET_STE_DONE;
-		lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+		spin_lock_irqsave(&ctxp->ctxlock, flags);
+		if (ctxp->flag & LPFC_NVMET_CTX_RLS)
+			released = true;
+		ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
+		spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
+		/*
+		 * if transport has released ctx, then can reuse it. Otherwise,
+		 * will be recycled by transport release call.
+		 */
+		if (released)
+			lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+
 		cmdwqe->context2 = NULL;
 		cmdwqe->context3 = NULL;
 	}
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h
index ca96f05c1604..02735fc6fd41 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.h
+++ b/drivers/scsi/lpfc/lpfc_nvmet.h
@@ -81,6 +81,7 @@ struct lpfc_nvmet_rcv_ctx {
 	struct lpfc_iocbq *wqeq;
 	struct lpfc_iocbq *abort_wqeq;
 	dma_addr_t txrdy_phys;
+	spinlock_t ctxlock; /* protect flag access */
 	uint32_t *txrdy;
 	uint32_t sid;
 	uint32_t offset;
@@ -97,8 +98,10 @@ struct lpfc_nvmet_rcv_ctx {
 #define LPFC_NVMET_STE_RSP		4
 #define LPFC_NVMET_STE_DONE		5
 	uint16_t flag;
-#define LPFC_NVMET_IO_INP		1
-#define LPFC_NVMET_ABORT_OP		2
+#define LPFC_NVMET_IO_INP		0x1
+#define LPFC_NVMET_ABORT_OP		0x2
+#define LPFC_NVMET_CTX_RLS		0x4
+
 	struct rqb_dmabuf *rqb_buffer;
 
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index d70a9c98bc23..d98ddb2feabc 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -741,12 +741,12 @@ struct nvmet_fc_target_port {
  *       be freed/released.
  *       Entrypoint is Mandatory.
  *
- * @fcp_op:  Called to perform a data transfer, transmit a response, or
- *       abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same
- *       LLDD-supplied exchange structure specified in the
- *       nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received.
- *       The op field in the structure shall indicate the operation for
- *       the LLDD to perform relative to the io.
+ * @fcp_op:  Called to perform a data transfer or transmit a response.
+ *       The nvmefc_tgt_fcp_req structure is the same LLDD-supplied
+ *       exchange structure specified in the nvmet_fc_rcv_fcp_req() call
+ *       made when the FCP CMD IU was received. The op field in the
+ *       structure shall indicate the operation for the LLDD to perform
+ *       relative to the io.
  *         NVMET_FCOP_READDATA operation: the LLDD is to send the
  *           payload data (described by sglist) to the host in 1 or
  *           more FC sequences (preferrably 1).  Note: the fc-nvme layer
@@ -768,29 +768,35 @@ struct nvmet_fc_target_port {
  *           successfully, the LLDD is to update the nvmefc_tgt_fcp_req
  *           transferred_length field and may subsequently transmit the
  *           FCP_RSP iu payload (described by rspbuf, rspdma, rsplen).
- *           The LLDD is to await FCP_CONF reception to confirm the RSP
- *           reception by the host. The LLDD may retramsit the FCP_RSP iu
- *           if necessary per FC-NVME. Upon reception of FCP_CONF, or upon
- *           FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req
- *           fcp_error field and consider the operation complete..
+ *           If FCP_CONF is supported, the LLDD is to await FCP_CONF
+ *           reception to confirm the RSP reception by the host. The LLDD
+ *           may retramsit the FCP_RSP iu if necessary per FC-NVME. Upon
+ *           transmission of the FCP_RSP iu if FCP_CONF is not supported,
+ *           or upon success/failure of FCP_CONF if it is supported, the
+ *           LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
+ *           consider the operation complete.
  *         NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload
- *           (described by rspbuf, rspdma, rsplen).  The LLDD is to await
- *           FCP_CONF reception to confirm the RSP reception by the host.
- *           The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME.
- *           Upon reception of FCP_CONF, or upon FCP_CONF failure, the
+ *           (described by rspbuf, rspdma, rsplen). If FCP_CONF is
+ *           supported, the LLDD is to await FCP_CONF reception to confirm
+ *           the RSP reception by the host. The LLDD may retramsit the
+ *           FCP_RSP iu if FCP_CONF is not received per FC-NVME. Upon
+ *           transmission of the FCP_RSP iu if FCP_CONF is not supported,
+ *           or upon success/failure of FCP_CONF if it is supported, the
  *           LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
- *           consider the operation complete..
+ *           consider the operation complete.
  *         NVMET_FCOP_ABORT: the LLDD is to terminate the exchange
  *           corresponding to the fcp operation. The LLDD shall send
  *           ABTS and follow FC exchange abort-multi rules, including
  *           ABTS retries and possible logout.
  *       Upon completing the indicated operation, the LLDD is to set the
  *       status fields for the operation (tranferred_length and fcp_error
- *       status) in the request, then all the "done" routine
- *       indicated in the fcp request.  Upon return from the "done"
- *       routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation
- *       the fc-nvme layer will not longer reference the fcp request,
- *       allowing the LLDD to free/release the fcp request.
+ *       status) in the request, then call the "done" routine
+ *       indicated in the fcp request. After the operation completes,
+ *       regardless of whether the FCP_RSP iu was successfully transmit,
+ *       the LLDD-supplied exchange structure must remain valid until the
+ *       transport calls the fcp_req_release() callback to return ownership
+ *       of the exchange structure back to the LLDD so that it may be used
+ *       for another fcp command.
  *       Note: when calling the done routine for READDATA or WRITEDATA
  *       operations, the fc-nvme layer may immediate convert, in the same
  *       thread and before returning to the LLDD, the fcp operation to
@@ -802,6 +808,11 @@ struct nvmet_fc_target_port {
  *       Returns 0 on success, -<errno> on failure (Ex: -EIO)
  *       Entrypoint is Mandatory.
  *
+ * @fcp_req_release:  Called by the transport to return a nvmefc_tgt_fcp_req
+ *       to the LLDD after all operations on the fcp operation are complete.
+ *       This may be due to the command completing or upon completion of
+ *       abort cleanup.
+ *
  * @max_hw_queues:  indicates the maximum number of hw queues the LLDD
  *       supports for cpu affinitization.
  *       Value is Mandatory. Must be at least 1.
@@ -836,7 +847,9 @@ struct nvmet_fc_target_template {
 	int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport,
 				struct nvmefc_tgt_ls_req *tls_req);
 	int (*fcp_op)(struct nvmet_fc_target_port *tgtport,
-				struct nvmefc_tgt_fcp_req *);
+				struct nvmefc_tgt_fcp_req *fcpreq);
+	void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport,
+				struct nvmefc_tgt_fcp_req *fcpreq);
 
 	u32	max_hw_queues;
 	u16	max_sgl_segments;
-- 
cgit v1.3-6-gb490


From a97ec51b37efacb84f286979876675a8143035b0 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Tue, 11 Apr 2017 11:32:31 -0700
Subject: nvmet_fc: Rework target side abort handling

target transport:
----------------------
There are cases when there is a need to abort in-progress target
operations (writedata) so that controller termination or errors can
clean up. That can't happen currently as the abort is another target
op type, so it can't be used till the running one finishes (and it may
not).  Solve by removing the abort op type and creating a separate
downcall from the transport to the lldd to request an io to be aborted.

The transport will abort ios on queue teardown or io errors. In general
the transport tries to call the lldd abort only when the io state is
idle. Meaning: ops that transmit data (readdata or rsp) will always
finish their transmit (or the lldd will see a state on the
link or initiator port that fails the transmit) and the done call for
the operation will occur. The transport will wait for the op done
upcall before calling the abort function, and as the io is idle, the
io can be cleaned up immediately after the abort call; Similarly, ios
that are not waiting for data or transmitting data must be in the nvmet
layer being processed. The transport will wait for the nvmet layer
completion before calling the abort function, and as the io is idle,
the io can be cleaned up immediately after the abort call; As for ops
that are waiting for data (writedata), they may be outstanding
indefinitely if the lldd doesn't see a condition where the initiatior
port or link is bad. In those cases, the transport will call the abort
function and wait for the lldd's op done upcall for the operation, where
it will then clean up the io.

Additionally, if a lldd receives an ABTS and matches it to an outstanding
request in the transport, A new new transport upcall was created to abort
the outstanding request in the transport. The transport expects any
outstanding op call (readdata or writedata) will completed by the lldd and
the operation upcall made. The transport doesn't act on the reported
abort (e.g. clean up the io) until an op done upcall occurs, a new op is
attempted, or the nvmet layer completes the io processing.

fcloop:
----------------------
Updated to support the new target apis.
On fcp io aborts from the initiator, the loopback context is updated to
NULL out the half that has completed. The initiator side is immediately
called after the abort request with an io completion (abort status).
On fcp io aborts from the target, the io is stopped and the initiator side
sees it as an aborted io. Target side ops, perhaps in progress while the
initiator side is done, continue but noop the data movement as there's no
structure on the initiator side to reference.

patch also contains:
----------------------
Revised lpfc to support the new abort api

commonized rsp buffer syncing and nulling of private data based on
calling paths.

errors in op done calls don't take action on the fod. They're bad
operations which implies the fod may be bad.

Signed-off-by: James Smart <james.smart@broadcom.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/target/fc.c       | 205 ++++++++++++++++++++++++++++++-----------
 drivers/nvme/target/fcloop.c   | 152 +++++++++++++++++++++++++-----
 drivers/scsi/lpfc/lpfc_nvmet.c |  49 +++++-----
 include/linux/nvme-fc-driver.h |  25 +++--
 4 files changed, 325 insertions(+), 106 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index d7068f039904..e5d30bbb1b10 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -82,6 +82,8 @@ struct nvmet_fc_fcp_iod {
 	enum nvmet_fcp_datadir		io_dir;
 	bool				active;
 	bool				abort;
+	bool				aborted;
+	bool				writedataactive;
 	spinlock_t			flock;
 
 	struct nvmet_req		req;
@@ -420,6 +422,9 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
 		fod->tgtport = tgtport;
 		fod->queue = queue;
 		fod->active = false;
+		fod->abort = false;
+		fod->aborted = false;
+		fod->fcpreq = NULL;
 		list_add_tail(&fod->fcp_list, &queue->fod_list);
 		spin_lock_init(&fod->flock);
 
@@ -466,7 +471,6 @@ nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue)
 	if (fod) {
 		list_del(&fod->fcp_list);
 		fod->active = true;
-		fod->abort = false;
 		/*
 		 * no queue reference is taken, as it was taken by the
 		 * queue lookup just prior to the allocation. The iod
@@ -486,9 +490,18 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
 	struct nvmet_fc_tgtport *tgtport = fod->tgtport;
 	unsigned long flags;
 
+	fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
+				sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+
+	fcpreq->nvmet_fc_private = NULL;
+
 	spin_lock_irqsave(&queue->qlock, flags);
 	list_add_tail(&fod->fcp_list, &fod->queue->fod_list);
 	fod->active = false;
+	fod->abort = false;
+	fod->aborted = false;
+	fod->writedataactive = false;
+	fod->fcpreq = NULL;
 	spin_unlock_irqrestore(&queue->qlock, flags);
 
 	/*
@@ -622,33 +635,13 @@ nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue)
 }
 
 
-static void
-nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
-				struct nvmefc_tgt_fcp_req *fcpreq)
-{
-	int ret;
-
-	fcpreq->op = NVMET_FCOP_ABORT;
-	fcpreq->offset = 0;
-	fcpreq->timeout = 0;
-	fcpreq->transfer_length = 0;
-	fcpreq->transferred_length = 0;
-	fcpreq->fcp_error = 0;
-	fcpreq->sg_cnt = 0;
-
-	ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fcpreq);
-	if (ret)
-		/* should never reach here !! */
-		WARN_ON(1);
-}
-
-
 static void
 nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
 {
+	struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport;
 	struct nvmet_fc_fcp_iod *fod = queue->fod;
 	unsigned long flags;
-	int i;
+	int i, writedataactive;
 	bool disconnect;
 
 	disconnect = atomic_xchg(&queue->connected, 0);
@@ -659,7 +652,20 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
 		if (fod->active) {
 			spin_lock(&fod->flock);
 			fod->abort = true;
+			writedataactive = fod->writedataactive;
 			spin_unlock(&fod->flock);
+			/*
+			 * only call lldd abort routine if waiting for
+			 * writedata. other outstanding ops should finish
+			 * on their own.
+			 */
+			if (writedataactive) {
+				spin_lock(&fod->flock);
+				fod->aborted = true;
+				spin_unlock(&fod->flock);
+				tgtport->ops->fcp_abort(
+					&tgtport->fc_target_port, fod->fcpreq);
+			}
 		}
 	}
 	spin_unlock_irqrestore(&queue->qlock, flags);
@@ -853,6 +859,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo,
 	int ret, idx;
 
 	if (!template->xmt_ls_rsp || !template->fcp_op ||
+	    !template->fcp_abort ||
 	    !template->fcp_req_release || !template->targetport_delete ||
 	    !template->max_hw_queues || !template->max_sgl_segments ||
 	    !template->max_dif_sgl_segments || !template->dma_boundary) {
@@ -1717,6 +1724,26 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
 
 static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq);
 
+static void
+nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
+				struct nvmet_fc_fcp_iod *fod)
+{
+	struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+
+	/* data no longer needed */
+	nvmet_fc_free_tgt_pgs(fod);
+
+	/*
+	 * if an ABTS was received or we issued the fcp_abort early
+	 * don't call abort routine again.
+	 */
+	/* no need to take lock - lock was taken earlier to get here */
+	if (!fod->aborted)
+		tgtport->ops->fcp_abort(&tgtport->fc_target_port, fcpreq);
+
+	nvmet_fc_free_fcp_iod(fod->queue, fod);
+}
+
 static void
 nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
 				struct nvmet_fc_fcp_iod *fod)
@@ -1730,7 +1757,7 @@ nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
 
 	ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq);
 	if (ret)
-		nvmet_fc_abort_op(tgtport, fod->fcpreq);
+		nvmet_fc_abort_op(tgtport, fod);
 }
 
 static void
@@ -1739,6 +1766,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
 {
 	struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
 	struct scatterlist *sg, *datasg;
+	unsigned long flags;
 	u32 tlen, sg_off;
 	int ret;
 
@@ -1803,10 +1831,13 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
 		 */
 		fod->abort = true;
 
-		if (op == NVMET_FCOP_WRITEDATA)
+		if (op == NVMET_FCOP_WRITEDATA) {
+			spin_lock_irqsave(&fod->flock, flags);
+			fod->writedataactive = false;
+			spin_unlock_irqrestore(&fod->flock, flags);
 			nvmet_req_complete(&fod->req,
 					NVME_SC_FC_TRANSPORT_ERROR);
-		else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
+		} else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
 			fcpreq->fcp_error = ret;
 			fcpreq->transferred_length = 0;
 			nvmet_fc_xmt_fcp_op_done(fod->fcpreq);
@@ -1814,6 +1845,27 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
 	}
 }
 
+static inline bool
+__nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort)
+{
+	struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+	struct nvmet_fc_tgtport *tgtport = fod->tgtport;
+
+	/* if in the middle of an io and we need to tear down */
+	if (abort) {
+		if (fcpreq->op == NVMET_FCOP_WRITEDATA) {
+			nvmet_req_complete(&fod->req,
+					NVME_SC_FC_TRANSPORT_ERROR);
+			return true;
+		}
+
+		nvmet_fc_abort_op(tgtport, fod);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * actual done handler for FCP operations when completed by the lldd
  */
@@ -1827,22 +1879,20 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
 
 	spin_lock_irqsave(&fod->flock, flags);
 	abort = fod->abort;
+	fod->writedataactive = false;
 	spin_unlock_irqrestore(&fod->flock, flags);
 
-	/* if in the middle of an io and we need to tear down */
-	if (abort && fcpreq->op != NVMET_FCOP_ABORT) {
-		/* data no longer needed */
-		nvmet_fc_free_tgt_pgs(fod);
-
-		nvmet_req_complete(&fod->req, fcpreq->fcp_error);
-		return;
-	}
-
 	switch (fcpreq->op) {
 
 	case NVMET_FCOP_WRITEDATA:
+		if (__nvmet_fc_fod_op_abort(fod, abort))
+			return;
 		if (fcpreq->fcp_error ||
 		    fcpreq->transferred_length != fcpreq->transfer_length) {
+			spin_lock(&fod->flock);
+			fod->abort = true;
+			spin_unlock(&fod->flock);
+
 			nvmet_req_complete(&fod->req,
 					NVME_SC_FC_TRANSPORT_ERROR);
 			return;
@@ -1850,6 +1900,10 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
 
 		fod->offset += fcpreq->transferred_length;
 		if (fod->offset != fod->total_length) {
+			spin_lock_irqsave(&fod->flock, flags);
+			fod->writedataactive = true;
+			spin_unlock_irqrestore(&fod->flock, flags);
+
 			/* transfer the next chunk */
 			nvmet_fc_transfer_fcp_data(tgtport, fod,
 						NVMET_FCOP_WRITEDATA);
@@ -1864,12 +1918,11 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
 
 	case NVMET_FCOP_READDATA:
 	case NVMET_FCOP_READDATA_RSP:
+		if (__nvmet_fc_fod_op_abort(fod, abort))
+			return;
 		if (fcpreq->fcp_error ||
 		    fcpreq->transferred_length != fcpreq->transfer_length) {
-			/* data no longer needed */
-			nvmet_fc_free_tgt_pgs(fod);
-
-			nvmet_fc_abort_op(tgtport, fod->fcpreq);
+			nvmet_fc_abort_op(tgtport, fod);
 			return;
 		}
 
@@ -1878,8 +1931,6 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
 		if (fcpreq->op == NVMET_FCOP_READDATA_RSP) {
 			/* data no longer needed */
 			nvmet_fc_free_tgt_pgs(fod);
-			fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
-					sizeof(fod->rspiubuf), DMA_TO_DEVICE);
 			nvmet_fc_free_fcp_iod(fod->queue, fod);
 			return;
 		}
@@ -1902,15 +1953,12 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
 		break;
 
 	case NVMET_FCOP_RSP:
-	case NVMET_FCOP_ABORT:
-		fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
-				sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+		if (__nvmet_fc_fod_op_abort(fod, abort))
+			return;
 		nvmet_fc_free_fcp_iod(fod->queue, fod);
 		break;
 
 	default:
-		nvmet_fc_free_tgt_pgs(fod);
-		nvmet_fc_abort_op(tgtport, fod->fcpreq);
 		break;
 	}
 }
@@ -1958,10 +2006,7 @@ __nvmet_fc_fcp_nvme_cmd_done(struct nvmet_fc_tgtport *tgtport,
 		fod->queue->sqhd = cqe->sq_head;
 
 	if (abort) {
-		/* data no longer needed */
-		nvmet_fc_free_tgt_pgs(fod);
-
-		nvmet_fc_abort_op(tgtport, fod->fcpreq);
+		nvmet_fc_abort_op(tgtport, fod);
 		return;
 	}
 
@@ -2057,8 +2102,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
 				&fod->queue->nvme_cq,
 				&fod->queue->nvme_sq,
 				&nvmet_fc_tgt_fcp_ops);
-	if (!ret) {	/* bad SQE content */
-		nvmet_fc_abort_op(tgtport, fod->fcpreq);
+	if (!ret) {	/* bad SQE content or invalid ctrl state */
+		nvmet_fc_abort_op(tgtport, fod);
 		return;
 	}
 
@@ -2098,7 +2143,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
 	return;
 
 transport_error:
-	nvmet_fc_abort_op(tgtport, fod->fcpreq);
+	nvmet_fc_abort_op(tgtport, fod);
 }
 
 /*
@@ -2151,7 +2196,6 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
 			(be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4)))
 		return -EIO;
 
-
 	queue = nvmet_fc_find_target_queue(tgtport,
 				be64_to_cpu(cmdiu->connection_id));
 	if (!queue)
@@ -2190,6 +2234,59 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
 }
 EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req);
 
+/**
+ * nvmet_fc_rcv_fcp_abort - transport entry point called by an LLDD
+ *                       upon the reception of an ABTS for a FCP command
+ *
+ * Notify the transport that an ABTS has been received for a FCP command
+ * that had been given to the transport via nvmet_fc_rcv_fcp_req(). The
+ * LLDD believes the command is still being worked on
+ * (template_ops->fcp_req_release() has not been called).
+ *
+ * The transport will wait for any outstanding work (an op to the LLDD,
+ * which the lldd should complete with error due to the ABTS; or the
+ * completion from the nvmet layer of the nvme command), then will
+ * stop processing and call the nvmet_fc_rcv_fcp_req() callback to
+ * return the i/o context to the LLDD.  The LLDD may send the BA_ACC
+ * to the ABTS either after return from this function (assuming any
+ * outstanding op work has been terminated) or upon the callback being
+ * called.
+ *
+ * @target_port: pointer to the (registered) target port the FCP CMD IU
+ *              was received on.
+ * @fcpreq:     pointer to the fcpreq request structure that corresponds
+ *              to the exchange that received the ABTS.
+ */
+void
+nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port,
+			struct nvmefc_tgt_fcp_req *fcpreq)
+{
+	struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+	struct nvmet_fc_tgt_queue *queue;
+	unsigned long flags;
+
+	if (!fod || fod->fcpreq != fcpreq)
+		/* job appears to have already completed, ignore abort */
+		return;
+
+	queue = fod->queue;
+
+	spin_lock_irqsave(&queue->qlock, flags);
+	if (fod->active) {
+		/*
+		 * mark as abort. The abort handler, invoked upon completion
+		 * of any work, will detect the aborted status and do the
+		 * callback.
+		 */
+		spin_lock(&fod->flock);
+		fod->abort = true;
+		fod->aborted = true;
+		spin_unlock(&fod->flock);
+	}
+	spin_unlock_irqrestore(&queue->qlock, flags);
+}
+EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort);
+
 enum {
 	FCT_TRADDR_ERR		= 0,
 	FCT_TRADDR_WWNN		= 1 << 0,
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index dbcafd30dd9b..aaa3dbe22bd5 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -246,7 +246,10 @@ struct fcloop_lsreq {
 struct fcloop_fcpreq {
 	struct fcloop_tport		*tport;
 	struct nvmefc_fcp_req		*fcpreq;
+	spinlock_t			reqlock;
 	u16				status;
+	bool				active;
+	bool				aborted;
 	struct work_struct		work;
 	struct nvmefc_tgt_fcp_req	tgt_fcp_req;
 };
@@ -254,6 +257,7 @@ struct fcloop_fcpreq {
 struct fcloop_ini_fcpreq {
 	struct nvmefc_fcp_req		*fcpreq;
 	struct fcloop_fcpreq		*tfcp_req;
+	struct work_struct		iniwork;
 };
 
 static inline struct fcloop_lsreq *
@@ -345,7 +349,21 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
 }
 
 /*
- * FCP IO operation done. call back up initiator "done" flows.
+ * FCP IO operation done by initiator abort.
+ * call back up initiator "done" flows.
+ */
+static void
+fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+{
+	struct fcloop_ini_fcpreq *inireq =
+		container_of(work, struct fcloop_ini_fcpreq, iniwork);
+
+	inireq->fcpreq->done(inireq->fcpreq);
+}
+
+/*
+ * FCP IO operation done by target completion.
+ * call back up initiator "done" flows.
  */
 static void
 fcloop_tgt_fcprqst_done_work(struct work_struct *work)
@@ -353,9 +371,13 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
 	struct fcloop_fcpreq *tfcp_req =
 		container_of(work, struct fcloop_fcpreq, work);
 	struct fcloop_tport *tport = tfcp_req->tport;
-	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+	struct nvmefc_fcp_req *fcpreq;
+
+	spin_lock(&tfcp_req->reqlock);
+	fcpreq = tfcp_req->fcpreq;
+	spin_unlock(&tfcp_req->reqlock);
 
-	if (tport->remoteport) {
+	if (tport->remoteport && fcpreq) {
 		fcpreq->status = tfcp_req->status;
 		fcpreq->done(fcpreq);
 	}
@@ -384,8 +406,10 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
 
 	inireq->fcpreq = fcpreq;
 	inireq->tfcp_req = tfcp_req;
+	INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
 	tfcp_req->fcpreq = fcpreq;
 	tfcp_req->tport = rport->targetport->private;
+	spin_lock_init(&tfcp_req->reqlock);
 	INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
 
 	ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
@@ -453,50 +477,86 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
 			struct nvmefc_tgt_fcp_req *tgt_fcpreq)
 {
 	struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
-	struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+	struct nvmefc_fcp_req *fcpreq;
 	u32 rsplen = 0, xfrlen = 0;
-	int fcp_err = 0;
+	int fcp_err = 0, active, aborted;
 	u8 op = tgt_fcpreq->op;
 
+	spin_lock(&tfcp_req->reqlock);
+	fcpreq = tfcp_req->fcpreq;
+	active = tfcp_req->active;
+	aborted = tfcp_req->aborted;
+	tfcp_req->active = true;
+	spin_unlock(&tfcp_req->reqlock);
+
+	if (unlikely(active))
+		/* illegal - call while i/o active */
+		return -EALREADY;
+
+	if (unlikely(aborted)) {
+		/* target transport has aborted i/o prior */
+		spin_lock(&tfcp_req->reqlock);
+		tfcp_req->active = false;
+		spin_unlock(&tfcp_req->reqlock);
+		tgt_fcpreq->transferred_length = 0;
+		tgt_fcpreq->fcp_error = -ECANCELED;
+		tgt_fcpreq->done(tgt_fcpreq);
+		return 0;
+	}
+
+	/*
+	 * if fcpreq is NULL, the I/O has been aborted (from
+	 * initiator side). For the target side, act as if all is well
+	 * but don't actually move data.
+	 */
+
 	switch (op) {
 	case NVMET_FCOP_WRITEDATA:
 		xfrlen = tgt_fcpreq->transfer_length;
-		fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
-					tgt_fcpreq->offset, xfrlen);
-		fcpreq->transferred_length += xfrlen;
+		if (fcpreq) {
+			fcloop_fcp_copy_data(op, tgt_fcpreq->sg,
+					fcpreq->first_sgl, tgt_fcpreq->offset,
+					xfrlen);
+			fcpreq->transferred_length += xfrlen;
+		}
 		break;
 
 	case NVMET_FCOP_READDATA:
 	case NVMET_FCOP_READDATA_RSP:
 		xfrlen = tgt_fcpreq->transfer_length;
-		fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
-					tgt_fcpreq->offset, xfrlen);
-		fcpreq->transferred_length += xfrlen;
+		if (fcpreq) {
+			fcloop_fcp_copy_data(op, tgt_fcpreq->sg,
+					fcpreq->first_sgl, tgt_fcpreq->offset,
+					xfrlen);
+			fcpreq->transferred_length += xfrlen;
+		}
 		if (op == NVMET_FCOP_READDATA)
 			break;
 
 		/* Fall-Thru to RSP handling */
 
 	case NVMET_FCOP_RSP:
-		rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
-				fcpreq->rsplen : tgt_fcpreq->rsplen);
-		memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
-		if (rsplen < tgt_fcpreq->rsplen)
-			fcp_err = -E2BIG;
-		fcpreq->rcv_rsplen = rsplen;
-		fcpreq->status = 0;
+		if (fcpreq) {
+			rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
+					fcpreq->rsplen : tgt_fcpreq->rsplen);
+			memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
+			if (rsplen < tgt_fcpreq->rsplen)
+				fcp_err = -E2BIG;
+			fcpreq->rcv_rsplen = rsplen;
+			fcpreq->status = 0;
+		}
 		tfcp_req->status = 0;
 		break;
 
-	case NVMET_FCOP_ABORT:
-		tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
-		break;
-
 	default:
 		fcp_err = -EINVAL;
 		break;
 	}
 
+	spin_lock(&tfcp_req->reqlock);
+	tfcp_req->active = false;
+	spin_unlock(&tfcp_req->reqlock);
+
 	tgt_fcpreq->transferred_length = xfrlen;
 	tgt_fcpreq->fcp_error = fcp_err;
 	tgt_fcpreq->done(tgt_fcpreq);
@@ -504,6 +564,32 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
 	return 0;
 }
 
+static void
+fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport,
+			struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+	struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+	int active;
+
+	/*
+	 * mark aborted only in case there were 2 threads in transport
+	 * (one doing io, other doing abort) and only kills ops posted
+	 * after the abort request
+	 */
+	spin_lock(&tfcp_req->reqlock);
+	active = tfcp_req->active;
+	tfcp_req->aborted = true;
+	spin_unlock(&tfcp_req->reqlock);
+
+	tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
+
+	/*
+	 * nothing more to do. If io wasn't active, the transport should
+	 * immediately call the req_release. If it was active, the op
+	 * will complete, and the lldd should call req_release.
+	 */
+}
+
 static void
 fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
 			struct nvmefc_tgt_fcp_req *tgt_fcpreq)
@@ -526,6 +612,27 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
 			void *hw_queue_handle,
 			struct nvmefc_fcp_req *fcpreq)
 {
+	struct fcloop_rport *rport = remoteport->private;
+	struct fcloop_ini_fcpreq *inireq = fcpreq->private;
+	struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+
+	if (!tfcp_req)
+		/* abort has already been called */
+		return;
+
+	if (rport->targetport)
+		nvmet_fc_rcv_fcp_abort(rport->targetport,
+					&tfcp_req->tgt_fcp_req);
+
+	/* break initiator/target relationship for io */
+	spin_lock(&tfcp_req->reqlock);
+	inireq->tfcp_req = NULL;
+	tfcp_req->fcpreq = NULL;
+	spin_unlock(&tfcp_req->reqlock);
+
+	/* post the aborted io completion */
+	fcpreq->status = -ECANCELED;
+	schedule_work(&inireq->iniwork);
 }
 
 static void
@@ -583,6 +690,7 @@ struct nvmet_fc_target_template tgttemplate = {
 	.targetport_delete	= fcloop_targetport_delete,
 	.xmt_ls_rsp		= fcloop_xmt_ls_rsp,
 	.fcp_op			= fcloop_fcp_op,
+	.fcp_abort		= fcloop_tgt_fcp_abort,
 	.fcp_req_release	= fcloop_fcp_req_release,
 	.max_hw_queues		= FCLOOP_HW_QUEUES,
 	.max_sgl_segments	= FCLOOP_SGL_SEGS,
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c
index 1846c7eb086a..d488c3318d4b 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -542,27 +542,6 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
 	}
 #endif
 
-	if (rsp->op == NVMET_FCOP_ABORT) {
-		lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
-				"6103 Abort op: oxri x%x %d cnt %d\n",
-				ctxp->oxid, ctxp->state, ctxp->entry_cnt);
-
-		lpfc_nvmeio_data(phba, "NVMET FCP ABRT: "
-				 "xri x%x state x%x cnt x%x\n",
-				 ctxp->oxid, ctxp->state, ctxp->entry_cnt);
-
-		atomic_inc(&lpfc_nvmep->xmt_fcp_abort);
-		ctxp->entry_cnt++;
-		ctxp->flag |= LPFC_NVMET_ABORT_OP;
-		if (ctxp->flag & LPFC_NVMET_IO_INP)
-			lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid,
-						       ctxp->oxid);
-		else
-			lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid,
-							 ctxp->oxid);
-		return 0;
-	}
-
 	/* Sanity check */
 	if (ctxp->state == LPFC_NVMET_STE_ABORT) {
 		atomic_inc(&lpfc_nvmep->xmt_fcp_drop);
@@ -632,6 +611,33 @@ lpfc_nvmet_targetport_delete(struct nvmet_fc_target_port *targetport)
 	complete(&tport->tport_unreg_done);
 }
 
+static void
+lpfc_nvmet_xmt_fcp_abort(struct nvmet_fc_target_port *tgtport,
+			 struct nvmefc_tgt_fcp_req *req)
+{
+	struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private;
+	struct lpfc_nvmet_rcv_ctx *ctxp =
+		container_of(req, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);
+	struct lpfc_hba *phba = ctxp->phba;
+
+	lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+			"6103 Abort op: oxri x%x %d cnt %d\n",
+			ctxp->oxid, ctxp->state, ctxp->entry_cnt);
+
+	lpfc_nvmeio_data(phba, "NVMET FCP ABRT: xri x%x state x%x cnt x%x\n",
+			 ctxp->oxid, ctxp->state, ctxp->entry_cnt);
+
+	atomic_inc(&lpfc_nvmep->xmt_fcp_abort);
+	ctxp->entry_cnt++;
+	ctxp->flag |= LPFC_NVMET_ABORT_OP;
+	if (ctxp->flag & LPFC_NVMET_IO_INP)
+		lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid,
+					       ctxp->oxid);
+	else
+		lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid,
+						 ctxp->oxid);
+}
+
 static void
 lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport,
 			   struct nvmefc_tgt_fcp_req *rsp)
@@ -672,6 +678,7 @@ static struct nvmet_fc_target_template lpfc_tgttemplate = {
 	.targetport_delete = lpfc_nvmet_targetport_delete,
 	.xmt_ls_rsp     = lpfc_nvmet_xmt_ls_rsp,
 	.fcp_op         = lpfc_nvmet_xmt_fcp_op,
+	.fcp_abort      = lpfc_nvmet_xmt_fcp_abort,
 	.fcp_req_release = lpfc_nvmet_xmt_fcp_release,
 
 	.max_hw_queues  = 1,
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index d98ddb2feabc..0db37158a61d 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -533,9 +533,6 @@ enum {
 					 * rsp as well
 					 */
 	NVMET_FCOP_RSP		= 4,	/* send rsp frame */
-	NVMET_FCOP_ABORT	= 5,	/* abort exchange via ABTS */
-	NVMET_FCOP_BA_ACC	= 6,	/* send BA_ACC */
-	NVMET_FCOP_BA_RJT	= 7,	/* send BA_RJT */
 };
 
 /**
@@ -572,8 +569,6 @@ enum {
  *     upon compeletion of the operation.  The nvmet-fc layer will also set a
  *     private pointer for its own use in the done routine.
  *
- * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !!
- *
  * Values set by the NVMET-FC layer prior to calling the LLDD fcp_op
  * entrypoint.
  * @op:       Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx)
@@ -784,10 +779,6 @@ struct nvmet_fc_target_port {
  *           or upon success/failure of FCP_CONF if it is supported, the
  *           LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
  *           consider the operation complete.
- *         NVMET_FCOP_ABORT: the LLDD is to terminate the exchange
- *           corresponding to the fcp operation. The LLDD shall send
- *           ABTS and follow FC exchange abort-multi rules, including
- *           ABTS retries and possible logout.
  *       Upon completing the indicated operation, the LLDD is to set the
  *       status fields for the operation (tranferred_length and fcp_error
  *       status) in the request, then call the "done" routine
@@ -808,6 +799,17 @@ struct nvmet_fc_target_port {
  *       Returns 0 on success, -<errno> on failure (Ex: -EIO)
  *       Entrypoint is Mandatory.
  *
+ * @fcp_abort:  Called by the transport to abort an active command.
+ *       The command may be in-between operations (nothing active in LLDD)
+ *       or may have an active WRITEDATA operation pending. The LLDD is to
+ *       initiate the ABTS process for the command and return from the
+ *       callback. The ABTS does not need to be complete on the command.
+ *       The fcp_abort callback inherently cannot fail. After the
+ *       fcp_abort() callback completes, the transport will wait for any
+ *       outstanding operation (if there was one) to complete, then will
+ *       call the fcp_req_release() callback to return the command's
+ *       exchange context back to the LLDD.
+ *
  * @fcp_req_release:  Called by the transport to return a nvmefc_tgt_fcp_req
  *       to the LLDD after all operations on the fcp operation are complete.
  *       This may be due to the command completing or upon completion of
@@ -848,6 +850,8 @@ struct nvmet_fc_target_template {
 				struct nvmefc_tgt_ls_req *tls_req);
 	int (*fcp_op)(struct nvmet_fc_target_port *tgtport,
 				struct nvmefc_tgt_fcp_req *fcpreq);
+	void (*fcp_abort)(struct nvmet_fc_target_port *tgtport,
+				struct nvmefc_tgt_fcp_req *fcpreq);
 	void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport,
 				struct nvmefc_tgt_fcp_req *fcpreq);
 
@@ -877,4 +881,7 @@ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport,
 			struct nvmefc_tgt_fcp_req *fcpreq,
 			void *cmdiubuf, u32 cmdiubuf_len);
 
+void nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *tgtport,
+			struct nvmefc_tgt_fcp_req *fcpreq);
+
 #endif /* _NVME_FC_DRIVER_H */
-- 
cgit v1.3-6-gb490


From 19b7ccf8651df09d274671b53039c672a52ad84d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 18 Apr 2017 18:43:20 +0200
Subject: block: get rid of blk_integrity_revalidate()

Commit 25520d55cdb6 ("block: Inline blk_integrity in struct gendisk")
introduced blk_integrity_revalidate(), which seems to assume ownership
of the stable pages flag and unilaterally clears it if no blk_integrity
profile is registered:

    if (bi->profile)
            disk->queue->backing_dev_info->capabilities |=
                    BDI_CAP_STABLE_WRITES;
    else
            disk->queue->backing_dev_info->capabilities &=
                    ~BDI_CAP_STABLE_WRITES;

It's called from revalidate_disk() and rescan_partitions(), making it
impossible to enable stable pages for drivers that support partitions
and don't use blk_integrity: while the call in revalidate_disk() can be
trivially worked around (see zram, which doesn't support partitions and
hence gets away with zram_revalidate_disk()), rescan_partitions() can
be triggered from userspace at any time.  This breaks rbd, where the
ceph messenger is responsible for generating/verifying CRCs.

Since blk_integrity_{un,}register() "must" be used for (un)registering
the integrity profile with the block layer, move BDI_CAP_STABLE_WRITES
setting there.  This way drivers that call blk_integrity_register() and
use integrity infrastructure won't interfere with drivers that don't
but still want stable pages.

Fixes: 25520d55cdb6 ("block: Inline blk_integrity in struct gendisk")
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 4.4+, needs backporting
Tested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-integrity.c     | 19 ++-----------------
 block/partition-generic.c |  1 -
 fs/block_dev.c            |  1 -
 include/linux/genhd.h     |  2 --
 4 files changed, 2 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index b3622cb00fc2..ce43a8214d3e 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -417,7 +417,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
 	bi->tuple_size = template->tuple_size;
 	bi->tag_size = template->tag_size;
 
-	blk_integrity_revalidate(disk);
+	disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
 }
 EXPORT_SYMBOL(blk_integrity_register);
 
@@ -430,26 +430,11 @@ EXPORT_SYMBOL(blk_integrity_register);
  */
 void blk_integrity_unregister(struct gendisk *disk)
 {
-	blk_integrity_revalidate(disk);
+	disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
 	memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
 }
 EXPORT_SYMBOL(blk_integrity_unregister);
 
-void blk_integrity_revalidate(struct gendisk *disk)
-{
-	struct blk_integrity *bi = &disk->queue->integrity;
-
-	if (!(disk->flags & GENHD_FL_UP))
-		return;
-
-	if (bi->profile)
-		disk->queue->backing_dev_info->capabilities |=
-			BDI_CAP_STABLE_WRITES;
-	else
-		disk->queue->backing_dev_info->capabilities &=
-			~BDI_CAP_STABLE_WRITES;
-}
-
 void blk_integrity_add(struct gendisk *disk)
 {
 	if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 7afb9907821f..0171a2faad68 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -497,7 +497,6 @@ rescan:
 
 	if (disk->fops->revalidate_disk)
 		disk->fops->revalidate_disk(disk);
-	blk_integrity_revalidate(disk);
 	check_disk_size_change(disk, bdev);
 	bdev->bd_invalidated = 0;
 	if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index e405d8e58e31..9ccabe3bb7de 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1453,7 +1453,6 @@ int revalidate_disk(struct gendisk *disk)
 
 	if (disk->fops->revalidate_disk)
 		ret = disk->fops->revalidate_disk(disk);
-	blk_integrity_revalidate(disk);
 	bdev = bdget_disk(disk, 0);
 	if (!bdev)
 		return ret;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 9e11082c7f9b..acff9437e5c3 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -722,11 +722,9 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 extern void blk_integrity_add(struct gendisk *);
 extern void blk_integrity_del(struct gendisk *);
-extern void blk_integrity_revalidate(struct gendisk *);
 #else	/* CONFIG_BLK_DEV_INTEGRITY */
 static inline void blk_integrity_add(struct gendisk *disk) { }
 static inline void blk_integrity_del(struct gendisk *disk) { }
-static inline void blk_integrity_revalidate(struct gendisk *disk) { }
 #endif	/* CONFIG_BLK_DEV_INTEGRITY */
 
 #else /* CONFIG_BLOCK */
-- 
cgit v1.3-6-gb490


From 701cac61d0250912b89cbc28589969530179099a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Apr 2017 19:15:53 -0400
Subject: CONFIG_ARCH_HAS_RAW_COPY_USER is unconditional now

all architectures converted

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/Kconfig                  |  3 ---
 arch/alpha/Kconfig            |  1 -
 arch/arc/Kconfig              |  1 -
 arch/arm/Kconfig              |  1 -
 arch/arm64/Kconfig            |  1 -
 arch/avr32/Kconfig            |  1 -
 arch/blackfin/Kconfig         |  1 -
 arch/c6x/Kconfig              |  1 -
 arch/cris/Kconfig             |  1 -
 arch/frv/Kconfig              |  1 -
 arch/h8300/Kconfig            |  1 -
 arch/hexagon/Kconfig          |  1 -
 arch/ia64/Kconfig             |  1 -
 arch/m32r/Kconfig             |  1 -
 arch/m68k/Kconfig             |  1 -
 arch/metag/Kconfig            |  1 -
 arch/microblaze/Kconfig       |  1 -
 arch/mips/Kconfig             |  1 -
 arch/mn10300/Kconfig          |  1 -
 arch/nios2/Kconfig            |  1 -
 arch/openrisc/Kconfig         |  1 -
 arch/parisc/Kconfig           |  1 -
 arch/powerpc/Kconfig          |  1 -
 arch/s390/Kconfig             |  1 -
 arch/score/Kconfig            |  1 -
 arch/sh/Kconfig               |  1 -
 arch/sparc/Kconfig            |  1 -
 arch/tile/Kconfig             |  1 -
 arch/um/Kconfig.common        |  1 -
 arch/unicore32/Kconfig        |  1 -
 arch/x86/Kconfig              |  1 -
 arch/xtensa/Kconfig           |  1 -
 include/asm-generic/uaccess.h | 41 -----------------------------------------
 include/linux/uaccess.h       |  7 ++-----
 lib/Makefile                  |  4 +---
 35 files changed, 3 insertions(+), 83 deletions(-)

(limited to 'include/linux')

diff --git a/arch/Kconfig b/arch/Kconfig
index 315d37626ddc..cd211a14a88f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -847,7 +847,4 @@ config STRICT_MODULE_RWX
 config ARCH_WANT_RELAX_ORDER
 	bool
 
-config ARCH_HAS_RAW_COPY_USER
-	bool
-
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 1be5f61dc630..0e49d39ea74a 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -26,7 +26,6 @@ config ALPHA
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
-	select ARCH_HAS_RAW_COPY_USER
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 7e213ff4f01f..c9f30f4763ab 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -44,7 +44,6 @@ config ARC
 	select HAVE_GENERIC_DMA_COHERENT
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZMA
-	select ARCH_HAS_RAW_COPY_USER
 
 config MIGHT_HAVE_PCI
 	bool
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 6fab7f34739c..0d4e71b42c77 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -96,7 +96,6 @@ config ARM
 	select PERF_USE_VMALLOC
 	select RTC_LIB
 	select SYS_SUPPORTS_APM_EMULATION
-	select ARCH_HAS_RAW_COPY_USER
 	# Above selects are sorted alphabetically; please add new ones
 	# according to that.  Thanks.
 	help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3c833ff3303c..3741859765cf 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -115,7 +115,6 @@ config ARM64
 	select SPARSE_IRQ
 	select SYSCTL_EXCEPTION_TRACE
 	select THREAD_INFO_IN_TASK
-	select ARCH_HAS_RAW_COPY_USER
 	help
 	  ARM 64-bit (AArch64) Linux support.
 
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 8c349f2a9ebb..7e75d45e20cd 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -19,7 +19,6 @@ config AVR32
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
 	select HAVE_NMI
-	select ARCH_HAS_RAW_COPY_USER
 	help
 	  AVR32 is a high-performance 32-bit RISC microprocessor core,
 	  designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 919dad1436f7..3c1bd640042a 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -41,7 +41,6 @@ config BLACKFIN
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_NMI
-	select ARCH_HAS_RAW_COPY_USER
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 3c7bd9a29f90..5aa8ea8bad2d 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -18,7 +18,6 @@ config C6X
 	select GENERIC_CLOCKEVENTS
 	select MODULES_USE_ELF_RELA
 	select ARCH_NO_COHERENT_DMA_MMAP
-	select ARCH_HAS_RAW_COPY_USER
 
 config MMU
 	def_bool n
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 36f94c45e3f9..71b758dc3a96 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -71,7 +71,6 @@ config CRIS
 	select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
 	select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32
 	select HAVE_NMI
-	select ARCH_HAS_RAW_COPY_USER
 
 config HZ
 	int
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index e489fef111dd..eefd9a4ed156 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -16,7 +16,6 @@ config FRV
 	select OLD_SIGACTION
 	select HAVE_DEBUG_STACKOVERFLOW
 	select ARCH_NO_COHERENT_DMA_MMAP
-	select ARCH_HAS_RAW_COPY_USER
 
 config ZONE_DMA
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 473883417004..3ae852507e57 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -22,7 +22,6 @@ config H8300
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_HASH
 	select CPU_NO_EFFICIENT_FFS
-	select ARCH_HAS_RAW_COPY_USER
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 0c536a83ea71..1941e4baaee6 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -26,7 +26,6 @@ config HEXAGON
 	select GENERIC_CLOCKEVENTS_BROADCAST
 	select MODULES_USE_ELF_RELA
 	select GENERIC_CPU_DEVICES
-	select ARCH_HAS_RAW_COPY_USER
 	---help---
 	  Qualcomm Hexagon is a processor architecture designed for high
 	  performance and low power across a wide variety of applications.
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 0ed0e44856b2..18ca6a9ce566 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -53,7 +53,6 @@ config IA64
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_HARDENED_USERCOPY
-	select ARCH_HAS_RAW_COPY_USER
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index b3e82bdd6db0..95474460b367 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -19,7 +19,6 @@ config M32R
 	select HAVE_DEBUG_STACKOVERFLOW
 	select CPU_NO_EFFICIENT_FFS
 	select DMA_NOOP_OPS
-	select ARCH_HAS_RAW_COPY_USER
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 7d345758ea16..d140206d5d29 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -22,7 +22,6 @@ config M68K
 	select MODULES_USE_ELF_RELA
 	select OLD_SIGSUSPEND3
 	select OLD_SIGACTION
-	select ARCH_HAS_RAW_COPY_USER
 
 config RWSEM_GENERIC_SPINLOCK
 	bool
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index ecce0c5ec8e8..5b7a45d99cfb 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -1,6 +1,5 @@
 config METAG
 	def_bool y
-	select ARCH_HAS_RAW_COPY_USER
 	select EMBEDDED
 	select GENERIC_ATOMIC64
 	select GENERIC_CLOCKEVENTS
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 1aff3658a104..85885a501dce 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -34,7 +34,6 @@ config MICROBLAZE
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
 	select CPU_NO_EFFICIENT_FFS
-	select ARCH_HAS_RAW_COPY_USER
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index aff5633bfe4d..a008a9f03072 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -69,7 +69,6 @@ config MIPS
 	select HAVE_EXIT_THREAD
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_ARCH_HARDENED_USERCOPY
-	select ARCH_HAS_RAW_COPY_USER
 
 menu "Machine selection"
 
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index a96f3dcb0119..38e3494bfb63 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -16,7 +16,6 @@ config MN10300
 	select OLD_SIGACTION
 	select HAVE_DEBUG_STACKOVERFLOW
 	select ARCH_NO_COHERENT_DMA_MMAP
-	select ARCH_HAS_RAW_COPY_USER
 
 config AM33_2
 	def_bool n
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 45b4727e3136..51a56c8b04b4 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,7 +16,6 @@ config NIOS2
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
 	select CPU_NO_EFFICIENT_FFS
-	select ARCH_HAS_RAW_COPY_USER
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 38954181fa96..1e95920b0737 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -28,7 +28,6 @@ config OPENRISC
 	select OR1K_PIC
 	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 	select NO_BOOTMEM
-	select ARCH_HAS_RAW_COPY_USER
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 15b7b279a169..ad294b3fb90b 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -41,7 +41,6 @@ config PARISC
 	select GENERIC_CLOCKEVENTS
 	select ARCH_NO_COHERENT_DMA_MMAP
 	select CPU_NO_EFFICIENT_FFS
-	select ARCH_HAS_RAW_COPY_USER
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 839f08887269..97a8bc8a095c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -87,7 +87,6 @@ config PPC
 	select ARCH_HAS_DMA_SET_COHERENT_MASK
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
-	select ARCH_HAS_RAW_COPY_USER
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0724dbf2d8f8..a2dcef0aacc7 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -178,7 +178,6 @@ config S390
 	select ARCH_HAS_SCALED_CPUTIME
 	select VIRT_TO_BUS
 	select HAVE_NMI
-	select ARCH_HAS_RAW_COPY_USER
 
 
 config SCHED_OMIT_FRAME_POINTER
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 4d241cfedc3b..507d63181389 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -15,7 +15,6 @@ config SCORE
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
 	select CPU_NO_EFFICIENT_FFS
-	select ARCH_HAS_RAW_COPY_USER
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index c689645eb8b7..ee086958b2b2 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -48,7 +48,6 @@ config SUPERH
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select HAVE_NMI
-	select ARCH_HAS_RAW_COPY_USER
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 68353048f24e..68ac5c7cd982 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -45,7 +45,6 @@ config SPARC
 	select HAVE_ARCH_HARDENED_USERCOPY
 	select PROVE_LOCKING_SMALL if PROVE_LOCKING
 	select ARCH_WANT_RELAX_ORDER
-	select ARCH_HAS_RAW_COPY_USER
 
 config SPARC32
 	def_bool !64BIT
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 27808d86eb57..4583c0320059 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -33,7 +33,6 @@ config TILE
 	select USER_STACKTRACE_SUPPORT
 	select USE_PMC if PERF_EVENTS
 	select VIRT_TO_BUS
-	select ARCH_HAS_RAW_COPY_USER
 
 config MMU
 	def_bool y
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common
index e13f763849b1..fd443852103c 100644
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -13,7 +13,6 @@ config UML
 	select GENERIC_CLOCKEVENTS
 	select HAVE_GCC_PLUGINS
 	select TTY # Needed for line.c
-	select ARCH_HAS_RAW_COPY_USER
 
 config MMU
 	bool
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
index 319519eaed24..0769066929c6 100644
--- a/arch/unicore32/Kconfig
+++ b/arch/unicore32/Kconfig
@@ -18,7 +18,6 @@ config UNICORE32
 	select ARCH_WANT_FRAME_POINTERS
 	select GENERIC_IOMAP
 	select MODULES_USE_ELF_REL
-	select ARCH_HAS_RAW_COPY_USER
 	help
 	  UniCore-32 is 32-bit Instruction Set Architecture,
 	  including a series of low-power-consumption RISC chip
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5f59fc388063..cc98d5a294ee 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -175,7 +175,6 @@ config X86
 	select USER_STACKTRACE_SUPPORT
 	select VIRT_TO_BUS
 	select X86_FEATURE_NAMES		if PROC_FS
-	select ARCH_HAS_RAW_COPY_USER
 
 config INSTRUCTION_DECODER
 	def_bool y
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 043d37d45919..f4126cf997a4 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -29,7 +29,6 @@ config XTENSA
 	select NO_BOOTMEM
 	select PERF_USE_VMALLOC
 	select VIRT_TO_BUS
-	select ARCH_HAS_RAW_COPY_USER
 	help
 	  Xtensa processors are 32-bit RISC machines designed by Tensilica
 	  primarily for embedded systems.  These processors are both
diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index d65c311eb128..bbe4bb438e39 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -86,11 +86,7 @@ static inline int __access_ok(unsigned long addr, unsigned long size)
 
 static inline int __put_user_fn(size_t size, void __user *ptr, void *x)
 {
-#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER
 	return unlikely(raw_copy_to_user(ptr, x, size)) ? -EFAULT : 0;
-#else
-	return unlikely(__copy_to_user(ptr, x, size)) ? -EFAULT : 0;
-#endif
 }
 
 #define __put_user_fn(sz, u, k)	__put_user_fn(sz, u, k)
@@ -151,11 +147,7 @@ extern int __put_user_bad(void) __attribute__((noreturn));
 #ifndef __get_user_fn
 static inline int __get_user_fn(size_t size, const void __user *ptr, void *x)
 {
-#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER
 	return unlikely(raw_copy_from_user(x, ptr, size)) ? -EFAULT : 0;
-#else
-	return unlikely(__copy_from_user(x, ptr, size)) ? -EFAULT : 0;
-#endif
 }
 
 #define __get_user_fn(sz, u, k)	__get_user_fn(sz, u, k)
@@ -164,39 +156,6 @@ static inline int __get_user_fn(size_t size, const void __user *ptr, void *x)
 
 extern int __get_user_bad(void) __attribute__((noreturn));
 
-#ifndef CONFIG_ARCH_HAS_RAW_COPY_USER
-
-#ifndef __copy_from_user_inatomic
-#define __copy_from_user_inatomic __copy_from_user
-#endif
-
-#ifndef __copy_to_user_inatomic
-#define __copy_to_user_inatomic __copy_to_user
-#endif
-
-static inline long copy_from_user(void *to,
-		const void __user * from, unsigned long n)
-{
-	unsigned long res = n;
-	might_fault();
-	if (likely(access_ok(VERIFY_READ, from, n)))
-		res = __copy_from_user(to, from, n);
-	if (unlikely(res))
-		memset(to + (n - res), 0, res);
-	return res;
-}
-
-static inline long copy_to_user(void __user *to,
-		const void *from, unsigned long n)
-{
-	might_fault();
-	if (access_ok(VERIFY_WRITE, to, n))
-		return __copy_to_user(to, from, n);
-	else
-		return n;
-}
-#endif
-
 /*
  * Copy a null terminated string from userspace.
  */
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index 7fc2104b88bc..e0cbfb09e60f 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -12,12 +12,10 @@
 
 #include <asm/uaccess.h>
 
-#ifdef CONFIG_ARCH_HAS_RAW_COPY_USER
 /*
  * Architectures should provide two primitives (raw_copy_{to,from}_user())
- * select ARCH_HAS_RAW_COPY_FROM_USER and get rid of their private instances
- * of copy_{to,from}_user() and __copy_{to,from}_user{,_inatomic}().  Once
- * all of them switch, this part of linux/uaccess.h will become unconditional.
+ * and get rid of their private instances of copy_{to,from}_user() and
+ * __copy_{to,from}_user{,_inatomic}().
  *
  * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
  * return the amount left to copy.  They should assume that access_ok() has
@@ -196,7 +194,6 @@ copy_in_user(void __user *to, const void *from, unsigned long n)
 	return n;
 }
 #endif
-#endif
 
 static __always_inline void pagefault_disabled_inc(void)
 {
diff --git a/lib/Makefile b/lib/Makefile
index 7d875c389172..b47cf97e1e68 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -41,7 +41,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
 	 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
-	 once.o refcount.o
+	 once.o refcount.o usercopy.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
@@ -242,5 +242,3 @@ UBSAN_SANITIZE_ubsan.o := n
 obj-$(CONFIG_SBITMAP) += sbitmap.o
 
 obj-$(CONFIG_PARMAN) += parman.o
-
-obj-$(CONFIG_ARCH_HAS_RAW_COPY_USER) += usercopy.o
-- 
cgit v1.3-6-gb490


From 2836ee4b1acbe7b396219d0677426885f14cd792 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@sandisk.com>
Date: Wed, 26 Apr 2017 13:47:56 -0700
Subject: blk-mq: Add blk_mq_ops.show_rq()

This new callback function will be used in the next patch to show
more information about SCSI requests.

Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-debugfs.c | 6 +++++-
 include/linux/blk-mq.h | 8 ++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ac39093c4ef7..bcd2a7d4a3a5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -311,6 +311,7 @@ static const char *const rqf_name[] = {
 static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
 {
 	struct request *rq = list_entry_rq(v);
+	const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
 	const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
 
 	seq_printf(m, "%p {.op=", rq);
@@ -324,8 +325,11 @@ static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
 	seq_puts(m, ", .rq_flags=");
 	blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
 		       ARRAY_SIZE(rqf_name));
-	seq_printf(m, ", .tag=%d, .internal_tag=%d}\n", rq->tag,
+	seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
 		   rq->internal_tag);
+	if (mq_ops->show_rq)
+		mq_ops->show_rq(m, rq);
+	seq_puts(m, "}\n");
 	return 0;
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0c4dadb85f62..32bd8eb5ba67 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -142,6 +142,14 @@ struct blk_mq_ops {
 	reinit_request_fn	*reinit_request;
 
 	map_queues_fn		*map_queues;
+
+#ifdef CONFIG_BLK_DEBUG_FS
+	/*
+	 * Used by the debugfs implementation to show driver-specific
+	 * information about a request.
+	 */
+	void (*show_rq)(struct seq_file *m, struct request *rq);
+#endif
 };
 
 enum {
-- 
cgit v1.3-6-gb490


From 9f993737906b30d7b2454a38637d1f70ffd60f2f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 10 Apr 2017 09:54:54 -0600
Subject: blk-mq: unify hctx delayed_run_work and run_work

They serve the exact same purpose. Get rid of the non-delayed
work variant, and just run it without delay for the normal case.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  2 +-
 block/blk-mq.c         | 27 ++++++---------------------
 include/linux/blk-mq.h |  3 +--
 3 files changed, 8 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 6bd4d1754d29..37939672d4df 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -269,7 +269,7 @@ void blk_sync_queue(struct request_queue *q)
 		int i;
 
 		queue_for_each_hw_ctx(q, hctx, i) {
-			cancel_work_sync(&hctx->run_work);
+			cancel_delayed_work_sync(&hctx->run_work);
 			cancel_delayed_work_sync(&hctx->delay_work);
 		}
 	} else {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e6aad49c1686..5c68fce87ffc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1166,13 +1166,9 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
 		put_cpu();
 	}
 
-	if (msecs == 0)
-		kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx),
-					 &hctx->run_work);
-	else
-		kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-						 &hctx->delayed_run_work,
-						 msecs_to_jiffies(msecs));
+	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+					 &hctx->run_work,
+					 msecs_to_jiffies(msecs));
 }
 
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1224,7 +1220,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped);
 
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
-	cancel_work(&hctx->run_work);
+	cancel_delayed_work_sync(&hctx->run_work);
 	cancel_delayed_work(&hctx->delay_work);
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
@@ -1282,17 +1278,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 {
 	struct blk_mq_hw_ctx *hctx;
 
-	hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
-
-	__blk_mq_run_hw_queue(hctx);
-}
-
-static void blk_mq_delayed_run_work_fn(struct work_struct *work)
-{
-	struct blk_mq_hw_ctx *hctx;
-
-	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work);
-
+	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
 	__blk_mq_run_hw_queue(hctx);
 }
 
@@ -1898,8 +1884,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
 	if (node == NUMA_NO_NODE)
 		node = hctx->numa_node = set->numa_node;
 
-	INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
-	INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn);
+	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
 	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 32bd8eb5ba67..c7cc90328426 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -15,7 +15,7 @@ struct blk_mq_hw_ctx {
 		unsigned long		state;		/* BLK_MQ_S_* flags */
 	} ____cacheline_aligned_in_smp;
 
-	struct work_struct	run_work;
+	struct delayed_work	run_work;
 	cpumask_var_t		cpumask;
 	int			next_cpu;
 	int			next_cpu_batch;
@@ -51,7 +51,6 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
-	struct delayed_work	delayed_run_work;
 	struct delayed_work	delay_work;
 
 	struct hlist_node	cpuhp_dead;
-- 
cgit v1.3-6-gb490


From 818cd1cbaa7b00bbc35452a76bebc681a65f1912 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 10 Apr 2017 09:54:55 -0600
Subject: block: add kblock_mod_delayed_work_on()

This modifies (or adds, if not currently pending) an existing
delayed work item.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       | 7 +++++++
 include/linux/blkdev.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 37939672d4df..64b6e58532bf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3076,6 +3076,13 @@ int kblockd_schedule_work_on(int cpu, struct work_struct *work)
 }
 EXPORT_SYMBOL(kblockd_schedule_work_on);
 
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
+				unsigned long delay)
+{
+	return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
+
 int kblockd_schedule_delayed_work(struct delayed_work *dwork,
 				  unsigned long delay)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6c247861cb66..d098c66b3ab0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1685,6 +1685,7 @@ int kblockd_schedule_work(struct work_struct *work);
 int kblockd_schedule_work_on(int cpu, struct work_struct *work);
 int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
 int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
 #ifdef CONFIG_BLK_CGROUP
 /*
-- 
cgit v1.3-6-gb490


From 21c6e939a9f6bb06fe616a87defec0f92a7c3df0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Mon, 10 Apr 2017 09:54:56 -0600
Subject: blk-mq: unify hctx delay_work and run_work

The only difference between ->run_work and ->delay_work, is that
the latter is used to defer running a queue. This is done by
marking the queue stopped, and scheduling ->delay_work to run
sometime in the future. While the queue is stopped, direct runs
or runs through ->run_work will not run the queue.

If we combine the handlers, then we need to handle two things:

1) If a delayed/stopped run is scheduled, then we should not run
   the queue before that has been completed.
2) If a queue is delayed/stopped, the handler needs to restart
   the queue. Normally a run of a queue with the stopped bit set
   would be a no-op.

Case 1 is handled by modifying a currently pending queue run
to the deadline set by the caller of blk_mq_delay_queue().
Subsequent attempts to queue a queue run will find the work
item already pending, and direct runs will see a stopped queue
as before.

Case 2 is handled by adding a new bit, BLK_MQ_S_START_ON_RUN,
that tells the work handler that it should clear a stopped
queue and run the handler.

Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c       |  4 +---
 block/blk-mq.c         | 34 ++++++++++++++++++++++------------
 include/linux/blk-mq.h |  3 +--
 3 files changed, 24 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-core.c b/block/blk-core.c
index 64b6e58532bf..24886b69690f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -268,10 +268,8 @@ void blk_sync_queue(struct request_queue *q)
 		struct blk_mq_hw_ctx *hctx;
 		int i;
 
-		queue_for_each_hw_ctx(q, hctx, i) {
+		queue_for_each_hw_ctx(q, hctx, i)
 			cancel_delayed_work_sync(&hctx->run_work);
-			cancel_delayed_work_sync(&hctx->delay_work);
-		}
 	} else {
 		cancel_delayed_work_sync(&q->delay_work);
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5c68fce87ffc..a0bdf63aebfe 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1221,7 +1221,6 @@ EXPORT_SYMBOL(blk_mq_queue_stopped);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	cancel_delayed_work_sync(&hctx->run_work);
-	cancel_delayed_work(&hctx->delay_work);
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -1279,27 +1278,39 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 	struct blk_mq_hw_ctx *hctx;
 
 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
-	__blk_mq_run_hw_queue(hctx);
-}
 
-static void blk_mq_delay_work_fn(struct work_struct *work)
-{
-	struct blk_mq_hw_ctx *hctx;
+	/*
+	 * If we are stopped, don't run the queue. The exception is if
+	 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
+	 * the STOPPED bit and run it.
+	 */
+	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
+		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
+			return;
 
-	hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
+		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+	}
 
-	if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
-		__blk_mq_run_hw_queue(hctx);
+	__blk_mq_run_hw_queue(hctx);
 }
 
+
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 {
 	if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
 		return;
 
+	/*
+	 * Stop the hw queue, then modify currently delayed work.
+	 * This should prevent us from running the queue prematurely.
+	 * Mark the queue as auto-clearing STOPPED when it runs.
+	 */
 	blk_mq_stop_hw_queue(hctx);
-	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-			&hctx->delay_work, msecs_to_jiffies(msecs));
+	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+					&hctx->run_work,
+					msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL(blk_mq_delay_queue);
 
@@ -1885,7 +1896,6 @@ static int blk_mq_init_hctx(struct request_queue *q,
 		node = hctx->numa_node = set->numa_node;
 
 	INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
-	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
 	spin_lock_init(&hctx->lock);
 	INIT_LIST_HEAD(&hctx->dispatch);
 	hctx->queue = q;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c7cc90328426..f3e5e1de1bdb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -51,8 +51,6 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
-	struct delayed_work	delay_work;
-
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
 
@@ -168,6 +166,7 @@ enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 	BLK_MQ_S_TAG_WAITING	= 3,
+	BLK_MQ_S_START_ON_RUN	= 4,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
-- 
cgit v1.3-6-gb490


From 63ccc191649eb0f14a761074291551d0d2f85389 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 25 Apr 2017 14:26:52 +0200
Subject: libata: remove SCT WRITE SAME support

This was already disabled a while ago because it caused I/O errors,
and it's severly getting into the way of the discard / write zeroes
rework.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-scsi.c | 132 ++++++++++------------------------------------
 include/linux/ata.h       |   5 --
 2 files changed, 29 insertions(+), 108 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 50f56d066936..49ba9834c715 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -3392,46 +3392,6 @@ static size_t ata_format_dsm_trim_descr(struct scsi_cmnd *cmd, u32 trmax,
 	return r;
 }
 
-/**
- * ata_format_dsm_trim_descr() - SATL Write Same to ATA SCT Write Same
- * @cmd: SCSI command being translated
- * @lba: Starting sector
- * @num: Number of sectors to be zero'd.
- *
- * Rewrite the WRITE SAME payload to be an SCT Write Same formatted
- * descriptor.
- * NOTE: Writes a pattern (0's) in the foreground.
- *
- * Return: Number of bytes copied into sglist.
- */
-static size_t ata_format_sct_write_same(struct scsi_cmnd *cmd, u64 lba, u64 num)
-{
-	struct scsi_device *sdp = cmd->device;
-	size_t len = sdp->sector_size;
-	size_t r;
-	u16 *buf;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ata_scsi_rbuf_lock, flags);
-	buf = ((void *)ata_scsi_rbuf);
-
-	put_unaligned_le16(0x0002,  &buf[0]); /* SCT_ACT_WRITE_SAME */
-	put_unaligned_le16(0x0101,  &buf[1]); /* WRITE PTRN FG */
-	put_unaligned_le64(lba,     &buf[2]);
-	put_unaligned_le64(num,     &buf[6]);
-	put_unaligned_le32(0u,      &buf[10]); /* pattern */
-
-	WARN_ON(len > ATA_SCSI_RBUF_SIZE);
-
-	if (len > ATA_SCSI_RBUF_SIZE)
-		len = ATA_SCSI_RBUF_SIZE;
-
-	r = sg_copy_from_buffer(scsi_sglist(cmd), scsi_sg_count(cmd), buf, len);
-	spin_unlock_irqrestore(&ata_scsi_rbuf_lock, flags);
-
-	return r;
-}
-
 /**
  * ata_scsi_write_same_xlat() - SATL Write Same to ATA SCT Write Same
  * @qc: Command to be translated
@@ -3476,26 +3436,17 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
 	}
 	scsi_16_lba_len(cdb, &block, &n_block);
 
-	if (unmap) {
-		/* If trim is not enabled the cmd is invalid. */
-		if ((dev->horkage & ATA_HORKAGE_NOTRIM) ||
-		    !ata_id_has_trim(dev->id)) {
-			fp = 1;
-			bp = 3;
-			goto invalid_fld;
-		}
-		/* If the request is too large the cmd is invalid */
-		if (n_block > 0xffff * trmax) {
-			fp = 2;
-			goto invalid_fld;
-		}
-	} else {
-		/* If write same is not available the cmd is invalid */
-		if (!ata_id_sct_write_same(dev->id)) {
-			fp = 1;
-			bp = 3;
-			goto invalid_fld;
-		}
+	if (!unmap ||
+	    (dev->horkage & ATA_HORKAGE_NOTRIM) ||
+	    !ata_id_has_trim(dev->id)) {
+		fp = 1;
+		bp = 3;
+		goto invalid_fld;
+	}
+	/* If the request is too large the cmd is invalid */
+	if (n_block > 0xffff * trmax) {
+		fp = 2;
+		goto invalid_fld;
 	}
 
 	/*
@@ -3510,49 +3461,28 @@ static unsigned int ata_scsi_write_same_xlat(struct ata_queued_cmd *qc)
 	 * For DATA SET MANAGEMENT TRIM in ACS-2 nsect (aka count)
 	 * is defined as number of 512 byte blocks to be transferred.
 	 */
-	if (unmap) {
-		size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block);
-		if (size != len)
-			goto invalid_param_len;
 
-		if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) {
-			/* Newer devices support queued TRIM commands */
-			tf->protocol = ATA_PROT_NCQ;
-			tf->command = ATA_CMD_FPDMA_SEND;
-			tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f;
-			tf->nsect = qc->tag << 3;
-			tf->hob_feature = (size / 512) >> 8;
-			tf->feature = size / 512;
+	size = ata_format_dsm_trim_descr(scmd, trmax, block, n_block);
+	if (size != len)
+		goto invalid_param_len;
 
-			tf->auxiliary = 1;
-		} else {
-			tf->protocol = ATA_PROT_DMA;
-			tf->hob_feature = 0;
-			tf->feature = ATA_DSM_TRIM;
-			tf->hob_nsect = (size / 512) >> 8;
-			tf->nsect = size / 512;
-			tf->command = ATA_CMD_DSM;
-		}
-	} else {
-		size = ata_format_sct_write_same(scmd, block, n_block);
-		if (size != len)
-			goto invalid_param_len;
+	if (ata_ncq_enabled(dev) && ata_fpdma_dsm_supported(dev)) {
+		/* Newer devices support queued TRIM commands */
+		tf->protocol = ATA_PROT_NCQ;
+		tf->command = ATA_CMD_FPDMA_SEND;
+		tf->hob_nsect = ATA_SUBCMD_FPDMA_SEND_DSM & 0x1f;
+		tf->nsect = qc->tag << 3;
+		tf->hob_feature = (size / 512) >> 8;
+		tf->feature = size / 512;
 
-		tf->hob_feature = 0;
-		tf->feature = 0;
-		tf->hob_nsect = 0;
-		tf->nsect = 1;
-		tf->lbah = 0;
-		tf->lbam = 0;
-		tf->lbal = ATA_CMD_STANDBYNOW1;
-		tf->hob_lbah = 0;
-		tf->hob_lbam = 0;
-		tf->hob_lbal = 0;
-		tf->device = ATA_CMD_STANDBYNOW1;
+		tf->auxiliary = 1;
+	} else {
 		tf->protocol = ATA_PROT_DMA;
-		tf->command = ATA_CMD_WRITE_LOG_DMA_EXT;
-		if (unlikely(dev->flags & ATA_DFLAG_PIO))
-			tf->command = ATA_CMD_WRITE_LOG_EXT;
+		tf->hob_feature = 0;
+		tf->feature = ATA_DSM_TRIM;
+		tf->hob_nsect = (size / 512) >> 8;
+		tf->nsect = size / 512;
+		tf->command = ATA_CMD_DSM;
 	}
 
 	tf->flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE | ATA_TFLAG_LBA48 |
@@ -3627,10 +3557,6 @@ static unsigned int ata_scsiop_maint_in(struct ata_scsi_args *args, u8 *rbuf)
 	case START_STOP:
 		supported = 3;
 		break;
-	case WRITE_SAME_16:
-		if (!ata_id_sct_write_same(dev->id))
-			break;
-		/* fallthrough: if SCT ... only enable for ZBC */
 	case ZBC_IN:
 	case ZBC_OUT:
 		if (ata_id_zoned_cap(dev->id) ||
diff --git a/include/linux/ata.h b/include/linux/ata.h
index af6859b3a93d..ad7d9ee89ff0 100644
--- a/include/linux/ata.h
+++ b/include/linux/ata.h
@@ -817,11 +817,6 @@ static inline bool ata_id_sct_error_recovery_ctrl(const u16 *id)
 	return id[ATA_ID_SCT_CMD_XPORT] & (1 << 3) ? true : false;
 }
 
-static inline bool ata_id_sct_write_same(const u16 *id)
-{
-	return id[ATA_ID_SCT_CMD_XPORT] & (1 << 2) ? true : false;
-}
-
 static inline bool ata_id_sct_long_sector_access(const u16 *id)
 {
 	return id[ATA_ID_SCT_CMD_XPORT] & (1 << 1) ? true : false;
-- 
cgit v1.3-6-gb490


From 71389703839ebe9cb426c72d5f0bd549592e583c Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 28 Apr 2017 10:23:37 -0700
Subject: mm, zone_device: Replace {get, put}_zone_device_page() with a single
 reference to fix pmem crash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The x86 conversion to the generic GUP code included a small change which causes
crashes and data corruption in the pmem code - not good.

The root cause is that the /dev/pmem driver code implicitly relies on the x86
get_user_pages() implementation doing a get_page() on the page refcount, because
get_page() does a get_zone_device_page() which properly refcounts pmem's separate
page struct arrays that are not present in the regular page struct structures.
(The pmem driver does this because it can cover huge memory areas.)

But the x86 conversion to the generic GUP code changed the get_page() to
page_cache_get_speculative() which is faster but doesn't do the
get_zone_device_page() call the pmem code relies on.

One way to solve the regression would be to change the generic GUP code to use
get_page(), but that would slow things down a bit and punish other generic-GUP
using architectures for an x86-ism they did not care about. (Arguably the pmem
driver was probably not working reliably for them: but nvdimm is an Intel
feature, so non-x86 exposure is probably still limited.)

So restructure the pmem code's interface with the MM instead: get rid of the
get/put_zone_device_page() distinction, integrate put_zone_device_page() into
__put_page() and and restructure the pmem completion-wait and teardown machinery:

Kirill points out that the calls to {get,put}_dev_pagemap() can be
removed from the mm fast path if we take a single get_dev_pagemap()
reference to signify that the page is alive and use the final put of the
page to drop that reference.

This does require some care to make sure that any waits for the
percpu_ref to drop to zero occur *after* devm_memremap_page_release(),
since it now maintains its own elevated reference.

This speeds up things while also making the pmem refcounting more robust going
forward.

Suggested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/dax/pmem.c    |  2 +-
 drivers/nvdimm/pmem.c | 13 +++++++++++--
 include/linux/mm.h    | 14 --------------
 kernel/memremap.c     | 22 +++++++++-------------
 mm/swap.c             | 10 ++++++++++
 5 files changed, 31 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c
index 033f49b31fdc..cb0d742fa23f 100644
--- a/drivers/dax/pmem.c
+++ b/drivers/dax/pmem.c
@@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data)
 	struct dax_pmem *dax_pmem = to_dax_pmem(ref);
 
 	dev_dbg(dax_pmem->dev, "%s\n", __func__);
+	wait_for_completion(&dax_pmem->cmp);
 	percpu_ref_exit(ref);
 }
 
@@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data)
 
 	dev_dbg(dax_pmem->dev, "%s\n", __func__);
 	percpu_ref_kill(ref);
-	wait_for_completion(&dax_pmem->cmp);
 }
 
 static int dax_pmem_probe(struct device *dev)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 5b536be5a12e..fb7bbc79ac26 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -25,6 +25,7 @@
 #include <linux/badblocks.h>
 #include <linux/memremap.h>
 #include <linux/vmalloc.h>
+#include <linux/blk-mq.h>
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/pmem.h>
@@ -231,6 +232,11 @@ static void pmem_release_queue(void *q)
 	blk_cleanup_queue(q);
 }
 
+static void pmem_freeze_queue(void *q)
+{
+	blk_mq_freeze_queue_start(q);
+}
+
 static void pmem_release_disk(void *disk)
 {
 	del_gendisk(disk);
@@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev,
 	if (!q)
 		return -ENOMEM;
 
+	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+		return -ENOMEM;
+
 	pmem->pfn_flags = PFN_DEV;
 	if (is_nd_pfn(dev)) {
 		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
@@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev,
 				pmem->size, ARCH_MEMREMAP_PMEM);
 
 	/*
-	 * At release time the queue must be dead before
+	 * At release time the queue must be frozen before
 	 * devm_memremap_pages is unwound
 	 */
-	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+	if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
 		return -ENOMEM;
 
 	if (IS_ERR(addr))
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a835edd2db34..695da2a19b4c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -762,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
-void get_zone_device_page(struct page *page);
-void put_zone_device_page(struct page *page);
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return page_zonenum(page) == ZONE_DEVICE;
 }
 #else
-static inline void get_zone_device_page(struct page *page)
-{
-}
-static inline void put_zone_device_page(struct page *page)
-{
-}
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return false;
@@ -790,9 +782,6 @@ static inline void get_page(struct page *page)
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
-
-	if (unlikely(is_zone_device_page(page)))
-		get_zone_device_page(page);
 }
 
 static inline void put_page(struct page *page)
@@ -801,9 +790,6 @@ static inline void put_page(struct page *page)
 
 	if (put_page_testzero(page))
 		__put_page(page);
-
-	if (unlikely(is_zone_device_page(page)))
-		put_zone_device_page(page);
 }
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 07e85e5229da..23a6483c3666 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -182,18 +182,6 @@ struct page_map {
 	struct vmem_altmap altmap;
 };
 
-void get_zone_device_page(struct page *page)
-{
-	percpu_ref_get(page->pgmap->ref);
-}
-EXPORT_SYMBOL(get_zone_device_page);
-
-void put_zone_device_page(struct page *page)
-{
-	put_dev_pagemap(page->pgmap);
-}
-EXPORT_SYMBOL(put_zone_device_page);
-
 static void pgmap_radix_release(struct resource *res)
 {
 	resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
 	struct resource *res = &page_map->res;
 	resource_size_t align_start, align_size;
 	struct dev_pagemap *pgmap = &page_map->pgmap;
+	unsigned long pfn;
+
+	for_each_device_pfn(pfn, page_map)
+		put_page(pfn_to_page(pfn));
 
 	if (percpu_ref_tryget_live(pgmap->ref)) {
 		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
  *
  * Notes:
  * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
- *    (or devm release event).
+ *    (or devm release event). The expected order of events is that @ref has
+ *    been through percpu_ref_kill() before devm_memremap_pages_release(). The
+ *    wait for the completion of all references being dropped and
+ *    percpu_ref_exit() must occur after devm_memremap_pages_release().
  *
  * 2/ @res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		 */
 		list_del(&page->lru);
 		page->pgmap = pgmap;
+		percpu_ref_get(ref);
 	}
 	devres_add(dev, page_map);
 	return __va(res->start);
diff --git a/mm/swap.c b/mm/swap.c
index c4910f14f957..a4e6113276b5 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
 
 void __put_page(struct page *page)
 {
+	if (is_zone_device_page(page)) {
+		put_dev_pagemap(page->pgmap);
+
+		/*
+		 * The page belongs to the device that created pgmap. Do
+		 * not return it to page allocator.
+		 */
+		return;
+	}
+
 	if (unlikely(PageCompound(page)))
 		__put_compound_page(page);
 	else
-- 
cgit v1.3-6-gb490