From 67b886d290052dbf2bcfc876a5ae41a5fe461edf Mon Sep 17 00:00:00 2001 From: "Andrew F. Davis" Date: Thu, 21 Mar 2019 15:09:56 -0500 Subject: dma-buf: Remove leftover [un]map_atomic comments The map_atomic/unmap_atomic callbacks have been removed; remove the related comments. Fixes: f664a5269542 ("dma-buf: remove kmap_atomic interface") Signed-off-by: Andrew F. Davis Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20190321200957.16938-1-afd@ti.com --- include/linux/dma-buf.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 58725f890b5b..e4a8dab2bc54 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -39,11 +39,6 @@ struct dma_buf_attachment; /** * struct dma_buf_ops - operations possible on struct dma_buf - * @map_atomic: [optional] maps a page from the buffer into kernel address - * space, users may not block until the subsequent unmap call. - * This callback must not sleep. - * @unmap_atomic: [optional] unmaps a atomically mapped page from the buffer. - * This Callback must not sleep. * @map: [optional] maps a page from the buffer into kernel address space. * @unmap: [optional] unmaps a page from the buffer. * @vmap: [optional] creates a virtual mapping for the buffer into kernel -- cgit v1.2.3-59-g8ed1b From d5ae7712b7ffbb435e8f3d98f2123eff4734c77f Mon Sep 17 00:00:00 2001 From: "Andrew F. Davis" Date: Thu, 21 Mar 2019 15:09:57 -0500 Subject: dma-buf: Update [un]map documentation to match the other functions Other functions have inline documentation; a couple still have theirs at the top of the structure. Update the docs and move them inline. Signed-off-by: Andrew F. Davis Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20190321200957.16938-2-afd@ti.com --- include/linux/dma-buf.h | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index e4a8dab2bc54..a0bd071466fc 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -39,8 +39,6 @@ struct dma_buf_attachment; /** * struct dma_buf_ops - operations possible on struct dma_buf - * @map: [optional] maps a page from the buffer into kernel address space. - * @unmap: [optional] unmaps a page from the buffer. * @vmap: [optional] creates a virtual mapping for the buffer into kernel * address space. Same restrictions as for vmap and friends apply. * @vunmap: [optional] unmaps a vmap from the buffer @@ -200,8 +198,6 @@ struct dma_buf_ops { * to be restarted. */ int (*end_cpu_access)(struct dma_buf *, enum dma_data_direction); - void *(*map)(struct dma_buf *, unsigned long); - void (*unmap)(struct dma_buf *, unsigned long, void *); /** * @mmap: * @@ -240,6 +236,31 @@ struct dma_buf_ops { */ int (*mmap)(struct dma_buf *, struct vm_area_struct *vma); + /** + * @map: + * + * Maps a page from the buffer into kernel address space. The page is + * specified by offset into the buffer in PAGE_SIZE units. + * + * This callback is optional. + * + * Returns: + * + * Virtual address pointer where requested page can be accessed. NULL + * on error or when this function is unimplemented by the exporter. + */ + void *(*map)(struct dma_buf *, unsigned long); + + /** + * @unmap: + * + * Unmaps a page from the buffer. Page offset and address pointer should + * be the same as the one passed to and returned by matching call to map. + * + * This callback is optional.
+ */ + void (*unmap)(struct dma_buf *, unsigned long, void *); + void *(*vmap)(struct dma_buf *); void (*vunmap)(struct dma_buf *, void *vaddr); }; -- cgit v1.2.3-59-g8ed1b From 09ed79d6d75f06cc963a78f25463251b0a758dc7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 7 May 2019 10:01:47 -0700 Subject: percpu_ref: introduce PERCPU_REF_ALLOW_REINIT flag In most cases percpu reference counters are not switched to the percpu mode after they reach the atomic mode. Some obvious exceptions are reference counters which are initialized into the atomic mode (using PERCPU_REF_INIT_ATOMIC and PERCPU_REF_INIT_DEAD flags), and there are a few other exceptions. But in most cases there is no way back, and once the reference counter is switched to the atomic mode, there is no reason to wait for percpu_ref_exit() to release the percpu memory. Of course, the size of a single counter is not so big, but because it can pin the whole percpu block in memory, the memory footprint can be noticeable (e.g. on my 32-CPU machine a percpu block is 8 MB large). To allow releasing the percpu memory as early as possible, let's introduce the PERCPU_REF_ALLOW_REINIT flag with the following semantics: it has to be set in order to switch a percpu reference counter to the percpu mode after the initialization. The PERCPU_REF_INIT_ATOMIC and PERCPU_REF_INIT_DEAD flags will implicitly assume PERCPU_REF_ALLOW_REINIT. This patch doesn't introduce any functional change, to avoid any regressions. That will be done later in the patchset, after adjusting all call sites that revive percpu counters. Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Signed-off-by: Dennis Zhou --- include/linux/percpu-refcount.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b297cd1cd4f1..0f0240af8520 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -75,14 +75,21 @@ enum { * operation using percpu_ref_switch_to_percpu(). If initialized * with this flag, the ref will stay in atomic mode until * percpu_ref_switch_to_percpu() is invoked on it. + * Implies ALLOW_REINIT. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, /* * Start dead w/ ref == 0 in atomic mode. Must be revived with - * percpu_ref_reinit() before used. Implies INIT_ATOMIC. + * percpu_ref_reinit() before used. Implies INIT_ATOMIC and + * ALLOW_REINIT. */ PERCPU_REF_INIT_DEAD = 1 << 1, + + /* + * Allow switching from atomic mode to percpu mode. + */ + PERCPU_REF_ALLOW_REINIT = 1 << 2, }; struct percpu_ref { -- cgit v1.2.3-59-g8ed1b From 7d9ab9b6adffd9c474c1274acb5f6208f9a09cf3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 7 May 2019 10:01:50 -0700 Subject: percpu_ref: release percpu memory early without PERCPU_REF_ALLOW_REINIT Release the percpu memory after finishing the switch to the atomic mode, but only if PERCPU_REF_ALLOW_REINIT isn't set.
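A minimal usage sketch of what the flag is for (not taken from the tree; obj and example_release are hypothetical):

	#include <linux/percpu-refcount.h>

	static void example_release(struct percpu_ref *ref)
	{
		/* hypothetical: free the structure embedding @ref */
	}

	/*
	 * Opt in at init time if the ref may ever be revived; without
	 * ALLOW_REINIT the percpu memory is freed once the ref goes atomic.
	 */
	ret = percpu_ref_init(&obj->ref, example_release,
			      PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
	if (ret)
		return ret;

	percpu_ref_kill(&obj->ref);	/* switches the ref to atomic mode */
	percpu_ref_reinit(&obj->ref);	/* legal only with ALLOW_REINIT set */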
Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Signed-off-by: Dennis Zhou --- include/linux/percpu-refcount.h | 1 + lib/percpu-refcount.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 0f0240af8520..7aef0abc194a 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -102,6 +102,7 @@ struct percpu_ref { percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; bool force_atomic:1; + bool allow_reinit:1; struct rcu_head rcu; }; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9877682e49c7..501b517bd3db 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -69,11 +69,14 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, return -ENOMEM; ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; + ref->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT; - if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) { ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; - else + ref->allow_reinit = true; + } else { start_count += PERCPU_COUNT_BIAS; + } if (flags & PERCPU_REF_INIT_DEAD) ref->percpu_count_ptr |= __PERCPU_REF_DEAD; @@ -119,6 +122,9 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) ref->confirm_switch = NULL; wake_up_all(&percpu_ref_switch_waitq); + if (!ref->allow_reinit) + percpu_ref_exit(ref); + /* drop ref from percpu_ref_switch_to_atomic() */ percpu_ref_put(ref); } @@ -194,6 +200,9 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; + if (WARN_ON_ONCE(!ref->allow_reinit)) + return; + atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* -- cgit v1.2.3-59-g8ed1b From ec9964b4803300fb86f8e8fd9b421e59f7a71dc5 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:34 +0200 Subject: Platform: OLPC: Move EC-specific functionality out from x86 Move the olpc-ec driver away from the X86 OLPC platform so that it can be used by the ARM based laptops too. Notably, the driver for the OLPC battery, which is also used on the ARM models, builds on this driver's interface. It is actually platform independent: the OLPC EC commands with their arguments and responses are mostly the same, even though the delivery mechanism is different.
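A sketch of a caller of that interface, assuming the usual big-endian EC response for EC_BAT_VOLTAGE (0x10, per olpc_battery.c); the wrapper itself is hypothetical:

	#include <linux/olpc-ec.h>

	static int example_read_battery_voltage(u16 *mv)
	{
		__be16 word;
		int ret;

		/* Same call on x86 and ARM; only the registered EC driver
		 * that delivers the command differs. */
		ret = olpc_ec_cmd(0x10, NULL, 0, (u8 *)&word, 2);
		if (ret == 0)
			*mv = be16_to_cpu(word);
		return ret;
	}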
Signed-off-by: Lubomir Rintel Acked-by: Pavel Machek Signed-off-by: Andy Shevchenko --- arch/x86/include/asm/olpc.h | 31 ---------- arch/x86/platform/olpc/olpc.c | 119 ++++++------------------------------ drivers/platform/olpc/olpc-ec.c | 99 +++++++++++++++++++++++++++++- drivers/power/supply/olpc_battery.c | 1 - include/linux/olpc-ec.h | 32 +++++++++- 5 files changed, 145 insertions(+), 137 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index c2bf1de5d901..6fe76282aceb 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -9,12 +9,10 @@ struct olpc_platform_t { int flags; uint32_t boardrev; - int ecver; }; #define OLPC_F_PRESENT 0x01 #define OLPC_F_DCON 0x02 -#define OLPC_F_EC_WIDE_SCI 0x04 #ifdef CONFIG_OLPC @@ -64,13 +62,6 @@ static inline int olpc_board_at_least(uint32_t rev) return olpc_platform_info.boardrev >= rev; } -extern void olpc_ec_wakeup_set(u16 value); -extern void olpc_ec_wakeup_clear(u16 value); -extern bool olpc_ec_wakeup_available(void); - -extern int olpc_ec_mask_write(u16 bits); -extern int olpc_ec_sci_query(u16 *sci_value); - #else static inline int machine_is_olpc(void) @@ -83,14 +74,6 @@ static inline int olpc_has_dcon(void) return 0; } -static inline void olpc_ec_wakeup_set(u16 value) { } -static inline void olpc_ec_wakeup_clear(u16 value) { } - -static inline bool olpc_ec_wakeup_available(void) -{ - return false; -} - #endif #ifdef CONFIG_OLPC_XO1_PM @@ -101,20 +84,6 @@ extern void olpc_xo1_pm_wakeup_clear(u16 value); extern int pci_olpc_init(void); -/* SCI source values */ - -#define EC_SCI_SRC_EMPTY 0x00 -#define EC_SCI_SRC_GAME 0x01 -#define EC_SCI_SRC_BATTERY 0x02 -#define EC_SCI_SRC_BATSOC 0x04 -#define EC_SCI_SRC_BATERR 0x08 -#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */ -#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */ -#define EC_SCI_SRC_ACPWR 0x40 -#define EC_SCI_SRC_BATCRIT 0x80 -#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */ -#define EC_SCI_SRC_ALL 0x1FF - /* GPIO assignments */ #define OLPC_GPIO_MIC_AC 1 diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index f0e920fb98ad..c6c62b4f251f 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -30,9 +30,6 @@ struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); -/* EC event mask to be applied during suspend (defining wakeup sources). */ -static u16 ec_wakeup_mask; - /* what the timeout *should* be (in ms) */ #define EC_BASE_TIMEOUT 20 @@ -186,83 +183,6 @@ err: return ret; } -void olpc_ec_wakeup_set(u16 value) -{ - ec_wakeup_mask |= value; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set); - -void olpc_ec_wakeup_clear(u16 value) -{ - ec_wakeup_mask &= ~value; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear); - -/* - * Returns true if the compile and runtime configurations allow for EC events - * to wake the system. 
- */ -bool olpc_ec_wakeup_available(void) -{ - if (!machine_is_olpc()) - return false; - - /* - * XO-1 EC wakeups are available when olpc-xo1-sci driver is - * compiled in - */ -#ifdef CONFIG_OLPC_XO1_SCI - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */ - return true; -#endif - - /* - * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is - * compiled in - */ -#ifdef CONFIG_OLPC_XO15_SCI - if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */ - return true; -#endif - - return false; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available); - -int olpc_ec_mask_write(u16 bits) -{ - if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) { - __be16 ec_word = cpu_to_be16(bits); - return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2, - NULL, 0); - } else { - unsigned char ec_byte = bits & 0xff; - return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0); - } -} -EXPORT_SYMBOL_GPL(olpc_ec_mask_write); - -int olpc_ec_sci_query(u16 *sci_value) -{ - int ret; - - if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) { - __be16 ec_word; - ret = olpc_ec_cmd(EC_EXT_SCI_QUERY, - NULL, 0, (void *) &ec_word, 2); - if (ret == 0) - *sci_value = be16_to_cpu(ec_word); - } else { - unsigned char ec_byte; - ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1); - if (ret == 0) - *sci_value = ec_byte; - } - - return ret; -} -EXPORT_SYMBOL_GPL(olpc_ec_sci_query); - static bool __init check_ofw_architecture(struct device_node *root) { const char *olpc_arch; @@ -296,6 +216,10 @@ static bool __init platform_detect(void) if (success) { olpc_platform_info.boardrev = get_board_revision(root); olpc_platform_info.flags |= OLPC_F_PRESENT; + + pr_info("OLPC board revision %s%X\n", + ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", + olpc_platform_info.boardrev >> 4); } of_node_put(root); @@ -315,27 +239,8 @@ static int __init add_xo1_platform_devices(void) return PTR_ERR_OR_ZERO(pdev); } -static int olpc_xo1_ec_probe(struct platform_device *pdev) -{ - /* get the EC revision */ - olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, - (unsigned char *) &olpc_platform_info.ecver, 1); - - /* EC version 0x5f adds support for wide SCI mask */ - if (olpc_platform_info.ecver >= 0x5f) - olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI; - - pr_info("OLPC board revision %s%X (EC=%x)\n", - ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", - olpc_platform_info.boardrev >> 4, - olpc_platform_info.ecver); - - return 0; -} static int olpc_xo1_ec_suspend(struct platform_device *pdev) { - olpc_ec_mask_write(ec_wakeup_mask); - /* * Squelch SCIs while suspended. This is a fix for * . 
@@ -359,15 +264,27 @@ static int olpc_xo1_ec_resume(struct platform_device *pdev) } static struct olpc_ec_driver ec_xo1_driver = { - .probe = olpc_xo1_ec_probe, .suspend = olpc_xo1_ec_suspend, .resume = olpc_xo1_ec_resume, .ec_cmd = olpc_xo1_ec_cmd, +#ifdef CONFIG_OLPC_XO1_SCI + /* + * XO-1 EC wakeups are available when olpc-xo1-sci driver is + * compiled in + */ + .wakeup_available = true, +#endif }; static struct olpc_ec_driver ec_xo1_5_driver = { - .probe = olpc_xo1_ec_probe, .ec_cmd = olpc_xo1_ec_cmd, +#ifdef CONFIG_OLPC_XO1_5_SCI + /* + * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is + * compiled in + */ + .wakeup_available = true, +#endif }; static int __init olpc_init(void) diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c index 981955dce926..2a647455a368 100644 --- a/drivers/platform/olpc/olpc-ec.c +++ b/drivers/platform/olpc/olpc-ec.c @@ -32,6 +32,7 @@ struct ec_cmd_desc { struct olpc_ec_priv { struct olpc_ec_driver *drv; + u8 version; struct work_struct worker; struct mutex cmd_lock; @@ -41,6 +42,12 @@ struct olpc_ec_priv { struct dentry *dbgfs_dir; + /* + * EC event mask to be applied during suspend (defining wakeup + * sources). + */ + u16 ec_wakeup_mask; + /* * Running an EC command while suspending means we don't always finish * the command before the machine suspends. This means that the EC @@ -149,6 +156,88 @@ int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen) } EXPORT_SYMBOL_GPL(olpc_ec_cmd); +void olpc_ec_wakeup_set(u16 value) +{ + struct olpc_ec_priv *ec = ec_priv; + + if (WARN_ON(!ec)) + return; + + ec->ec_wakeup_mask |= value; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set); + +void olpc_ec_wakeup_clear(u16 value) +{ + struct olpc_ec_priv *ec = ec_priv; + + if (WARN_ON(!ec)) + return; + + ec->ec_wakeup_mask &= ~value; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear); + +int olpc_ec_mask_write(u16 bits) +{ + struct olpc_ec_priv *ec = ec_priv; + + if (WARN_ON(!ec)) + return -ENODEV; + + /* EC version 0x5f adds support for wide SCI mask */ + if (ec->version >= 0x5f) { + __be16 ec_word = cpu_to_be16(bits); + + return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *)&ec_word, 2, NULL, 0); + } else { + u8 ec_byte = bits & 0xff; + + return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0); + } +} +EXPORT_SYMBOL_GPL(olpc_ec_mask_write); + +/* + * Returns true if the compile and runtime configurations allow for EC events + * to wake the system. + */ +bool olpc_ec_wakeup_available(void) +{ + if (WARN_ON(!ec_driver)) + return false; + + return ec_driver->wakeup_available; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available); + +int olpc_ec_sci_query(u16 *sci_value) +{ + struct olpc_ec_priv *ec = ec_priv; + int ret; + + if (WARN_ON(!ec)) + return -ENODEV; + + /* EC version 0x5f adds support for wide SCI mask */ + if (ec->version >= 0x5f) { + __be16 ec_word; + + ret = olpc_ec_cmd(EC_EXT_SCI_QUERY, NULL, 0, (void *)&ec_word, 2); + if (ret == 0) + *sci_value = be16_to_cpu(ec_word); + } else { + u8 ec_byte; + + ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1); + if (ret == 0) + *sci_value = ec_byte; + } + + return ret; +} +EXPORT_SYMBOL_GPL(olpc_ec_sci_query); + #ifdef CONFIG_DEBUG_FS /* @@ -276,14 +365,16 @@ static int olpc_ec_probe(struct platform_device *pdev) ec_priv = ec; platform_set_drvdata(pdev, ec); - err = ec_driver->probe ? 
ec_driver->probe(pdev) : 0; + /* get the EC revision */ + err = olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, &ec->version, 1); if (err) { ec_priv = NULL; kfree(ec); - } else { - ec->dbgfs_dir = olpc_ec_setup_debugfs(); + return err; } + ec->dbgfs_dir = olpc_ec_setup_debugfs(); + return err; } @@ -293,6 +384,8 @@ static int olpc_ec_suspend(struct device *dev) struct olpc_ec_priv *ec = platform_get_drvdata(pdev); int err = 0; + olpc_ec_mask_write(ec->ec_wakeup_mask); + if (ec_driver->suspend) err = ec_driver->suspend(pdev); if (!err) diff --git a/drivers/power/supply/olpc_battery.c b/drivers/power/supply/olpc_battery.c index 7720e4c2ac0b..066ec9a11153 100644 --- a/drivers/power/supply/olpc_battery.c +++ b/drivers/power/supply/olpc_battery.c @@ -20,7 +20,6 @@ #include #include #include -#include #define EC_BAT_VOLTAGE 0x10 /* uint16_t, *9.76/32, mV */ diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h index 79bdc6328c52..7fa3d27f7fee 100644 --- a/include/linux/olpc-ec.h +++ b/include/linux/olpc-ec.h @@ -16,14 +16,28 @@ #define EC_SCI_QUERY 0x84 #define EC_EXT_SCI_QUERY 0x85 +/* SCI source values */ +#define EC_SCI_SRC_EMPTY 0x00 +#define EC_SCI_SRC_GAME 0x01 +#define EC_SCI_SRC_BATTERY 0x02 +#define EC_SCI_SRC_BATSOC 0x04 +#define EC_SCI_SRC_BATERR 0x08 +#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */ +#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */ +#define EC_SCI_SRC_ACPWR 0x40 +#define EC_SCI_SRC_BATCRIT 0x80 +#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */ +#define EC_SCI_SRC_ALL 0x1FF + struct platform_device; struct olpc_ec_driver { - int (*probe)(struct platform_device *); int (*suspend)(struct platform_device *); int (*resume)(struct platform_device *); int (*ec_cmd)(u8, u8 *, size_t, u8 *, size_t, void *); + + bool wakeup_available; }; #ifdef CONFIG_OLPC @@ -33,11 +47,27 @@ extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg); extern int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen); +extern void olpc_ec_wakeup_set(u16 value); +extern void olpc_ec_wakeup_clear(u16 value); + +extern int olpc_ec_mask_write(u16 bits); +extern int olpc_ec_sci_query(u16 *sci_value); + +extern bool olpc_ec_wakeup_available(void); + #else static inline int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen) { return -ENODEV; } +static inline void olpc_ec_wakeup_set(u16 value) { } +static inline void olpc_ec_wakeup_clear(u16 value) { } + +static inline bool olpc_ec_wakeup_available(void) +{ + return false; +} + #endif /* CONFIG_OLPC */ #endif /* _LINUX_OLPC_EC_H */ -- cgit v1.2.3-59-g8ed1b From 8097548f3af9ec990169574ad9d874052b78bff8 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:36 +0200 Subject: Platform: OLPC: Use BIT() and GENMASK() for event masks Just a cosmetic tidy-up. 
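The numerical values are unchanged (the constants merely become unsigned long), since BIT(n) expands to (1UL << (n)) and GENMASK(h, l) sets bits h through l inclusive; for example:

	EC_SCI_SRC_EBOOK == BIT(4)        == 0x10
	EC_SCI_SRC_ALL   == GENMASK(8, 0) == 0x1FF

The only casualty is the zero-valued EC_SCI_SRC_EMPTY, which has no bit of its own and is dropped.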
Signed-off-by: Lubomir Rintel Reviewed-by: Andy Shevchenko Signed-off-by: Andy Shevchenko --- include/linux/olpc-ec.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h index 7fa3d27f7fee..f7b6a7eda232 100644 --- a/include/linux/olpc-ec.h +++ b/include/linux/olpc-ec.h @@ -2,6 +2,8 @@ #ifndef _LINUX_OLPC_EC_H #define _LINUX_OLPC_EC_H +#include + /* XO-1 EC commands */ #define EC_FIRMWARE_REV 0x08 #define EC_WRITE_SCI_MASK 0x1b @@ -17,17 +19,16 @@ #define EC_EXT_SCI_QUERY 0x85 /* SCI source values */ -#define EC_SCI_SRC_EMPTY 0x00 -#define EC_SCI_SRC_GAME 0x01 -#define EC_SCI_SRC_BATTERY 0x02 -#define EC_SCI_SRC_BATSOC 0x04 -#define EC_SCI_SRC_BATERR 0x08 -#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */ -#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */ -#define EC_SCI_SRC_ACPWR 0x40 -#define EC_SCI_SRC_BATCRIT 0x80 -#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */ -#define EC_SCI_SRC_ALL 0x1FF +#define EC_SCI_SRC_GAME BIT(0) +#define EC_SCI_SRC_BATTERY BIT(1) +#define EC_SCI_SRC_BATSOC BIT(2) +#define EC_SCI_SRC_BATERR BIT(3) +#define EC_SCI_SRC_EBOOK BIT(4) /* XO-1 only */ +#define EC_SCI_SRC_WLAN BIT(5) /* XO-1 only */ +#define EC_SCI_SRC_ACPWR BIT(6) +#define EC_SCI_SRC_BATCRIT BIT(7) +#define EC_SCI_SRC_GPWAKE BIT(8) /* XO-1.5 only */ +#define EC_SCI_SRC_ALL GENMASK(8, 0) struct platform_device; -- cgit v1.2.3-59-g8ed1b From 0c3d931b3ab9efeea4948b5373c62095449d0101 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:37 +0200 Subject: Platform: OLPC: Add XO-1.75 EC driver It's based off the driver from the OLPC kernel sources. Somewhat modernized and cleaned up, for better or worse. Modified to plug into the olpc-ec driver infrastructure (so that battery interface and debugfs could be reused) and the SPI slave framework. Signed-off-by: Lubomir Rintel Reviewed-by: Andy Shevchenko Signed-off-by: Andy Shevchenko --- arch/x86/Kconfig | 1 + drivers/platform/Kconfig | 2 + drivers/platform/Makefile | 2 +- drivers/platform/olpc/Kconfig | 14 + drivers/platform/olpc/Makefile | 3 +- drivers/platform/olpc/olpc-xo175-ec.c | 752 ++++++++++++++++++++++++++++++++++ include/linux/olpc-ec.h | 4 +- 7 files changed, 774 insertions(+), 4 deletions(-) create mode 100644 drivers/platform/olpc/Kconfig create mode 100644 drivers/platform/olpc/olpc-xo175-ec.c (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..cb1c073b3c7e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2698,6 +2698,7 @@ config OLPC select OF select OF_PROMTREE select IRQ_DOMAIN + select OLPC_EC ---help--- Add support for detecting the unique features of the OLPC XO hardware. 
diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig index d4c2e424a700..4313d73d3618 100644 --- a/drivers/platform/Kconfig +++ b/drivers/platform/Kconfig @@ -10,3 +10,5 @@ source "drivers/platform/goldfish/Kconfig" source "drivers/platform/chrome/Kconfig" source "drivers/platform/mellanox/Kconfig" + +source "drivers/platform/olpc/Kconfig" diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile index 4b2ce58bcd9c..6fda58c021ca 100644 --- a/drivers/platform/Makefile +++ b/drivers/platform/Makefile @@ -6,6 +6,6 @@ obj-$(CONFIG_X86) += x86/ obj-$(CONFIG_MELLANOX_PLATFORM) += mellanox/ obj-$(CONFIG_MIPS) += mips/ -obj-$(CONFIG_OLPC) += olpc/ +obj-$(CONFIG_OLPC_EC) += olpc/ obj-$(CONFIG_GOLDFISH) += goldfish/ obj-$(CONFIG_CHROME_PLATFORMS) += chrome/ diff --git a/drivers/platform/olpc/Kconfig b/drivers/platform/olpc/Kconfig new file mode 100644 index 000000000000..559f843199d7 --- /dev/null +++ b/drivers/platform/olpc/Kconfig @@ -0,0 +1,14 @@ +config OLPC_EC + bool + +config OLPC_XO175_EC + tristate "OLPC XO 1.75 Embedded Controller" + depends on ARCH_MMP || COMPILE_TEST + select SPI_SLAVE + select OLPC_EC + help + Include support for the OLPC XO Embedded Controller (EC). The EC + provides various platform services, including support for the power, + button, restart, shutdown and battery charging status. + + Unless you have an OLPC XO laptop, you will want to say N. diff --git a/drivers/platform/olpc/Makefile b/drivers/platform/olpc/Makefile index dc8b26bc7209..01fe6ba01665 100644 --- a/drivers/platform/olpc/Makefile +++ b/drivers/platform/olpc/Makefile @@ -1,4 +1,5 @@ # # OLPC XO platform-specific drivers # -obj-$(CONFIG_OLPC) += olpc-ec.o +obj-$(CONFIG_OLPC_EC) += olpc-ec.o +obj-$(CONFIG_OLPC_XO175_EC) += olpc-xo175-ec.o diff --git a/drivers/platform/olpc/olpc-xo175-ec.c b/drivers/platform/olpc/olpc-xo175-ec.c new file mode 100644 index 000000000000..344d14f3da54 --- /dev/null +++ b/drivers/platform/olpc/olpc-xo175-ec.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for the OLPC XO-1.75 Embedded Controller. + * + * The EC protocol is documented at: + * http://wiki.laptop.org/go/XO_1.75_HOST_to_EC_Protocol + * + * Copyright (C) 2010 One Laptop per Child Foundation. 
+ * Copyright (C) 2018 Lubomir Rintel + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ec_cmd_t { + u8 cmd; + u8 bytes_returned; +}; + +enum ec_chan_t { + CHAN_NONE = 0, + CHAN_SWITCH, + CHAN_CMD_RESP, + CHAN_KEYBOARD, + CHAN_TOUCHPAD, + CHAN_EVENT, + CHAN_DEBUG, + CHAN_CMD_ERROR, +}; + +/* + * EC events + */ +#define EVENT_AC_CHANGE 1 /* AC plugged/unplugged */ +#define EVENT_BATTERY_STATUS 2 /* Battery low/full/error/gone */ +#define EVENT_BATTERY_CRITICAL 3 /* Battery critical voltage */ +#define EVENT_BATTERY_SOC_CHANGE 4 /* 1% SOC Change */ +#define EVENT_BATTERY_ERROR 5 /* Abnormal error, query for cause */ +#define EVENT_POWER_PRESSED 6 /* Power button was pressed */ +#define EVENT_POWER_PRESS_WAKE 7 /* Woken up with a power button */ +#define EVENT_TIMED_HOST_WAKE 8 /* Host wake timer */ +#define EVENT_OLS_HIGH_LIMIT 9 /* OLS crossed dark threshold */ +#define EVENT_OLS_LOW_LIMIT 10 /* OLS crossed light threshold */ + +/* + * EC commands + * (from http://dev.laptop.org/git/users/rsmith/ec-1.75/tree/ec_cmd.h) + */ +#define CMD_GET_API_VERSION 0x08 /* out: u8 */ +#define CMD_READ_VOLTAGE 0x10 /* out: u16, *9.76/32, mV */ +#define CMD_READ_CURRENT 0x11 /* out: s16, *15.625/120, mA */ +#define CMD_READ_ACR 0x12 /* out: s16, *6250/15, uAh */ +#define CMD_READ_BATT_TEMPERATURE 0x13 /* out: u16, *100/256, deg C */ +#define CMD_READ_AMBIENT_TEMPERATURE 0x14 /* unimplemented, no hardware */ +#define CMD_READ_BATTERY_STATUS 0x15 /* out: u8, bitmask */ +#define CMD_READ_SOC 0x16 /* out: u8, percentage */ +#define CMD_READ_GAUGE_ID 0x17 /* out: u8 * 8 */ +#define CMD_READ_GAUGE_DATA 0x18 /* in: u8 addr, out: u8 data */ +#define CMD_READ_BOARD_ID 0x19 /* out: u16 (platform id) */ +#define CMD_READ_BATT_ERR_CODE 0x1f /* out: u8, error bitmask */ +#define CMD_SET_DCON_POWER 0x26 /* in: u8 */ +#define CMD_RESET_EC 0x28 /* none */ +#define CMD_READ_BATTERY_TYPE 0x2c /* out: u8 */ +#define CMD_SET_AUTOWAK 0x33 /* out: u8 */ +#define CMD_SET_EC_WAKEUP_TIMER 0x36 /* in: u32, out: ? */ +#define CMD_READ_EXT_SCI_MASK 0x37 /* ? */ +#define CMD_WRITE_EXT_SCI_MASK 0x38 /* ? */ +#define CMD_CLEAR_EC_WAKEUP_TIMER 0x39 /* none */ +#define CMD_ENABLE_RUNIN_DISCHARGE 0x3B /* none */ +#define CMD_DISABLE_RUNIN_DISCHARGE 0x3C /* none */ +#define CMD_READ_MPPT_ACTIVE 0x3d /* out: u8 */ +#define CMD_READ_MPPT_LIMIT 0x3e /* out: u8 */ +#define CMD_SET_MPPT_LIMIT 0x3f /* in: u8 */ +#define CMD_DISABLE_MPPT 0x40 /* none */ +#define CMD_ENABLE_MPPT 0x41 /* none */ +#define CMD_READ_VIN 0x42 /* out: u16 */ +#define CMD_EXT_SCI_QUERY 0x43 /* ? */ +#define RSP_KEYBOARD_DATA 0x48 /* ? */ +#define RSP_TOUCHPAD_DATA 0x49 /* ? */ +#define CMD_GET_FW_VERSION 0x4a /* out: u8 * 16 */ +#define CMD_POWER_CYCLE 0x4b /* none */ +#define CMD_POWER_OFF 0x4c /* none */ +#define CMD_RESET_EC_SOFT 0x4d /* none */ +#define CMD_READ_GAUGE_U16 0x4e /* ? */ +#define CMD_ENABLE_MOUSE 0x4f /* ? 
*/ +#define CMD_ECHO 0x52 /* in: u8 * 5, out: u8 * 5 */ +#define CMD_GET_FW_DATE 0x53 /* out: u8 * 16 */ +#define CMD_GET_FW_USER 0x54 /* out: u8 * 16 */ +#define CMD_TURN_OFF_POWER 0x55 /* none (same as 0x4c) */ +#define CMD_READ_OLS 0x56 /* out: u16 */ +#define CMD_OLS_SMT_LEDON 0x57 /* none */ +#define CMD_OLS_SMT_LEDOFF 0x58 /* none */ +#define CMD_START_OLS_ASSY 0x59 /* none */ +#define CMD_STOP_OLS_ASSY 0x5a /* none */ +#define CMD_OLS_SMTTEST_STOP 0x5b /* none */ +#define CMD_READ_VIN_SCALED 0x5c /* out: u16 */ +#define CMD_READ_BAT_MIN_W 0x5d /* out: u16 */ +#define CMD_READ_BAR_MAX_W 0x5e /* out: u16 */ +#define CMD_RESET_BAT_MINMAX_W 0x5f /* none */ +#define CMD_READ_LOCATION 0x60 /* in: u16 addr, out: u8 data */ +#define CMD_WRITE_LOCATION 0x61 /* in: u16 addr, u8 data */ +#define CMD_KEYBOARD_CMD 0x62 /* in: u8, out: ? */ +#define CMD_TOUCHPAD_CMD 0x63 /* in: u8, out: ? */ +#define CMD_GET_FW_HASH 0x64 /* out: u8 * 16 */ +#define CMD_SUSPEND_HINT 0x65 /* in: u8 */ +#define CMD_ENABLE_WAKE_TIMER 0x66 /* in: u8 */ +#define CMD_SET_WAKE_TIMER 0x67 /* in: 32 */ +#define CMD_ENABLE_WAKE_AUTORESET 0x68 /* in: u8 */ +#define CMD_OLS_SET_LIMITS 0x69 /* in: u16, u16 */ +#define CMD_OLS_GET_LIMITS 0x6a /* out: u16, u16 */ +#define CMD_OLS_SET_CEILING 0x6b /* in: u16 */ +#define CMD_OLS_GET_CEILING 0x6c /* out: u16 */ + +/* + * Accepted EC commands, and how many bytes they return. There are plenty + * of EC commands that are no longer implemented, or are implemented only on + * certain older boards. + */ +static const struct ec_cmd_t olpc_xo175_ec_cmds[] = { + { CMD_GET_API_VERSION, 1 }, + { CMD_READ_VOLTAGE, 2 }, + { CMD_READ_CURRENT, 2 }, + { CMD_READ_ACR, 2 }, + { CMD_READ_BATT_TEMPERATURE, 2 }, + { CMD_READ_BATTERY_STATUS, 1 }, + { CMD_READ_SOC, 1 }, + { CMD_READ_GAUGE_ID, 8 }, + { CMD_READ_GAUGE_DATA, 1 }, + { CMD_READ_BOARD_ID, 2 }, + { CMD_READ_BATT_ERR_CODE, 1 }, + { CMD_SET_DCON_POWER, 0 }, + { CMD_RESET_EC, 0 }, + { CMD_READ_BATTERY_TYPE, 1 }, + { CMD_ENABLE_RUNIN_DISCHARGE, 0 }, + { CMD_DISABLE_RUNIN_DISCHARGE, 0 }, + { CMD_READ_MPPT_ACTIVE, 1 }, + { CMD_READ_MPPT_LIMIT, 1 }, + { CMD_SET_MPPT_LIMIT, 0 }, + { CMD_DISABLE_MPPT, 0 }, + { CMD_ENABLE_MPPT, 0 }, + { CMD_READ_VIN, 2 }, + { CMD_GET_FW_VERSION, 16 }, + { CMD_POWER_CYCLE, 0 }, + { CMD_POWER_OFF, 0 }, + { CMD_RESET_EC_SOFT, 0 }, + { CMD_ECHO, 5 }, + { CMD_GET_FW_DATE, 16 }, + { CMD_GET_FW_USER, 16 }, + { CMD_TURN_OFF_POWER, 0 }, + { CMD_READ_OLS, 2 }, + { CMD_OLS_SMT_LEDON, 0 }, + { CMD_OLS_SMT_LEDOFF, 0 }, + { CMD_START_OLS_ASSY, 0 }, + { CMD_STOP_OLS_ASSY, 0 }, + { CMD_OLS_SMTTEST_STOP, 0 }, + { CMD_READ_VIN_SCALED, 2 }, + { CMD_READ_BAT_MIN_W, 2 }, + { CMD_READ_BAR_MAX_W, 2 }, + { CMD_RESET_BAT_MINMAX_W, 0 }, + { CMD_READ_LOCATION, 1 }, + { CMD_WRITE_LOCATION, 0 }, + { CMD_GET_FW_HASH, 16 }, + { CMD_SUSPEND_HINT, 0 }, + { CMD_ENABLE_WAKE_TIMER, 0 }, + { CMD_SET_WAKE_TIMER, 0 }, + { CMD_ENABLE_WAKE_AUTORESET, 0 }, + { CMD_OLS_SET_LIMITS, 0 }, + { CMD_OLS_GET_LIMITS, 4 }, + { CMD_OLS_SET_CEILING, 0 }, + { CMD_OLS_GET_CEILING, 2 }, + { CMD_READ_EXT_SCI_MASK, 2 }, + { CMD_WRITE_EXT_SCI_MASK, 0 }, + + { } +}; + +#define EC_MAX_CMD_DATA_LEN 5 +#define EC_MAX_RESP_LEN 16 + +#define LOG_BUF_SIZE 128 + +#define PM_WAKEUP_TIME 1000 + +#define EC_ALL_EVENTS GENMASK(15, 0) + +enum ec_state_t { + CMD_STATE_IDLE = 0, + CMD_STATE_WAITING_FOR_SWITCH, + CMD_STATE_CMD_IN_TX_FIFO, + CMD_STATE_CMD_SENT, + CMD_STATE_RESP_RECEIVED, + CMD_STATE_ERROR_RECEIVED, +}; + +struct olpc_xo175_ec_cmd { + u8 command; + u8 nr_args; + u8 data_len; + u8 
args[EC_MAX_CMD_DATA_LEN]; +}; + +struct olpc_xo175_ec_resp { + u8 channel; + u8 byte; +}; + +struct olpc_xo175_ec { + bool suspended; + + /* SPI related stuff. */ + struct spi_device *spi; + struct spi_transfer xfer; + struct spi_message msg; + union { + struct olpc_xo175_ec_cmd cmd; + struct olpc_xo175_ec_resp resp; + } tx_buf, rx_buf; + + /* GPIO for the CMD signals. */ + struct gpio_desc *gpio_cmd; + + /* Command handling related state. */ + spinlock_t cmd_state_lock; + int cmd_state; + bool cmd_running; + struct completion cmd_done; + struct olpc_xo175_ec_cmd cmd; + u8 resp_data[EC_MAX_RESP_LEN]; + int expected_resp_len; + int resp_len; + + /* Power button. */ + struct input_dev *pwrbtn; + + /* Debug handling. */ + char logbuf[LOG_BUF_SIZE]; + int logbuf_len; +}; + +static struct platform_device *olpc_ec; + +static int olpc_xo175_ec_resp_len(u8 cmd) +{ + const struct ec_cmd_t *p; + + for (p = olpc_xo175_ec_cmds; p->cmd; p++) { + if (p->cmd == cmd) + return p->bytes_returned; + } + + return -EINVAL; +} + +static void olpc_xo175_ec_flush_logbuf(struct olpc_xo175_ec *priv) +{ + dev_dbg(&priv->spi->dev, "got debug string [%*pE]\n", + priv->logbuf_len, priv->logbuf); + priv->logbuf_len = 0; +} + +static void olpc_xo175_ec_complete(void *arg); + +static void olpc_xo175_ec_send_command(struct olpc_xo175_ec *priv, void *cmd, + size_t cmdlen) +{ + int ret; + + memcpy(&priv->tx_buf, cmd, cmdlen); + priv->xfer.len = cmdlen; + + spi_message_init_with_transfers(&priv->msg, &priv->xfer, 1); + + priv->msg.complete = olpc_xo175_ec_complete; + priv->msg.context = priv; + + ret = spi_async(priv->spi, &priv->msg); + if (ret) + dev_err(&priv->spi->dev, "spi_async() failed %d\n", ret); +} + +static void olpc_xo175_ec_read_packet(struct olpc_xo175_ec *priv) +{ + u8 nonce[] = {0xA5, 0x5A}; + + olpc_xo175_ec_send_command(priv, nonce, sizeof(nonce)); +} + +static void olpc_xo175_ec_complete(void *arg) +{ + struct olpc_xo175_ec *priv = arg; + struct device *dev = &priv->spi->dev; + struct power_supply *psy; + unsigned long flags; + u8 channel; + u8 byte; + int ret; + + ret = priv->msg.status; + if (ret) { + dev_err(dev, "SPI transfer failed: %d\n", ret); + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + if (priv->cmd_running) { + priv->resp_len = 0; + priv->cmd_state = CMD_STATE_ERROR_RECEIVED; + complete(&priv->cmd_done); + } + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + if (ret != -EINTR) + olpc_xo175_ec_read_packet(priv); + + return; + } + + channel = priv->rx_buf.resp.channel; + byte = priv->rx_buf.resp.byte; + + switch (channel) { + case CHAN_NONE: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + /* We can safely ignore these */ + dev_err(dev, "spurious FIFO read packet\n"); + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + return; + } + + priv->cmd_state = CMD_STATE_CMD_SENT; + if (!priv->expected_resp_len) + complete(&priv->cmd_done); + olpc_xo175_ec_read_packet(priv); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + return; + + case CHAN_SWITCH: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + /* Just go with the flow */ + dev_err(dev, "spurious SWITCH packet\n"); + memset(&priv->cmd, 0, sizeof(priv->cmd)); + priv->cmd.command = CMD_ECHO; + } + + priv->cmd_state = CMD_STATE_CMD_IN_TX_FIFO; + + /* Throw command into TxFIFO */ + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + olpc_xo175_ec_send_command(priv, &priv->cmd, sizeof(priv->cmd)); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + 
return; + + case CHAN_CMD_RESP: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + dev_err(dev, "spurious response packet\n"); + } else if (priv->resp_len >= priv->expected_resp_len) { + dev_err(dev, "too many response packets\n"); + } else { + priv->resp_data[priv->resp_len++] = byte; + if (priv->resp_len == priv->expected_resp_len) { + priv->cmd_state = CMD_STATE_RESP_RECEIVED; + complete(&priv->cmd_done); + } + } + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + break; + + case CHAN_CMD_ERROR: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + dev_err(dev, "spurious cmd error packet\n"); + } else { + priv->resp_data[0] = byte; + priv->resp_len = 1; + priv->cmd_state = CMD_STATE_ERROR_RECEIVED; + complete(&priv->cmd_done); + } + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + break; + + case CHAN_KEYBOARD: + dev_warn(dev, "keyboard is not supported\n"); + break; + + case CHAN_TOUCHPAD: + dev_warn(dev, "touchpad is not supported\n"); + break; + + case CHAN_EVENT: + dev_dbg(dev, "got event %.2x\n", byte); + switch (byte) { + case EVENT_AC_CHANGE: + psy = power_supply_get_by_name("olpc-ac"); + if (psy) { + power_supply_changed(psy); + power_supply_put(psy); + } + break; + case EVENT_BATTERY_STATUS: + case EVENT_BATTERY_CRITICAL: + case EVENT_BATTERY_SOC_CHANGE: + case EVENT_BATTERY_ERROR: + psy = power_supply_get_by_name("olpc-battery"); + if (psy) { + power_supply_changed(psy); + power_supply_put(psy); + } + break; + case EVENT_POWER_PRESSED: + input_report_key(priv->pwrbtn, KEY_POWER, 1); + input_sync(priv->pwrbtn); + input_report_key(priv->pwrbtn, KEY_POWER, 0); + input_sync(priv->pwrbtn); + /* fall through */ + case EVENT_POWER_PRESS_WAKE: + case EVENT_TIMED_HOST_WAKE: + pm_wakeup_event(priv->pwrbtn->dev.parent, + PM_WAKEUP_TIME); + break; + default: + dev_dbg(dev, "ignored unknown event %.2x\n", byte); + break; + } + break; + + case CHAN_DEBUG: + if (byte == '\n') { + olpc_xo175_ec_flush_logbuf(priv); + } else if (isprint(byte)) { + priv->logbuf[priv->logbuf_len++] = byte; + if (priv->logbuf_len == LOG_BUF_SIZE) + olpc_xo175_ec_flush_logbuf(priv); + } + break; + + default: + dev_warn(dev, "unknown channel: %d, %.2x\n", channel, byte); + break; + } + + /* Most non-command packets get the TxFIFO refilled and an ACK. */ + olpc_xo175_ec_read_packet(priv); +} + +/* + * This function is protected with a mutex. We can safely assume that + * there will be only one instance of this function running at a time. + * One of the ways in which we enforce this is by waiting until we get + * all response bytes back from the EC, rather than just the number that + * the caller requests (otherwise, we might start a new command while an + * old command's response bytes are still incoming). + */ +static int olpc_xo175_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *resp, + size_t resp_len, void *ec_cb_arg) +{ + struct olpc_xo175_ec *priv = ec_cb_arg; + struct device *dev = &priv->spi->dev; + unsigned long flags; + size_t nr_bytes; + int ret = 0; + + dev_dbg(dev, "CMD %x, %zd bytes expected\n", cmd, resp_len); + + if (inlen > 5) { + dev_err(dev, "command len %zd too big!\n", resp_len); + return -EOVERFLOW; + } + + /* Suspending in the middle of an EC command hoses things badly! 
*/ + if (WARN_ON(priv->suspended)) + return -EBUSY; + + /* Ensure a valid command and return bytes */ + ret = olpc_xo175_ec_resp_len(cmd); + if (ret < 0) { + dev_err_ratelimited(dev, "unknown command 0x%x\n", cmd); + + /* + * Assume the best in our callers, and allow unknown commands + * through. I'm not the charitable type, but it was beaten + * into me. Just maintain a minimum standard of sanity. + */ + if (resp_len > sizeof(priv->resp_data)) { + dev_err(dev, "response too big: %zd!\n", resp_len); + return -EOVERFLOW; + } + nr_bytes = resp_len; + } else { + nr_bytes = (size_t)ret; + } + resp_len = min(resp_len, nr_bytes); + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + /* Initialize the state machine */ + init_completion(&priv->cmd_done); + priv->cmd_running = true; + priv->cmd_state = CMD_STATE_WAITING_FOR_SWITCH; + memset(&priv->cmd, 0, sizeof(priv->cmd)); + priv->cmd.command = cmd; + priv->cmd.nr_args = inlen; + priv->cmd.data_len = 0; + memcpy(priv->cmd.args, inbuf, inlen); + priv->expected_resp_len = nr_bytes; + priv->resp_len = 0; + + /* Tickle the cmd gpio to get things started */ + gpiod_set_value_cansleep(priv->gpio_cmd, 1); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + /* The irq handler should do the rest */ + if (!wait_for_completion_timeout(&priv->cmd_done, + msecs_to_jiffies(4000))) { + dev_err(dev, "EC cmd error: timeout in STATE %d\n", + priv->cmd_state); + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + spi_slave_abort(priv->spi); + olpc_xo175_ec_read_packet(priv); + return -ETIMEDOUT; + } + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + /* Deal with the results. */ + if (priv->cmd_state == CMD_STATE_ERROR_RECEIVED) { + /* EC-provided error is in the single response byte */ + dev_err(dev, "command 0x%x returned error 0x%x\n", + cmd, priv->resp_data[0]); + ret = -EREMOTEIO; + } else if (priv->resp_len != nr_bytes) { + dev_err(dev, "command 0x%x returned %d bytes, expected %zd bytes\n", + cmd, priv->resp_len, nr_bytes); + ret = -EREMOTEIO; + } else { + /* + * We may have 8 bytes in priv->resp, but we only care about + * what we've been asked for. If the caller asked for only 2 + * bytes, give them that. We've guaranteed that + * resp_len <= priv->resp_len and priv->resp_len == nr_bytes. + */ + memcpy(resp, priv->resp_data, resp_len); + } + + /* This should already be low, but just in case. */ + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + priv->cmd_running = false; + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + return ret; +} + +static int olpc_xo175_ec_set_event_mask(unsigned int mask) +{ + u8 args[2]; + + args[0] = mask >> 0; + args[1] = mask >> 8; + return olpc_ec_cmd(CMD_WRITE_EXT_SCI_MASK, args, 2, NULL, 0); +} + +static void olpc_xo175_ec_power_off(void) +{ + while (1) { + olpc_ec_cmd(CMD_POWER_OFF, NULL, 0, NULL, 0); + mdelay(1000); + } +} + +static int __maybe_unused olpc_xo175_ec_suspend(struct device *dev) +{ + struct olpc_xo175_ec *priv = dev_get_drvdata(dev); + static struct { + u8 suspend; + u32 suspend_count; + } __packed hintargs; + static unsigned int suspend_count; + + /* + * SOC_SLEEP is not wired to the EC on B3 and earlier boards. + * This command lets the EC know instead. The suspend count doesn't seem + * to be used anywhere but in the EC debug output. 
+ */ + hintargs.suspend = 1; + hintargs.suspend_count = suspend_count++; + olpc_ec_cmd(CMD_SUSPEND_HINT, (void *)&hintargs, sizeof(hintargs), + NULL, 0); + + /* + * After we've sent the suspend hint, don't allow further EC commands + * to be run until we've resumed. Userspace tasks should be frozen, + * but kernel threads and interrupts could still schedule EC commands. + */ + priv->suspended = true; + + return 0; +} + +static int __maybe_unused olpc_xo175_ec_resume_noirq(struct device *dev) +{ + struct olpc_xo175_ec *priv = dev_get_drvdata(dev); + + priv->suspended = false; + + return 0; +} + +static int __maybe_unused olpc_xo175_ec_resume(struct device *dev) +{ + u8 x = 0; + + /* + * The resume hint is only needed if no other commands are + * being sent during resume. all it does is tell the EC + * the SoC is definitely awake. + */ + olpc_ec_cmd(CMD_SUSPEND_HINT, &x, 1, NULL, 0); + + /* Enable all EC events while we're awake */ + olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS); + + return 0; +} + +static struct olpc_ec_driver olpc_xo175_ec_driver = { + .ec_cmd = olpc_xo175_ec_cmd, +}; + +static int olpc_xo175_ec_remove(struct spi_device *spi) +{ + if (pm_power_off == olpc_xo175_ec_power_off) + pm_power_off = NULL; + + spi_slave_abort(spi); + + platform_device_unregister(olpc_ec); + olpc_ec = NULL; + + return 0; +} + +static int olpc_xo175_ec_probe(struct spi_device *spi) +{ + struct olpc_xo175_ec *priv; + int ret; + + if (olpc_ec) { + dev_err(&spi->dev, "OLPC EC already registered.\n"); + return -EBUSY; + } + + priv = devm_kzalloc(&spi->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->gpio_cmd = devm_gpiod_get(&spi->dev, "cmd", GPIOD_OUT_LOW); + if (IS_ERR(priv->gpio_cmd)) { + dev_err(&spi->dev, "failed to get cmd gpio: %ld\n", + PTR_ERR(priv->gpio_cmd)); + return PTR_ERR(priv->gpio_cmd); + } + + priv->spi = spi; + + spin_lock_init(&priv->cmd_state_lock); + priv->cmd_state = CMD_STATE_IDLE; + init_completion(&priv->cmd_done); + + priv->logbuf_len = 0; + + /* Set up power button input device */ + priv->pwrbtn = devm_input_allocate_device(&spi->dev); + if (!priv->pwrbtn) + return -ENOMEM; + priv->pwrbtn->name = "Power Button"; + priv->pwrbtn->dev.parent = &spi->dev; + input_set_capability(priv->pwrbtn, EV_KEY, KEY_POWER); + ret = input_register_device(priv->pwrbtn); + if (ret) { + dev_err(&spi->dev, "error registering input device: %d\n", ret); + return ret; + } + + spi_set_drvdata(spi, priv); + + priv->xfer.rx_buf = &priv->rx_buf; + priv->xfer.tx_buf = &priv->tx_buf; + + olpc_xo175_ec_read_packet(priv); + + olpc_ec_driver_register(&olpc_xo175_ec_driver, priv); + olpc_ec = platform_device_register_resndata(&spi->dev, "olpc-ec", -1, + NULL, 0, NULL, 0); + + /* Enable all EC events while we're awake */ + olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS); + + if (pm_power_off == NULL) + pm_power_off = olpc_xo175_ec_power_off; + + dev_info(&spi->dev, "OLPC XO-1.75 Embedded Controller driver\n"); + + return 0; +} + +static const struct dev_pm_ops olpc_xo175_ec_pm_ops = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, olpc_xo175_ec_resume_noirq) + SET_RUNTIME_PM_OPS(olpc_xo175_ec_suspend, olpc_xo175_ec_resume, NULL) +}; + +static const struct of_device_id olpc_xo175_ec_of_match[] = { + { .compatible = "olpc,xo1.75-ec" }, + { } +}; +MODULE_DEVICE_TABLE(of, olpc_xo175_ec_of_match); + +static struct spi_driver olpc_xo175_ec_spi_driver = { + .driver = { + .name = "olpc-xo175-ec", + .of_match_table = olpc_xo175_ec_of_match, + .pm = &olpc_xo175_ec_pm_ops, + }, + .probe = olpc_xo175_ec_probe, + 
.remove = olpc_xo175_ec_remove, +}; +module_spi_driver(olpc_xo175_ec_spi_driver); + +MODULE_DESCRIPTION("OLPC XO-1.75 Embedded Controller driver"); +MODULE_AUTHOR("Lennert Buytenhek "); /* Functionality */ +MODULE_AUTHOR("Lubomir Rintel "); /* Bugs */ +MODULE_LICENSE("GPL"); diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h index f7b6a7eda232..c4602364e909 100644 --- a/include/linux/olpc-ec.h +++ b/include/linux/olpc-ec.h @@ -41,7 +41,7 @@ struct olpc_ec_driver { bool wakeup_available; }; -#ifdef CONFIG_OLPC +#ifdef CONFIG_OLPC_EC extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg); @@ -69,6 +69,6 @@ static inline bool olpc_ec_wakeup_available(void) return false; } -#endif /* CONFIG_OLPC */ +#endif /* CONFIG_OLPC_EC */ #endif /* _LINUX_OLPC_EC_H */ -- cgit v1.2.3-59-g8ed1b From 9a0f780958bbcb85604636fa340e2a1efaa4f432 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 13 May 2019 13:39:51 +0200 Subject: dmaengine: sudmac: remove unused driver SUDMAC driver was introduced in v3.10 but was never integrated for use by any platform. As it is unused remove it. Signed-off-by: Simon Horman Acked-by: Yoshihiro Shimoda Signed-off-by: Vinod Koul --- drivers/dma/sh/Kconfig | 6 - drivers/dma/sh/Makefile | 1 - drivers/dma/sh/sudmac.c | 414 ------------------------------------------------ include/linux/sudmac.h | 52 ------ 4 files changed, 473 deletions(-) delete mode 100644 drivers/dma/sh/sudmac.c delete mode 100644 include/linux/sudmac.h (limited to 'include/linux') diff --git a/drivers/dma/sh/Kconfig b/drivers/dma/sh/Kconfig index 4d6b02b3b1f1..54d5d0369d3c 100644 --- a/drivers/dma/sh/Kconfig +++ b/drivers/dma/sh/Kconfig @@ -47,9 +47,3 @@ config RENESAS_USB_DMAC help This driver supports the USB-DMA controller found in the Renesas SoCs. - -config SUDMAC - tristate "Renesas SUDMAC support" - depends on SH_DMAE_BASE - help - Enable support for the Renesas SUDMAC controllers. diff --git a/drivers/dma/sh/Makefile b/drivers/dma/sh/Makefile index 42110dd57a56..112fbd22bb3f 100644 --- a/drivers/dma/sh/Makefile +++ b/drivers/dma/sh/Makefile @@ -15,4 +15,3 @@ obj-$(CONFIG_SH_DMAE) += shdma.o obj-$(CONFIG_RCAR_DMAC) += rcar-dmac.o obj-$(CONFIG_RENESAS_USB_DMAC) += usb-dmac.o -obj-$(CONFIG_SUDMAC) += sudmac.o diff --git a/drivers/dma/sh/sudmac.c b/drivers/dma/sh/sudmac.c deleted file mode 100644 index 30cc3553cb8b..000000000000 --- a/drivers/dma/sh/sudmac.c +++ /dev/null @@ -1,414 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Renesas SUDMAC support - * - * Copyright (C) 2013 Renesas Solutions Corp. - * - * based on drivers/dma/sh/shdma.c: - * Copyright (C) 2011-2012 Guennadi Liakhovetski - * Copyright (C) 2009 Nobuhiro Iwamatsu - * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved. - * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct sudmac_chan { - struct shdma_chan shdma_chan; - void __iomem *base; - char dev_id[16]; /* unique name per DMAC of channel */ - - u32 offset; /* for CFG, BA, BBC, CA, CBC, DEN */ - u32 cfg; - u32 dint_end_bit; -}; - -struct sudmac_device { - struct shdma_dev shdma_dev; - struct sudmac_pdata *pdata; - void __iomem *chan_reg; -}; - -struct sudmac_regs { - u32 base_addr; - u32 base_byte_count; -}; - -struct sudmac_desc { - struct sudmac_regs hw; - struct shdma_desc shdma_desc; -}; - -#define to_chan(schan) container_of(schan, struct sudmac_chan, shdma_chan) -#define to_desc(sdesc) container_of(sdesc, struct sudmac_desc, shdma_desc) -#define to_sdev(sc) container_of(sc->shdma_chan.dma_chan.device, \ - struct sudmac_device, shdma_dev.dma_dev) - -/* SUDMAC register */ -#define SUDMAC_CH0CFG 0x00 -#define SUDMAC_CH0BA 0x10 -#define SUDMAC_CH0BBC 0x18 -#define SUDMAC_CH0CA 0x20 -#define SUDMAC_CH0CBC 0x28 -#define SUDMAC_CH0DEN 0x30 -#define SUDMAC_DSTSCLR 0x38 -#define SUDMAC_DBUFCTRL 0x3C -#define SUDMAC_DINTCTRL 0x40 -#define SUDMAC_DINTSTS 0x44 -#define SUDMAC_DINTSTSCLR 0x48 -#define SUDMAC_CH0SHCTRL 0x50 - -/* Definitions for the sudmac_channel.config */ -#define SUDMAC_SENDBUFM 0x1000 /* b12: Transmit Buffer Mode */ -#define SUDMAC_RCVENDM 0x0100 /* b8: Receive Data Transfer End Mode */ -#define SUDMAC_LBA_WAIT 0x0030 /* b5-4: Local Bus Access Wait */ - -/* Definitions for the sudmac_channel.dint_end_bit */ -#define SUDMAC_CH1ENDE 0x0002 /* b1: Ch1 DMA Transfer End Int Enable */ -#define SUDMAC_CH0ENDE 0x0001 /* b0: Ch0 DMA Transfer End Int Enable */ - -#define SUDMAC_DRV_NAME "sudmac" - -static void sudmac_writel(struct sudmac_chan *sc, u32 data, u32 reg) -{ - iowrite32(data, sc->base + reg); -} - -static u32 sudmac_readl(struct sudmac_chan *sc, u32 reg) -{ - return ioread32(sc->base + reg); -} - -static bool sudmac_is_busy(struct sudmac_chan *sc) -{ - u32 den = sudmac_readl(sc, SUDMAC_CH0DEN + sc->offset); - - if (den) - return true; /* working */ - - return false; /* waiting */ -} - -static void sudmac_set_reg(struct sudmac_chan *sc, struct sudmac_regs *hw, - struct shdma_desc *sdesc) -{ - sudmac_writel(sc, sc->cfg, SUDMAC_CH0CFG + sc->offset); - sudmac_writel(sc, hw->base_addr, SUDMAC_CH0BA + sc->offset); - sudmac_writel(sc, hw->base_byte_count, SUDMAC_CH0BBC + sc->offset); -} - -static void sudmac_start(struct sudmac_chan *sc) -{ - u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL); - - sudmac_writel(sc, dintctrl | sc->dint_end_bit, SUDMAC_DINTCTRL); - sudmac_writel(sc, 1, SUDMAC_CH0DEN + sc->offset); -} - -static void sudmac_start_xfer(struct shdma_chan *schan, - struct shdma_desc *sdesc) -{ - struct sudmac_chan *sc = to_chan(schan); - struct sudmac_desc *sd = to_desc(sdesc); - - sudmac_set_reg(sc, &sd->hw, sdesc); - sudmac_start(sc); -} - -static bool sudmac_channel_busy(struct shdma_chan *schan) -{ - struct sudmac_chan *sc = to_chan(schan); - - return sudmac_is_busy(sc); -} - -static void sudmac_setup_xfer(struct shdma_chan *schan, int slave_id) -{ -} - -static const struct sudmac_slave_config *sudmac_find_slave( - struct sudmac_chan *sc, int slave_id) -{ - struct sudmac_device *sdev = to_sdev(sc); - struct sudmac_pdata *pdata = sdev->pdata; - const struct sudmac_slave_config *cfg; - int i; - - for (i = 0, cfg = pdata->slave; i < pdata->slave_num; i++, cfg++) - if (cfg->slave_id == slave_id) - return cfg; - - return NULL; -} - -static int sudmac_set_slave(struct shdma_chan *schan, int 
slave_id, - dma_addr_t slave_addr, bool try) -{ - struct sudmac_chan *sc = to_chan(schan); - const struct sudmac_slave_config *cfg = sudmac_find_slave(sc, slave_id); - - if (!cfg) - return -ENODEV; - - return 0; -} - -static inline void sudmac_dma_halt(struct sudmac_chan *sc) -{ - u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL); - - sudmac_writel(sc, 0, SUDMAC_CH0DEN + sc->offset); - sudmac_writel(sc, dintctrl & ~sc->dint_end_bit, SUDMAC_DINTCTRL); - sudmac_writel(sc, sc->dint_end_bit, SUDMAC_DINTSTSCLR); -} - -static int sudmac_desc_setup(struct shdma_chan *schan, - struct shdma_desc *sdesc, - dma_addr_t src, dma_addr_t dst, size_t *len) -{ - struct sudmac_chan *sc = to_chan(schan); - struct sudmac_desc *sd = to_desc(sdesc); - - dev_dbg(sc->shdma_chan.dev, "%s: src=%pad, dst=%pad, len=%zu\n", - __func__, &src, &dst, *len); - - if (*len > schan->max_xfer_len) - *len = schan->max_xfer_len; - - if (dst) - sd->hw.base_addr = dst; - else if (src) - sd->hw.base_addr = src; - sd->hw.base_byte_count = *len; - - return 0; -} - -static void sudmac_halt(struct shdma_chan *schan) -{ - struct sudmac_chan *sc = to_chan(schan); - - sudmac_dma_halt(sc); -} - -static bool sudmac_chan_irq(struct shdma_chan *schan, int irq) -{ - struct sudmac_chan *sc = to_chan(schan); - u32 dintsts = sudmac_readl(sc, SUDMAC_DINTSTS); - - if (!(dintsts & sc->dint_end_bit)) - return false; - - /* DMA stop */ - sudmac_dma_halt(sc); - - return true; -} - -static size_t sudmac_get_partial(struct shdma_chan *schan, - struct shdma_desc *sdesc) -{ - struct sudmac_chan *sc = to_chan(schan); - struct sudmac_desc *sd = to_desc(sdesc); - u32 current_byte_count = sudmac_readl(sc, SUDMAC_CH0CBC + sc->offset); - - return sd->hw.base_byte_count - current_byte_count; -} - -static bool sudmac_desc_completed(struct shdma_chan *schan, - struct shdma_desc *sdesc) -{ - struct sudmac_chan *sc = to_chan(schan); - struct sudmac_desc *sd = to_desc(sdesc); - u32 current_addr = sudmac_readl(sc, SUDMAC_CH0CA + sc->offset); - - return sd->hw.base_addr + sd->hw.base_byte_count == current_addr; -} - -static int sudmac_chan_probe(struct sudmac_device *su_dev, int id, int irq, - unsigned long flags) -{ - struct shdma_dev *sdev = &su_dev->shdma_dev; - struct platform_device *pdev = to_platform_device(sdev->dma_dev.dev); - struct sudmac_chan *sc; - struct shdma_chan *schan; - int err; - - sc = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_chan), GFP_KERNEL); - if (!sc) - return -ENOMEM; - - schan = &sc->shdma_chan; - schan->max_xfer_len = 64 * 1024 * 1024 - 1; - - shdma_chan_probe(sdev, schan, id); - - sc->base = su_dev->chan_reg; - - /* get platform_data */ - sc->offset = su_dev->pdata->channel->offset; - if (su_dev->pdata->channel->config & SUDMAC_TX_BUFFER_MODE) - sc->cfg |= SUDMAC_SENDBUFM; - if (su_dev->pdata->channel->config & SUDMAC_RX_END_MODE) - sc->cfg |= SUDMAC_RCVENDM; - sc->cfg |= (su_dev->pdata->channel->wait << 4) & SUDMAC_LBA_WAIT; - - if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH0) - sc->dint_end_bit |= SUDMAC_CH0ENDE; - if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH1) - sc->dint_end_bit |= SUDMAC_CH1ENDE; - - /* set up channel irq */ - if (pdev->id >= 0) - snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d.%d", - pdev->id, id); - else - snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d", id); - - err = shdma_request_irq(schan, irq, flags, sc->dev_id); - if (err) { - dev_err(sdev->dma_dev.dev, - "DMA channel %d request_irq failed %d\n", id, err); - goto err_no_irq; - } - - return 0; - -err_no_irq: - /* remove 
from dmaengine device node */ - shdma_chan_remove(schan); - return err; -} - -static void sudmac_chan_remove(struct sudmac_device *su_dev) -{ - struct shdma_chan *schan; - int i; - - shdma_for_each_chan(schan, &su_dev->shdma_dev, i) { - BUG_ON(!schan); - - shdma_chan_remove(schan); - } -} - -static dma_addr_t sudmac_slave_addr(struct shdma_chan *schan) -{ - /* SUDMAC doesn't need the address */ - return 0; -} - -static struct shdma_desc *sudmac_embedded_desc(void *buf, int i) -{ - return &((struct sudmac_desc *)buf)[i].shdma_desc; -} - -static const struct shdma_ops sudmac_shdma_ops = { - .desc_completed = sudmac_desc_completed, - .halt_channel = sudmac_halt, - .channel_busy = sudmac_channel_busy, - .slave_addr = sudmac_slave_addr, - .desc_setup = sudmac_desc_setup, - .set_slave = sudmac_set_slave, - .setup_xfer = sudmac_setup_xfer, - .start_xfer = sudmac_start_xfer, - .embedded_desc = sudmac_embedded_desc, - .chan_irq = sudmac_chan_irq, - .get_partial = sudmac_get_partial, -}; - -static int sudmac_probe(struct platform_device *pdev) -{ - struct sudmac_pdata *pdata = dev_get_platdata(&pdev->dev); - int err, i; - struct sudmac_device *su_dev; - struct dma_device *dma_dev; - struct resource *chan, *irq_res; - - /* get platform data */ - if (!pdata) - return -ENODEV; - - irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!irq_res) - return -ENODEV; - - err = -ENOMEM; - su_dev = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_device), - GFP_KERNEL); - if (!su_dev) - return err; - - dma_dev = &su_dev->shdma_dev.dma_dev; - - chan = platform_get_resource(pdev, IORESOURCE_MEM, 0); - su_dev->chan_reg = devm_ioremap_resource(&pdev->dev, chan); - if (IS_ERR(su_dev->chan_reg)) - return PTR_ERR(su_dev->chan_reg); - - dma_cap_set(DMA_SLAVE, dma_dev->cap_mask); - - su_dev->shdma_dev.ops = &sudmac_shdma_ops; - su_dev->shdma_dev.desc_size = sizeof(struct sudmac_desc); - err = shdma_init(&pdev->dev, &su_dev->shdma_dev, pdata->channel_num); - if (err < 0) - return err; - - /* platform data */ - su_dev->pdata = dev_get_platdata(&pdev->dev); - - platform_set_drvdata(pdev, su_dev); - - /* Create DMA Channel */ - for (i = 0; i < pdata->channel_num; i++) { - err = sudmac_chan_probe(su_dev, i, irq_res->start, IRQF_SHARED); - if (err) - goto chan_probe_err; - } - - err = dma_async_device_register(&su_dev->shdma_dev.dma_dev); - if (err < 0) - goto chan_probe_err; - - return err; - -chan_probe_err: - sudmac_chan_remove(su_dev); - - shdma_cleanup(&su_dev->shdma_dev); - - return err; -} - -static int sudmac_remove(struct platform_device *pdev) -{ - struct sudmac_device *su_dev = platform_get_drvdata(pdev); - struct dma_device *dma_dev = &su_dev->shdma_dev.dma_dev; - - dma_async_device_unregister(dma_dev); - sudmac_chan_remove(su_dev); - shdma_cleanup(&su_dev->shdma_dev); - - return 0; -} - -static struct platform_driver sudmac_driver = { - .driver = { - .name = SUDMAC_DRV_NAME, - }, - .probe = sudmac_probe, - .remove = sudmac_remove, -}; -module_platform_driver(sudmac_driver); - -MODULE_AUTHOR("Yoshihiro Shimoda"); -MODULE_DESCRIPTION("Renesas SUDMAC driver"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:" SUDMAC_DRV_NAME); diff --git a/include/linux/sudmac.h b/include/linux/sudmac.h deleted file mode 100644 index 377b8a5788fa..000000000000 --- a/include/linux/sudmac.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Header for the SUDMAC driver - * - * Copyright (C) 2013 Renesas Solutions Corp. 
- * - * This is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - */ -#ifndef SUDMAC_H -#define SUDMAC_H - -#include -#include -#include - -/* Used by slave DMA clients to request DMA to/from a specific peripheral */ -struct sudmac_slave { - struct shdma_slave shdma_slave; /* Set by the platform */ -}; - -/* - * Supplied by platforms to specify, how a DMA channel has to be configured for - * a certain peripheral - */ -struct sudmac_slave_config { - int slave_id; -}; - -struct sudmac_channel { - unsigned long offset; - unsigned long config; - unsigned long wait; /* The configuable range is 0 to 3 */ - unsigned long dint_end_bit; -}; - -struct sudmac_pdata { - const struct sudmac_slave_config *slave; - int slave_num; - const struct sudmac_channel *channel; - int channel_num; -}; - -/* Definitions for the sudmac_channel.config */ -#define SUDMAC_TX_BUFFER_MODE BIT(0) -#define SUDMAC_RX_END_MODE BIT(1) - -/* Definitions for the sudmac_channel.dint_end_bit */ -#define SUDMAC_DMA_BIT_CH0 BIT(0) -#define SUDMAC_DMA_BIT_CH1 BIT(1) - -#endif -- cgit v1.2.3-59-g8ed1b From 7e5f7bb08b8cefd3a7e8961861f47fe1f0e830d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 May 2019 13:44:57 +0100 Subject: unexport simple_dname() Signed-off-by: Al Viro --- fs/d_path.c | 1 - fs/internal.h | 1 + include/linux/dcache.h | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/d_path.c b/fs/d_path.c index e8fce6b1174f..a7d0a96b35ce 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -316,7 +316,6 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) end = ERR_PTR(-ENAMETOOLONG); return end; } -EXPORT_SYMBOL(simple_dname); /* * Write full pathname from the root of the filesystem into the buffer. diff --git a/fs/internal.h b/fs/internal.h index 0010889f2e85..1ac2b8f6c621 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -160,6 +160,7 @@ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); +extern char *simple_dname(struct dentry *, char *, int); /* * read_write.c diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f14e587c5d5d..361305ddd75e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -291,7 +291,6 @@ static inline unsigned d_count(const struct dentry *dentry) */ extern __printf(4, 5) char *dynamic_dname(struct dentry *, char *, int, const char *, ...); -extern char *simple_dname(struct dentry *, char *, int); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); -- cgit v1.2.3-59-g8ed1b From f13e143e7444bffc53f5c2904aeed76646da69d6 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 3 Jul 2018 16:42:26 +0200 Subject: dma-buf: start caching of sg_table objects v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To allow a smooth transition from pinning buffer objects to dynamic invalidation we first start to cache the sg_table for an attachment. 
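For illustration only (not part of the patch): a minimal sketch of an
exporter opting in to the caching added below. The my_* names are
hypothetical, and the remaining required dma_buf_ops callbacks are
omitted.

#include <linux/dma-buf.h>

static struct sg_table *my_map_dma_buf(struct dma_buf_attachment *attach,
				       enum dma_data_direction dir)
{
	/* Build and DMA-map an sg_table for the backing pages here. */
	return NULL;	/* sketch only */
}

static void my_unmap_dma_buf(struct dma_buf_attachment *attach,
			     struct sg_table *sgt,
			     enum dma_data_direction dir)
{
	/* Undo the mapping and free the sg_table here. */
}

static const struct dma_buf_ops my_dmabuf_ops = {
	.cache_sgt_mapping = true,	/* dma-buf core caches the first mapping */
	.map_dma_buf	   = my_map_dma_buf,
	.unmap_dma_buf	   = my_unmap_dma_buf,
};

With cache_sgt_mapping set, repeated dma_buf_map_attachment() calls on
one attachment return the cached table (a mismatched, non-bidirectional
direction is rejected with -EBUSY), and the cached mapping is only torn
down when the attachment is detached.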
v2: keep closer to the DRM implementation Signed-off-by: Christian König Reviewed-by: Daniel Vetter Link: https://patchwork.kernel.org/patch/10943053/ --- drivers/dma-buf/dma-buf.c | 27 +++++++++++++++++++++++++-- include/linux/dma-buf.h | 13 +++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 3ae6c0c2cc02..f4104a21b069 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -576,6 +576,7 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, list_add(&attach->node, &dmabuf->attachments); mutex_unlock(&dmabuf->lock); + return attach; err_attach: @@ -598,6 +599,9 @@ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach) if (WARN_ON(!dmabuf || !attach)) return; + if (attach->sgt) + dmabuf->ops->unmap_dma_buf(attach, attach->sgt, attach->dir); + mutex_lock(&dmabuf->lock); list_del(&attach->node); if (dmabuf->ops->detach) @@ -633,10 +637,27 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, if (WARN_ON(!attach || !attach->dmabuf)) return ERR_PTR(-EINVAL); + if (attach->sgt) { + /* + * Two mappings with different directions for the same + * attachment are not allowed. + */ + if (attach->dir != direction && + attach->dir != DMA_BIDIRECTIONAL) + return ERR_PTR(-EBUSY); + + return attach->sgt; + } + sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction); if (!sg_table) sg_table = ERR_PTR(-ENOMEM); + if (!IS_ERR(sg_table) && attach->dmabuf->ops->cache_sgt_mapping) { + attach->sgt = sg_table; + attach->dir = direction; + } + return sg_table; } EXPORT_SYMBOL_GPL(dma_buf_map_attachment); @@ -660,8 +681,10 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach, if (WARN_ON(!attach || !attach->dmabuf || !sg_table)) return; - attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, - direction); + if (attach->sgt == sg_table) + return; + + attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); } EXPORT_SYMBOL_GPL(dma_buf_unmap_attachment); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index a0bd071466fc..8a327566d7f4 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -44,6 +44,15 @@ struct dma_buf_attachment; * @vunmap: [optional] unmaps a vmap from the buffer */ struct dma_buf_ops { + /** + * @cache_sgt_mapping: + * + * If true the framework will cache the first mapping made for each + * attachment. This avoids creating mappings for attachments multiple + * times. + */ + bool cache_sgt_mapping; + /** * @attach: * @@ -323,6 +332,8 @@ struct dma_buf { * @dmabuf: buffer for this attachment. * @dev: device attached to the buffer. * @node: list of dma_buf_attachment. + * @sgt: cached mapping. + * @dir: direction of cached mapping. * @priv: exporter specific attachment data. * * This structure holds the attachment information between the dma_buf buffer @@ -338,6 +349,8 @@ struct dma_buf_attachment { struct dma_buf *dmabuf; struct device *dev; struct list_head node; + struct sg_table *sgt; + enum dma_data_direction dir; void *priv; }; -- cgit v1.2.3-59-g8ed1b From fbb5d0353c62d10c3699ec844d2d015a762952d7 Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Thu, 16 May 2019 19:40:06 +0530 Subject: drm: Add HDR source metadata property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a blob property to get HDR metadata information from userspace. This will be send as part of AVI Infoframe to panel. 
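For context, a hedged userspace-side sketch (not part of this patch,
and assuming libdrm): creating the blob and attaching it to the new
property. fd, connector_id and prop_id are assumed to be looked up
already, and the EOTF and luminance values are placeholders.

#include <stdint.h>
#include <drm/drm_mode.h>	/* struct hdr_output_metadata (uapi) */
#include <xf86drmMode.h>

static int set_hdr_metadata(int fd, uint32_t connector_id, uint32_t prop_id)
{
	struct hdr_output_metadata meta = {
		.metadata_type = 0,		/* HDMI_STATIC_METADATA_TYPE1 */
		.hdmi_metadata_type1 = {
			.eotf = 2,		/* SMPTE ST 2084 (PQ) */
			.max_cll = 1000,	/* placeholder, in nits */
			.max_fall = 400,	/* placeholder, in nits */
		},
	};
	uint32_t blob_id;
	int ret;

	ret = drmModeCreatePropertyBlob(fd, &meta, sizeof(meta), &blob_id);
	if (ret)
		return ret;

	return drmModeConnectorSetProperty(fd, connector_id, prop_id, blob_id);
}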
It also implements get() and set() functions for the HDR output metadata
property. The blob data is received from userspace and saved in the
connector state; the same is returned as a blob in the get-property call
to userspace.

v2: Rebase and modified the metadata structure elements as per Ville's
POC changes.
v3: No Change
v4: Addressed Shashank's review comments
v5: Rebase.
v6: Addressed Brian Starkey's review comments, defined new structure
with header for dynamic metadata scalability. Merge get/set property
functions for metadata in this patch.
v7: Addressed Jonas Karlman's review comments and defined separate
structure for infoframe to better align with CTA 861.G spec. Added
Shashank's RB.
v8: Addressed Ville's review comments. Moved sink metadata structure
out of uapi headers as suggested by Jonas Karlman.
v9: Rebase and addressed Jonas Karlman's review comments.
v10: Addressed Ville's review comments, dropped the metdata_changed
state variable as it's not needed anymore.

Signed-off-by: Uma Shankar
Reviewed-by: Shashank Sharma
Signed-off-by: Ville Syrjälä
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-2-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_atomic_uapi.c | 12 ++++++++++++
 drivers/gpu/drm/drm_connector.c   |  6 ++++++
 include/drm/drm_connector.h       | 10 ++++++++++
 include/drm/drm_mode_config.h     |  7 +++++++
 include/linux/hdmi.h              | 26 ++++++++++++++++++++++++++
 include/uapi/drm/drm_mode.h       | 23 +++++++++++++++++++++++
 6 files changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index 428d82662dc4..125605ff45af 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -676,6 +676,8 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 {
 	struct drm_device *dev = connector->dev;
 	struct drm_mode_config *config = &dev->mode_config;
+	bool replaced = false;
+	int ret;
 
 	if (property == config->prop_crtc_id) {
 		struct drm_crtc *crtc = drm_crtc_find(dev, file_priv, val);
@@ -726,6 +728,13 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 		 */
 		if (state->link_status != DRM_LINK_STATUS_GOOD)
 			state->link_status = val;
+	} else if (property == config->hdr_output_metadata_property) {
+		ret = drm_atomic_replace_property_blob_from_id(dev,
+				&state->hdr_output_metadata,
+				val,
+				sizeof(struct hdr_output_metadata), -1,
+				&replaced);
+		return ret;
 	} else if (property == config->aspect_ratio_property) {
 		state->picture_aspect_ratio = val;
 	} else if (property == config->content_type_property) {
@@ -814,6 +823,9 @@ drm_atomic_connector_get_property(struct drm_connector *connector,
 		*val = state->colorspace;
 	} else if (property == connector->scaling_mode_property) {
 		*val = state->scaling_mode;
+	} else if (property == config->hdr_output_metadata_property) {
+		*val = state->hdr_output_metadata ?
+			state->hdr_output_metadata->base.id : 0;
 	} else if (property == connector->content_protection_property) {
 		*val = state->content_protection;
 	} else if (property == config->writeback_fb_id_property) {

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index b34c3d38bf15..365ace0c0c9e 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1058,6 +1058,12 @@ int drm_connector_create_standard_properties(struct drm_device *dev)
 		return -ENOMEM;
 	dev->mode_config.non_desktop_property = prop;
 
+	prop = drm_property_create(dev, DRM_MODE_PROP_BLOB,
+				   "HDR_OUTPUT_METADATA", 0);
+	if (!prop)
+		return -ENOMEM;
+	dev->mode_config.hdr_output_metadata_property = prop;
+
 	return 0;
 }

diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index f43f40d5888a..f0e987df4c1e 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -603,6 +603,12 @@ struct drm_connector_state {
 	 * and the connector bpc limitations obtained from edid.
 	 */
 	u8 max_bpc;
+
+	/**
+	 * @hdr_output_metadata:
+	 * DRM blob property for HDR output metadata
+	 */
+	struct drm_property_blob *hdr_output_metadata;
 };
 
 /**
@@ -1243,6 +1249,10 @@ struct drm_connector {
 	 * &drm_mode_config.connector_free_work.
 	 */
 	struct llist_node free_node;
+
+	/* HDR metadata */
+	struct hdr_output_metadata hdr_output_metadata;
+	struct hdr_sink_metadata hdr_sink_metadata;
 };
 
 #define obj_to_connector(x) container_of(x, struct drm_connector, base)

diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 7f60e8eb269a..c031b5a9d8d1 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -836,6 +836,13 @@ struct drm_mode_config {
 	 */
 	struct drm_property *writeback_out_fence_ptr_property;
+
+	/**
+	 * hdr_output_metadata_property: Connector property containing hdr
+	 * metadata.
This will be provided by userspace compositors based + * on HDR content + */ + struct drm_property *hdr_output_metadata_property; + /* dumb ioctl parameters */ uint32_t preferred_depth, prefer_shadow; diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 927ad6451105..6780476dcbff 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -152,6 +152,16 @@ enum hdmi_content_type { HDMI_CONTENT_TYPE_GAME, }; +enum hdmi_metadata_type { + HDMI_STATIC_METADATA_TYPE1 = 1, +}; + +enum hdmi_eotf { + HDMI_EOTF_TRADITIONAL_GAMMA_SDR, + HDMI_EOTF_TRADITIONAL_GAMMA_HDR, + HDMI_EOTF_SMPTE_ST2084, +}; + struct hdmi_avi_infoframe { enum hdmi_infoframe_type type; unsigned char version; @@ -320,6 +330,22 @@ struct hdmi_vendor_infoframe { unsigned int s3d_ext_data; }; +/* HDR Metadata as per 861.G spec */ +struct hdr_static_metadata { + __u8 eotf; + __u8 metadata_type; + __u16 max_cll; + __u16 max_fall; + __u16 min_cll; +}; + +struct hdr_sink_metadata { + __u32 metadata_type; + union { + struct hdr_static_metadata hdmi_type1; + }; +}; + int hdmi_vendor_infoframe_init(struct hdmi_vendor_infoframe *frame); ssize_t hdmi_vendor_infoframe_pack(struct hdmi_vendor_infoframe *frame, void *buffer, size_t size); diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 83cd1636b9be..997a7e05c0c6 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -630,6 +630,29 @@ struct drm_color_lut { __u16 reserved; }; +/* HDR Metadata Infoframe as per 861.G spec */ +struct hdr_metadata_infoframe { + __u8 eotf; + __u8 metadata_type; + struct { + __u16 x, y; + } display_primaries[3]; + struct { + __u16 x, y; + } white_point; + __u16 max_display_mastering_luminance; + __u16 min_display_mastering_luminance; + __u16 max_cll; + __u16 max_fall; +}; + +struct hdr_output_metadata { + __u32 metadata_type; + union { + struct hdr_metadata_infoframe hdmi_metadata_type1; + }; +}; + #define DRM_MODE_PAGE_FLIP_EVENT 0x01 #define DRM_MODE_PAGE_FLIP_ASYNC 0x02 #define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 -- cgit v1.2.3-59-g8ed1b From 2cdbfd66a82969770ce1a7032fb1e2155a08cee8 Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Thu, 16 May 2019 19:40:09 +0530 Subject: drm: Enable HDR infoframe support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable Dynamic Range and Mastering Infoframe for HDR content, which is defined in CEA 861.3 spec. The metadata will be computed based on blending policy in userspace compositors and passed as a connector property blob to driver. The same will be sent as infoframe to panel which support HDR. Added the const version of infoframe for DRM metadata for HDR. v2: Rebase and added Ville's POC changes. v3: No Change v4: Addressed Shashank's review comments and merged the patch making drm infoframe function arguments as constant. v5: Rebase v6: Fixed checkpatch warnings with --strict option. Addressed Shashank's review comments and added his RB. v7: Addressed Brian Starkey's review comments. Merged 2 patches into one. v8: Addressed Jonas Karlman review comments. v9: Addressed Jonas Karlman review comments. v10: Addressed Ville's review comments. v11: Added BUILD_BUG_ON and sizeof instead of magic numbers as per Ville's comments. 
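Ahead of the diff, a hedged sketch of how an HDMI encoder driver might
consume the helpers added by this patch; the encoder hook and the final
register write are hypothetical.

static void my_encoder_set_hdr_infoframe(struct drm_encoder *encoder,
					 const struct drm_connector_state *conn_state)
{
	struct hdmi_drm_infoframe frame;
	u8 buf[HDMI_INFOFRAME_SIZE(DRM)];
	ssize_t len;

	if (drm_hdmi_infoframe_set_hdr_metadata(&frame, conn_state))
		return;		/* no usable HDR metadata in this state */

	len = hdmi_drm_infoframe_pack(&frame, buf, sizeof(buf));
	if (len < 0)
		return;

	/* Write buf[0..len-1] into the hardware infoframe buffer here. */
}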
Signed-off-by: Uma Shankar
Reviewed-by: Shashank Sharma
Signed-off-by: Ville Syrjälä
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-5-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_edid.c |  72 +++++++++++++++++
 drivers/video/hdmi.c       | 190 +++++++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_edid.h     |   5 ++
 include/linux/hdmi.h       |  28 +++
 4 files changed, 295 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index a5ef9f45fee0..73560c9437cd 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -4904,6 +4904,78 @@ static bool is_hdmi2_sink(struct drm_connector *connector)
 		connector->display_info.color_formats & DRM_COLOR_FORMAT_YCRCB420;
 }
 
+static inline bool is_eotf_supported(u8 output_eotf, u8 sink_eotf)
+{
+	return sink_eotf & BIT(output_eotf);
+}
+
+/**
+ * drm_hdmi_infoframe_set_hdr_metadata() - fill an HDMI DRM infoframe with
+ * HDR metadata from userspace
+ * @frame: HDMI DRM infoframe
+ * @conn_state: connector state, carrying the HDR metadata blob from userspace
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int
+drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame,
+				    const struct drm_connector_state *conn_state)
+{
+	struct drm_connector *connector;
+	struct hdr_output_metadata *hdr_metadata;
+	int err;
+
+	if (!frame || !conn_state)
+		return -EINVAL;
+
+	connector = conn_state->connector;
+
+	if (!conn_state->hdr_output_metadata)
+		return -EINVAL;
+
+	hdr_metadata = conn_state->hdr_output_metadata->data;
+
+	if (!hdr_metadata || !connector)
+		return -EINVAL;
+
+	/* Sink EOTF is a bit map while the infoframe carries absolute values */
+	if (!is_eotf_supported(hdr_metadata->hdmi_metadata_type1.eotf,
+			       connector->hdr_sink_metadata.hdmi_type1.eotf)) {
+		DRM_DEBUG_KMS("EOTF Not Supported\n");
+		return -EINVAL;
+	}
+
+	err = hdmi_drm_infoframe_init(frame);
+	if (err < 0)
+		return err;
+
+	frame->eotf = hdr_metadata->hdmi_metadata_type1.eotf;
+	frame->metadata_type = hdr_metadata->hdmi_metadata_type1.metadata_type;
+
+	BUILD_BUG_ON(sizeof(frame->display_primaries) !=
+		     sizeof(hdr_metadata->hdmi_metadata_type1.display_primaries));
+	BUILD_BUG_ON(sizeof(frame->white_point) !=
+		     sizeof(hdr_metadata->hdmi_metadata_type1.white_point));
+
+	memcpy(&frame->display_primaries,
+	       &hdr_metadata->hdmi_metadata_type1.display_primaries,
+	       sizeof(frame->display_primaries));
+
+	memcpy(&frame->white_point,
+	       &hdr_metadata->hdmi_metadata_type1.white_point,
+	       sizeof(frame->white_point));
+
+	frame->max_display_mastering_luminance =
+		hdr_metadata->hdmi_metadata_type1.max_display_mastering_luminance;
+	frame->min_display_mastering_luminance =
+		hdr_metadata->hdmi_metadata_type1.min_display_mastering_luminance;
+	frame->max_fall = hdr_metadata->hdmi_metadata_type1.max_fall;
+	frame->max_cll = hdr_metadata->hdmi_metadata_type1.max_cll;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_hdmi_infoframe_set_hdr_metadata);
+
 /**
  * drm_hdmi_avi_infoframe_from_display_mode() - fill an HDMI AVI infoframe with
  * data from a DRM display mode

diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c
index 799ae49774f5..481f0367dfd3 100644
--- a/drivers/video/hdmi.c
+++ b/drivers/video/hdmi.c
@@ -650,6 +650,150 @@ hdmi_vendor_any_infoframe_check_only(const union hdmi_vendor_any_infoframe *frame)
 	return 0;
 }
 
+/**
+ * hdmi_drm_infoframe_init() - initialize an HDMI Dynamic Range and
+ * mastering infoframe
+ * @frame: HDMI DRM infoframe
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */ +int hdmi_drm_infoframe_init(struct hdmi_drm_infoframe *frame) +{ + memset(frame, 0, sizeof(*frame)); + + frame->type = HDMI_INFOFRAME_TYPE_DRM; + frame->version = 1; + frame->length = HDMI_DRM_INFOFRAME_SIZE; + + return 0; +} +EXPORT_SYMBOL(hdmi_drm_infoframe_init); + +static int hdmi_drm_infoframe_check_only(const struct hdmi_drm_infoframe *frame) +{ + if (frame->type != HDMI_INFOFRAME_TYPE_DRM || + frame->version != 1) + return -EINVAL; + + if (frame->length != HDMI_DRM_INFOFRAME_SIZE) + return -EINVAL; + + return 0; +} + +/** + * hdmi_drm_infoframe_check() - check a HDMI DRM infoframe + * @frame: HDMI DRM infoframe + * + * Validates that the infoframe is consistent. + * Returns 0 on success or a negative error code on failure. + */ +int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame) +{ + return hdmi_drm_infoframe_check_only(frame); +} +EXPORT_SYMBOL(hdmi_drm_infoframe_check); + +/** + * hdmi_drm_infoframe_pack_only() - write HDMI DRM infoframe to binary buffer + * @frame: HDMI DRM infoframe + * @buffer: destination buffer + * @size: size of buffer + * + * Packs the information contained in the @frame structure into a binary + * representation that can be written into the corresponding controller + * registers. Also computes the checksum as required by section 5.3.5 of + * the HDMI 1.4 specification. + * + * Returns the number of bytes packed into the binary buffer or a negative + * error code on failure. + */ +ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame, + void *buffer, size_t size) +{ + u8 *ptr = buffer; + size_t length; + int i; + + length = HDMI_INFOFRAME_HEADER_SIZE + frame->length; + + if (size < length) + return -ENOSPC; + + memset(buffer, 0, size); + + ptr[0] = frame->type; + ptr[1] = frame->version; + ptr[2] = frame->length; + ptr[3] = 0; /* checksum */ + + /* start infoframe payload */ + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + *ptr++ = frame->eotf; + *ptr++ = frame->metadata_type; + + for (i = 0; i < 3; i++) { + *ptr++ = frame->display_primaries[i].x; + *ptr++ = frame->display_primaries[i].x >> 8; + *ptr++ = frame->display_primaries[i].y; + *ptr++ = frame->display_primaries[i].y >> 8; + } + + *ptr++ = frame->white_point.x; + *ptr++ = frame->white_point.x >> 8; + + *ptr++ = frame->white_point.y; + *ptr++ = frame->white_point.y >> 8; + + *ptr++ = frame->max_display_mastering_luminance; + *ptr++ = frame->max_display_mastering_luminance >> 8; + + *ptr++ = frame->min_display_mastering_luminance; + *ptr++ = frame->min_display_mastering_luminance >> 8; + + *ptr++ = frame->max_cll; + *ptr++ = frame->max_cll >> 8; + + *ptr++ = frame->max_fall; + *ptr++ = frame->max_fall >> 8; + + hdmi_infoframe_set_checksum(buffer, length); + + return length; +} +EXPORT_SYMBOL(hdmi_drm_infoframe_pack_only); + +/** + * hdmi_drm_infoframe_pack() - check a HDMI DRM infoframe, + * and write it to binary buffer + * @frame: HDMI DRM infoframe + * @buffer: destination buffer + * @size: size of buffer + * + * Validates that the infoframe is consistent and updates derived fields + * (eg. length) based on other fields, after which it packs the information + * contained in the @frame structure into a binary representation that + * can be written into the corresponding controller registers. This function + * also computes the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns the number of bytes packed into the binary buffer or a negative + * error code on failure. 
+ */ +ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame, + void *buffer, size_t size) +{ + int ret; + + ret = hdmi_drm_infoframe_check(frame); + if (ret) + return ret; + + return hdmi_drm_infoframe_pack_only(frame, buffer, size); +} +EXPORT_SYMBOL(hdmi_drm_infoframe_pack); + /* * hdmi_vendor_any_infoframe_check() - check a vendor infoframe */ @@ -758,6 +902,10 @@ hdmi_infoframe_pack_only(const union hdmi_infoframe *frame, void *buffer, size_t length = hdmi_avi_infoframe_pack_only(&frame->avi, buffer, size); break; + case HDMI_INFOFRAME_TYPE_DRM: + length = hdmi_drm_infoframe_pack_only(&frame->drm, + buffer, size); + break; case HDMI_INFOFRAME_TYPE_SPD: length = hdmi_spd_infoframe_pack_only(&frame->spd, buffer, size); @@ -806,6 +954,9 @@ hdmi_infoframe_pack(union hdmi_infoframe *frame, case HDMI_INFOFRAME_TYPE_AVI: length = hdmi_avi_infoframe_pack(&frame->avi, buffer, size); break; + case HDMI_INFOFRAME_TYPE_DRM: + length = hdmi_drm_infoframe_pack(&frame->drm, buffer, size); + break; case HDMI_INFOFRAME_TYPE_SPD: length = hdmi_spd_infoframe_pack(&frame->spd, buffer, size); break; @@ -838,6 +989,8 @@ static const char *hdmi_infoframe_type_get_name(enum hdmi_infoframe_type type) return "Source Product Description (SPD)"; case HDMI_INFOFRAME_TYPE_AUDIO: return "Audio"; + case HDMI_INFOFRAME_TYPE_DRM: + return "Dynamic Range and Mastering"; } return "Reserved"; } @@ -1284,6 +1437,40 @@ static void hdmi_audio_infoframe_log(const char *level, frame->downmix_inhibit ? "Yes" : "No"); } +/** + * hdmi_drm_infoframe_log() - log info of HDMI DRM infoframe + * @level: logging level + * @dev: device + * @frame: HDMI DRM infoframe + */ +static void hdmi_drm_infoframe_log(const char *level, + struct device *dev, + const struct hdmi_drm_infoframe *frame) +{ + int i; + + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + hdmi_log("length: %d\n", frame->length); + hdmi_log("metadata type: %d\n", frame->metadata_type); + hdmi_log("eotf: %d\n", frame->eotf); + for (i = 0; i < 3; i++) { + hdmi_log("x[%d]: %d\n", i, frame->display_primaries[i].x); + hdmi_log("y[%d]: %d\n", i, frame->display_primaries[i].y); + } + + hdmi_log("white point x: %d\n", frame->white_point.x); + hdmi_log("white point y: %d\n", frame->white_point.y); + + hdmi_log("max_display_mastering_luminance: %d\n", + frame->max_display_mastering_luminance); + hdmi_log("min_display_mastering_luminance: %d\n", + frame->min_display_mastering_luminance); + + hdmi_log("max_cll: %d\n", frame->max_cll); + hdmi_log("max_fall: %d\n", frame->max_fall); +} + static const char * hdmi_3d_structure_get_name(enum hdmi_3d_structure s3d_struct) { @@ -1372,6 +1559,9 @@ void hdmi_infoframe_log(const char *level, case HDMI_INFOFRAME_TYPE_VENDOR: hdmi_vendor_any_infoframe_log(level, dev, &frame->vendor); break; + case HDMI_INFOFRAME_TYPE_DRM: + hdmi_drm_infoframe_log(level, dev, &frame->drm); + break; } } EXPORT_SYMBOL(hdmi_infoframe_log); diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h index 9d3b5b93102c..0e21e91c4314 100644 --- a/include/drm/drm_edid.h +++ b/include/drm/drm_edid.h @@ -25,6 +25,7 @@ #include #include +#include struct drm_device; struct i2c_adapter; @@ -370,6 +371,10 @@ drm_hdmi_avi_infoframe_quant_range(struct hdmi_avi_infoframe *frame, const struct drm_display_mode *mode, enum hdmi_quantization_range rgb_quant_range); +int +drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame, + const struct drm_connector_state *conn_state); + /** * drm_eld_mnl - Get ELD monitor name length in 
bytes. * @eld: pointer to an eld memory structure with mnl set diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 6780476dcbff..bcf3c6c3499e 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -47,6 +47,7 @@ enum hdmi_infoframe_type { HDMI_INFOFRAME_TYPE_AVI = 0x82, HDMI_INFOFRAME_TYPE_SPD = 0x83, HDMI_INFOFRAME_TYPE_AUDIO = 0x84, + HDMI_INFOFRAME_TYPE_DRM = 0x87, }; #define HDMI_IEEE_OUI 0x000c03 @@ -55,6 +56,7 @@ enum hdmi_infoframe_type { #define HDMI_AVI_INFOFRAME_SIZE 13 #define HDMI_SPD_INFOFRAME_SIZE 25 #define HDMI_AUDIO_INFOFRAME_SIZE 10 +#define HDMI_DRM_INFOFRAME_SIZE 26 #define HDMI_INFOFRAME_SIZE(type) \ (HDMI_INFOFRAME_HEADER_SIZE + HDMI_ ## type ## _INFOFRAME_SIZE) @@ -185,12 +187,37 @@ struct hdmi_avi_infoframe { unsigned short right_bar; }; +/* DRM Infoframe as per CTA 861.G spec */ +struct hdmi_drm_infoframe { + enum hdmi_infoframe_type type; + unsigned char version; + unsigned char length; + enum hdmi_eotf eotf; + enum hdmi_metadata_type metadata_type; + struct { + u16 x, y; + } display_primaries[3]; + struct { + u16 x, y; + } white_point; + u16 max_display_mastering_luminance; + u16 min_display_mastering_luminance; + u16 max_cll; + u16 max_fall; +}; + int hdmi_avi_infoframe_init(struct hdmi_avi_infoframe *frame); ssize_t hdmi_avi_infoframe_pack(struct hdmi_avi_infoframe *frame, void *buffer, size_t size); ssize_t hdmi_avi_infoframe_pack_only(const struct hdmi_avi_infoframe *frame, void *buffer, size_t size); int hdmi_avi_infoframe_check(struct hdmi_avi_infoframe *frame); +int hdmi_drm_infoframe_init(struct hdmi_drm_infoframe *frame); +ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame, void *buffer, + size_t size); +ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame, + void *buffer, size_t size); +int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame); enum hdmi_spd_sdi { HDMI_SPD_SDI_UNKNOWN, @@ -381,6 +408,7 @@ union hdmi_infoframe { struct hdmi_spd_infoframe spd; union hdmi_vendor_any_infoframe vendor; struct hdmi_audio_infoframe audio; + struct hdmi_drm_infoframe drm; }; ssize_t hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer, -- cgit v1.2.3-59-g8ed1b From b5e3eed1eeb363c148e2935d9d3c12c30a280de6 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 16 May 2019 19:40:12 +0530 Subject: drm: Add HLG EOTF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADD HLG EOTF to the list of EOTF transfer functions supported. Hybrid Log-Gamma (HLG) is a high dynamic range (HDR) standard. HLG defines a nonlinear transfer function in which the lower half of the signal values use a gamma curve and the upper half of the signal values use a logarithmic curve. v2: Rebase v3: Fixed a warning message v4: Addressed Shashank's review comments v5: Addressed Jonas Karlman's review comment and dropped the i915 tag from header. 
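The sink advertises the EOTFs it supports as a bitmask parsed from the
CTA-861.G HDR static metadata block, while the infoframe carries one
absolute EOTF value; a small sketch of the containment test (mirroring
the is_eotf_supported() helper above), with sink_supports_eotf() being
a made-up name:

static bool sink_supports_eotf(const struct drm_connector *connector,
			       enum hdmi_eotf eotf)
{
	/* hdmi_type1.eotf is a bitmap of BIT(enum hdmi_eotf) flags */
	return connector->hdr_sink_metadata.hdmi_type1.eotf & BIT(eotf);
}

A caller would test, e.g., sink_supports_eotf(connector,
HDMI_EOTF_BT_2100_HLG) once the HLG bit added below is parsed from the
EDID.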
Signed-off-by: Ville Syrjälä Signed-off-by: Uma Shankar Reviewed-by: Shashank Sharma Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-8-git-send-email-uma.shankar@intel.com --- drivers/gpu/drm/drm_edid.c | 3 ++- include/linux/hdmi.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 73560c9437cd..262510c2a670 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -3854,7 +3854,8 @@ static uint8_t eotf_supported(const u8 *edid_ext) return edid_ext[2] & (BIT(HDMI_EOTF_TRADITIONAL_GAMMA_SDR) | BIT(HDMI_EOTF_TRADITIONAL_GAMMA_HDR) | - BIT(HDMI_EOTF_SMPTE_ST2084)); + BIT(HDMI_EOTF_SMPTE_ST2084) | + BIT(HDMI_EOTF_BT_2100_HLG)); } static uint8_t hdr_metadata_type(const u8 *edid_ext) diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index bcf3c6c3499e..ee55ba589cdc 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -162,6 +162,7 @@ enum hdmi_eotf { HDMI_EOTF_TRADITIONAL_GAMMA_SDR, HDMI_EOTF_TRADITIONAL_GAMMA_HDR, HDMI_EOTF_SMPTE_ST2084, + HDMI_EOTF_BT_2100_HLG, }; struct hdmi_avi_infoframe { -- cgit v1.2.3-59-g8ed1b From c08e7e4c8a6f04e01d16117eb4a0077059ec2cd4 Mon Sep 17 00:00:00 2001 From: Guillaume La Roque Date: Tue, 14 May 2019 10:26:48 +0200 Subject: pinctrl: generic: add new 'drive-strength-microamp' property support Add drive-strength-microamp property support to allow drive strength in uA Signed-off-by: Guillaume La Roque Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 2 ++ include/linux/pinctrl/pinconf-generic.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index b4f7f8a458ea..d0cbdb1ad76a 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -39,6 +39,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_DRIVE_OPEN_SOURCE, "output drive open source", NULL, false), PCONFDUMP(PIN_CONFIG_DRIVE_PUSH_PULL, "output drive push pull", NULL, false), PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH, "output drive strength", "mA", true), + PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH_UA, "output drive strength", "uA", true), PCONFDUMP(PIN_CONFIG_INPUT_DEBOUNCE, "input debounce", "usec", true), PCONFDUMP(PIN_CONFIG_INPUT_ENABLE, "input enabled", NULL, false), PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT, "input schmitt trigger", NULL, false), @@ -167,6 +168,7 @@ static const struct pinconf_generic_params dt_params[] = { { "drive-open-source", PIN_CONFIG_DRIVE_OPEN_SOURCE, 0 }, { "drive-push-pull", PIN_CONFIG_DRIVE_PUSH_PULL, 0 }, { "drive-strength", PIN_CONFIG_DRIVE_STRENGTH, 0 }, + { "drive-strength-microamp", PIN_CONFIG_DRIVE_STRENGTH_UA, 0 }, { "input-debounce", PIN_CONFIG_INPUT_DEBOUNCE, 0 }, { "input-disable", PIN_CONFIG_INPUT_ENABLE, 0 }, { "input-enable", PIN_CONFIG_INPUT_ENABLE, 1 }, diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 6c0680641108..72d06d6a3099 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -55,6 +55,8 @@ * push-pull mode, the argument is ignored. * @PIN_CONFIG_DRIVE_STRENGTH: the pin will sink or source at most the current * passed as argument. The argument is in mA. + * @PIN_CONFIG_DRIVE_STRENGTH_UA: the pin will sink or source at most the current + * passed as argument. The argument is in uA. 
 * @PIN_CONFIG_INPUT_DEBOUNCE: this will configure the pin to debounce mode,
 *	which means it will wait for signals to settle when reading inputs. The
 *	argument gives the debounce time in usecs. Setting the
@@ -112,6 +114,7 @@ enum pin_config_param {
 	PIN_CONFIG_DRIVE_OPEN_SOURCE,
 	PIN_CONFIG_DRIVE_PUSH_PULL,
 	PIN_CONFIG_DRIVE_STRENGTH,
+	PIN_CONFIG_DRIVE_STRENGTH_UA,
 	PIN_CONFIG_INPUT_DEBOUNCE,
 	PIN_CONFIG_INPUT_ENABLE,
 	PIN_CONFIG_INPUT_SCHMITT,
-- cgit v1.2.3-59-g8ed1b


From 036f394dd77f8117346874151793ec38967d843f Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard
Date: Wed, 22 May 2019 17:29:24 +0200
Subject: pinctrl: Enable device link creation for pin control

A pin controller may want to create a link between itself and its
clients to be sure of suspend/resume call ordering. Introduce a
link_consumers field in the pinctrl_desc structure to let the pinctrl
core know that the controller expects to create a link.

Signed-off-by: Benjamin Gaignard
[Renamed create_link to link_consumers]
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/core.c          | 11 +++++++++++
 include/linux/pinctrl/pinctrl.h |  5 +++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index c6ff4d5fa482..d757c51d7114 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -1216,6 +1216,15 @@ struct pinctrl_state *pinctrl_lookup_state(struct pinctrl *p,
 }
 EXPORT_SYMBOL_GPL(pinctrl_lookup_state);
 
+static void pinctrl_link_add(struct pinctrl_dev *pctldev,
+			     struct device *consumer)
+{
+	if (pctldev->desc->link_consumers)
+		device_link_add(consumer, pctldev->dev,
+				DL_FLAG_PM_RUNTIME |
+				DL_FLAG_AUTOREMOVE_CONSUMER);
+}
+
 /**
  * pinctrl_commit_state() - select/activate/program a pinctrl state to HW
  * @p: the pinctrl handle for the device that requests configuration
@@ -1261,6 +1270,8 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state)
 		if (ret < 0) {
 			goto unapply_new_state;
 		}
+
+		pinctrl_link_add(setting->pctldev, p->dev);
 	}
 
 	p->state = state;

diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 8f5dbb84547a..2744113f1024 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -125,6 +125,10 @@ struct pinctrl_ops {
 *	the hardware description
 * @custom_conf_items: Information how to print @params in debugfs, must be
 *	the same size as the @custom_params, i.e. @num_custom_params
+ * @link_consumers: If true, create a device link between pinctrl and its
+ *	consumers (i.e. the devices requesting pin control states). This is
+ *	sometimes necessary to ascertain the right suspend/resume order for
+ *	example.
 */
 struct pinctrl_desc {
 	const char *name;
@@ -139,6 +143,7 @@ struct pinctrl_desc {
 	const struct pinconf_generic_params *custom_params;
 	const struct pin_config_item *custom_conf_items;
 #endif
+	bool link_consumers;
 };
 
 /* External interface to pin controller */
-- cgit v1.2.3-59-g8ed1b


From 59fcdce425b7c947ccea03a16e393af9bb4d6262 Mon Sep 17 00:00:00 2001
From: Stephen Boyd
Date: Thu, 23 May 2019 17:05:59 -0700
Subject: clk: Remove ifdef for COMMON_CLK in clk-provider.h

This ifdef has been there since the beginning of this file, but it
doesn't really seem to serve any purpose besides obfuscating the struct
definitions and #defines here from compilation units that include it.
Let's always expose these function prototypes and struct definitions so
that code can inspect clk providers without needing to have
CONFIG_COMMON_CLK enabled.
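As a hedged illustration of what the removal buys (my_pll is made up):
code that merely embeds or inspects clk provider types can now be
compiled whether or not CONFIG_COMMON_CLK is set.

#include <linux/clk-provider.h>

struct my_pll {
	struct clk_hw hw;	/* definition now visible unconditionally */
	void __iomem *base;
};

#define to_my_pll(_hw) container_of(_hw, struct my_pll, hw)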
Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index bb6118f79784..3bced2ec9f26 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -9,8 +9,6 @@ #include #include -#ifdef CONFIG_COMMON_CLK - /* * flags used across common struct clk. these flags should only affect the * top-level framework. custom flags for dealing with hardware specifics @@ -1019,5 +1017,4 @@ static inline int of_clk_detect_critical(struct device_node *np, int index, void clk_gate_restore_context(struct clk_hw *hw); -#endif /* CONFIG_COMMON_CLK */ #endif /* CLK_PROVIDER_H */ -- cgit v1.2.3-59-g8ed1b From 30d5a945743cd05ec5c847f2e38c2fbda5e00944 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 May 2019 17:11:57 -0700 Subject: clk: Unexport __clk_of_table This symbol doesn't need to be exported to clk providers anymore. Originally, it was hidden inside clk.c, but then OMAP needed to get access to it in commit 819b4861c18d ("CLK: ti: add init support for clock IP blocks"), but eventually that code also changed in commit c08ee14cc663 ("clk: ti: change clock init to use generic of_clk_init") and we were left with this exported. Move this back into clk.c so that it isn't exposed anymore. Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 1 + include/linux/clk-provider.h | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index aa51756fd4d6..b34e84bb8167 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4038,6 +4038,7 @@ struct of_clk_provider { void *data; }; +extern struct of_device_id __clk_of_table; static const struct of_device_id __clk_of_table_sentinel __used __section(__clk_of_table_end); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 3bced2ec9f26..9ba000e3a50d 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -865,8 +865,6 @@ static inline long divider_ro_round_rate(struct clk_hw *hw, unsigned long rate, */ unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate); -struct of_device_id; - struct clk_onecell_data { struct clk **clks; unsigned int clk_num; @@ -877,8 +875,6 @@ struct clk_hw_onecell_data { struct clk_hw *hws[]; }; -extern struct of_device_id __clk_of_table; - #define CLK_OF_DECLARE(name, compat, fn) OF_DECLARE_1(clk, name, compat, fn) /* -- cgit v1.2.3-59-g8ed1b From 1f58bb18f6f28d1df0b7144d90bc90ee5672416d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 May 2019 13:44:57 +0100 Subject: mount_pseudo(): drop 'name' argument, switch to d_make_root() Once upon a time we used to set ->d_name of e.g. pipefs root so that d_path() on pipes would work. These days it's completely pointless - dentries of pipes are not even connected to pipefs root. However, mount_pseudo() had set the root dentry name (passed as the second argument) and callers kept inventing names to pass to it. Including those that didn't *have* any non-root dentries to start with... All of that had been pointless for about 8 years now; it's time to get rid of that cargo-culting... 
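To make the new calling convention concrete, a hypothetical caller
after this change (examplefs_sops and EXAMPLEFS_MAGIC are invented):

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	/* No fake "examplefs:" root name any more - just ops and magic. */
	return mount_pseudo(fs_type, &examplefs_sops, NULL, EXAMPLEFS_MAGIC);
}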
Signed-off-by: Al Viro --- arch/ia64/kernel/perfmon.c | 2 +- drivers/dax/super.c | 2 +- drivers/gpu/drm/drm_drv.c | 6 +----- drivers/misc/cxl/api.c | 3 +-- drivers/scsi/cxlflash/ocxl_hw.c | 3 +-- drivers/virtio/virtio_balloon.c | 3 +-- fs/aio.c | 3 +-- fs/anon_inodes.c | 4 ++-- fs/block_dev.c | 2 +- fs/btrfs/tests/btrfs-tests.c | 2 +- fs/libfs.c | 12 +++--------- fs/nsfs.c | 2 +- fs/pipe.c | 2 +- include/linux/fs.h | 6 +++--- mm/z3fold.c | 2 +- mm/zsmalloc.c | 2 +- net/socket.c | 2 +- 17 files changed, 22 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 7a969f4c3534..a30da6f2c28e 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -602,7 +602,7 @@ static const struct dentry_operations pfmfs_dentry_operations; static struct dentry * pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "pfm:", NULL, &pfmfs_dentry_operations, + return mount_pseudo(fs_type, NULL, &pfmfs_dentry_operations, PFMFS_MAGIC); } diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 35f051efaf35..f83814eea5ad 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -440,7 +440,7 @@ static const struct super_operations dax_sops = { static struct dentry *dax_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); + return mount_pseudo(fs_type, &dax_sops, NULL, DAXFS_MAGIC); } static struct file_system_type dax_fs_type = { diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 8b44ac9a92ae..48365c62a190 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -535,11 +535,7 @@ static struct vfsmount *drm_fs_mnt; static struct dentry *drm_fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, - "drm:", - NULL, - NULL, - 0x010203ff); + return mount_pseudo(fs_type, NULL, NULL, 0x010203ff); } static struct file_system_type drm_fs_type = { diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index a59c7af79873..1f2b0535a8cf 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -40,8 +40,7 @@ static struct vfsmount *cxl_vfs_mount; static struct dentry *cxl_fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "cxl:", NULL, NULL, - CXL_PSEUDO_FS_MAGIC); + return mount_pseudo(fs_type, NULL, NULL, CXL_PSEUDO_FS_MAGIC); } static struct file_system_type cxl_fs_type = { diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c index 31cfdf2c8c30..38e1fbd2b406 100644 --- a/drivers/scsi/cxlflash/ocxl_hw.c +++ b/drivers/scsi/cxlflash/ocxl_hw.c @@ -48,8 +48,7 @@ static struct dentry *ocxlflash_fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "ocxlflash:", NULL, NULL, - OCXLFLASH_FS_MAGIC); + return mount_pseudo(fs_type, NULL, NULL, OCXLFLASH_FS_MAGIC); } static struct file_system_type ocxlflash_fs_type = { diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 554d1a98d193..62bafc4f2662 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -761,8 +761,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, static struct dentry *balloon_mount(struct file_system_type *fs_type, int flags, 
const char *dev_name, void *data) { - return mount_pseudo(fs_type, "balloon-kvm:", NULL, NULL, - BALLOON_KVM_MAGIC); + return mount_pseudo(fs_type, NULL, NULL, BALLOON_KVM_MAGIC); } static struct file_system_type balloon_fs = { diff --git a/fs/aio.c b/fs/aio.c index 3490d1fa0e16..09bc35fa6810 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -252,8 +252,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) static struct dentry *aio_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL, - AIO_RING_MAGIC); + struct dentry *root = mount_pseudo(fs_type, NULL, NULL, AIO_RING_MAGIC); if (!IS_ERR(root)) root->d_sb->s_iflags |= SB_I_NOEXEC; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 91262c34b797..644d0837aafe 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -41,8 +41,8 @@ static const struct dentry_operations anon_inodefs_dentry_operations = { static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "anon_inode:", NULL, - &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); + return mount_pseudo(fs_type, NULL, &anon_inodefs_dentry_operations, + ANON_INODE_FS_MAGIC); } static struct file_system_type anon_inode_fs_type = { diff --git a/fs/block_dev.c b/fs/block_dev.c index 0f7552a87d54..3143da7b0998 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -837,7 +837,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct dentry *dent; - dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); + dent = mount_pseudo(fs_type, &bdev_sops, NULL, BDEVFS_MAGIC); if (!IS_ERR(dent)) dent->d_sb->s_iflags |= SB_I_CGROUPWB; return dent; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9238fd4f1734..6da54323eaf8 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -36,7 +36,7 @@ static struct dentry *btrfs_test_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops, + return mount_pseudo(fs_type, &btrfs_test_super_ops, NULL, BTRFS_TEST_MAGIC); } diff --git a/fs/libfs.c b/fs/libfs.c index 4b59b1816efb..030e545f586e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -239,14 +239,12 @@ static const struct super_operations simple_super_operations = { * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ -struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, +struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, const struct super_operations *ops, const struct xattr_handler **xattr, const struct dentry_operations *dops, unsigned long magic) { struct super_block *s; - struct dentry *dentry; struct inode *root; - struct qstr d_name = QSTR_INIT(name, strlen(name)); s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, &init_user_ns, NULL); @@ -271,13 +269,9 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; root->i_atime = root->i_mtime = root->i_ctime = current_time(root); - dentry = __d_alloc(s, &d_name); - if (!dentry) { - iput(root); + s->s_root = d_make_root(root); + if (!s->s_root) goto Enomem; - } - d_instantiate(dentry, root); - s->s_root = dentry; s->s_d_op = dops; 
s->s_flags |= SB_ACTIVE; return dget(s->s_root); diff --git a/fs/nsfs.c b/fs/nsfs.c index e3bf08c5af41..b3c49ddc0f85 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -261,7 +261,7 @@ static const struct super_operations nsfs_ops = { static struct dentry *nsfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "nsfs:", &nsfs_ops, + return mount_pseudo(fs_type, &nsfs_ops, &ns_dentry_operations, NSFS_MAGIC); } static struct file_system_type nsfs = { diff --git a/fs/pipe.c b/fs/pipe.c index 41065901106b..99a023730e6f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1185,7 +1185,7 @@ static const struct super_operations pipefs_ops = { static struct dentry *pipefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "pipe:", &pipefs_ops, + return mount_pseudo(fs_type, &pipefs_ops, &pipefs_dentry_operations, PIPEFS_MAGIC); } diff --git a/include/linux/fs.h b/include/linux/fs.h index f7fdfe93e25d..b06251dd429f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,18 +2257,18 @@ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); -extern struct dentry *mount_pseudo_xattr(struct file_system_type *, char *, +extern struct dentry *mount_pseudo_xattr(struct file_system_type *, const struct super_operations *ops, const struct xattr_handler **xattr, const struct dentry_operations *dops, unsigned long); static inline struct dentry * -mount_pseudo(struct file_system_type *fs_type, char *name, +mount_pseudo(struct file_system_type *fs_type, const struct super_operations *ops, const struct dentry_operations *dops, unsigned long magic) { - return mount_pseudo_xattr(fs_type, name, ops, NULL, dops, magic); + return mount_pseudo_xattr(fs_type, ops, NULL, dops, magic); } /* Alas, no aliases. 
Too much hassle with bringing module.h everywhere */ diff --git a/mm/z3fold.c b/mm/z3fold.c index 0b14daf930a8..abeb5bcbea57 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -242,7 +242,7 @@ static inline void free_handle(unsigned long handle) static struct dentry *z3fold_do_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "z3fold:", NULL, NULL, 0x33); + return mount_pseudo(fs_type, NULL, NULL, 0x33); } static struct file_system_type z3fold_fs = { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d9f831f63625..ef230be8c03e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1817,7 +1817,7 @@ static void lock_zspage(struct zspage *zspage) static struct dentry *zs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "zsmalloc:", NULL, NULL, ZSMALLOC_MAGIC); + return mount_pseudo(fs_type, NULL, NULL, ZSMALLOC_MAGIC); } static struct file_system_type zsmalloc_fs = { diff --git a/net/socket.c b/net/socket.c index 472fbefa5d9b..c86679584eed 100644 --- a/net/socket.c +++ b/net/socket.c @@ -362,7 +362,7 @@ static const struct xattr_handler *sockfs_xattr_handlers[] = { static struct dentry *sockfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops, + return mount_pseudo_xattr(fs_type, &sockfs_ops, sockfs_xattr_handlers, &sockfs_dentry_operations, SOCKFS_MAGIC); } -- cgit v1.2.3-59-g8ed1b From bb7b6b2bbdb827e68cd506c8f5e3ba13215cccb2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:28 +0000 Subject: vfs: Kill mount_ns() Kill mount_ns() as it has been replaced by vfs_get_super() in the new mount API. Signed-off-by: David Howells cc: linux-fsdevel@vger.kernel.org Signed-off-by: Al Viro --- fs/super.c | 38 -------------------------------------- include/linux/fs.h | 3 --- 2 files changed, 41 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 3ba91d70c2a8..6919f5c728f0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1143,44 +1143,6 @@ void kill_litter_super(struct super_block *sb) } EXPORT_SYMBOL(kill_litter_super); -static int ns_test_super(struct super_block *sb, void *data) -{ - return sb->s_fs_info == data; -} - -static int ns_set_super(struct super_block *sb, void *data) -{ - sb->s_fs_info = data; - return set_anon_super(sb, NULL); -} - -struct dentry *mount_ns(struct file_system_type *fs_type, - int flags, void *data, void *ns, struct user_namespace *user_ns, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *sb; - - sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, - user_ns, ns); - if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (!sb->s_root) { - int err; - err = fill_super(sb, data, flags & SB_SILENT ? 
1 : 0); - if (err) { - deactivate_locked_super(sb); - return ERR_PTR(err); - } - - sb->s_flags |= SB_ACTIVE; - } - - return dget(sb->s_root); -} - -EXPORT_SYMBOL(mount_ns); - int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); diff --git a/include/linux/fs.h b/include/linux/fs.h index b06251dd429f..790342cf4df9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2206,9 +2206,6 @@ struct file_system_type { #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) -extern struct dentry *mount_ns(struct file_system_type *fs_type, - int flags, void *data, void *ns, struct user_namespace *user_ns, - int (*fill_super)(struct super_block *, void *, int)); #ifdef CONFIG_BLOCK extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, -- cgit v1.2.3-59-g8ed1b From c80fa7c8301c10ad10d997b9e86b4aeac5923b3e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:23 +0000 Subject: vfs: Provide sb->s_iflags settings in fs_context struct Provide a field in the fs_context struct through which bits in the sb->s_iflags superblock field can be set. Signed-off-by: David Howells cc: linux-fsdevel@vger.kernel.org --- fs/super.c | 1 + include/linux/fs_context.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 72b4a5afcfd6..f836b67abffe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -540,6 +540,7 @@ retry: } fc->s_fs_info = NULL; s->s_type = fc->fs_type; + s->s_iflags |= fc->s_iflags; strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 1f966670c8dc..c995b852ba40 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -103,6 +103,7 @@ struct fs_context { void *s_fs_info; /* Proposed s_fs_info */ unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ unsigned int sb_flags_mask; /* Superblock flags that were changed */ + unsigned int s_iflags; /* OR'd with sb->s_iflags */ unsigned int lsm_flags; /* Information flags from the fs to the LSM */ enum fs_context_purpose purpose:8; enum fs_context_phase phase:8; /* The phase the context is in */ -- cgit v1.2.3-59-g8ed1b From 31d6d5ce53400d6dc58e29ddd8dc184b3ba89d66 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:23 +0000 Subject: vfs: Provide a mount_pseudo-replacement for the new mount API Provide a function, init_pseudo(), that provides a common infrastructure for converting pseudo-filesystems that can never be mountable. 
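A hedged sketch of such a conversion using the new helper; the
examplefs names are invented, and kill_anon_super() is the usual
teardown for anonymous superblocks.

static int examplefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, EXAMPLEFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	ctx->ops = &examplefs_sops;	/* optional; defaults apply if left NULL */
	return 0;
}

static struct file_system_type examplefs_fs_type = {
	.name		 = "examplefs",
	.init_fs_context = examplefs_init_fs_context,
	.kill_sb	 = kill_anon_super,
};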
[AV: once all users of mount_pseudo_xattr() get converted, it will be folded into pseudo_fs_get_tree()] Signed-off-by: David Howells cc: linux-fsdevel@vger.kernel.org --- fs/libfs.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pseudo_fs.h | 16 ++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 include/linux/pseudo_fs.h (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index 030e545f586e..edef70d35438 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -16,6 +16,8 @@ #include #include #include /* sync_mapping_buffers */ +#include +#include #include @@ -235,6 +237,50 @@ static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; +static int pseudo_fs_get_tree(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx = fc->fs_private; + struct dentry *root; + + root = mount_pseudo_xattr(fc->fs_type, + ctx->ops, ctx->xattr, + ctx->dops, ctx->magic); + if (IS_ERR(root)) + return PTR_ERR(root); + + fc->root = root; + return 0; +} + +static void pseudo_fs_free(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations pseudo_fs_context_ops = { + .free = pseudo_fs_free, + .get_tree = pseudo_fs_get_tree, +}; + +/* + * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that + * will never be mountable) + */ +struct pseudo_fs_context *init_pseudo(struct fs_context *fc, + unsigned long magic) +{ + struct pseudo_fs_context *ctx; + + ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); + if (likely(ctx)) { + ctx->magic = magic; + fc->fs_private = ctx; + fc->ops = &pseudo_fs_context_ops; + } + return ctx; +} +EXPORT_SYMBOL(init_pseudo); + /* * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h new file mode 100644 index 000000000000..eceda1d1407a --- /dev/null +++ b/include/linux/pseudo_fs.h @@ -0,0 +1,16 @@ +#ifndef __LINUX_PSEUDO_FS__ +#define __LINUX_PSEUDO_FS__ + +#include + +struct pseudo_fs_context { + const struct super_operations *ops; + const struct xattr_handler **xattr; + const struct dentry_operations *dops; + unsigned long magic; +}; + +struct pseudo_fs_context *init_pseudo(struct fs_context *fc, + unsigned long magic); + +#endif -- cgit v1.2.3-59-g8ed1b From 8d9e46d80777b484f8f0945c317ad618224d7811 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 May 2019 11:43:59 -0400 Subject: fold mount_pseudo_xattr() into pseudo_fs_get_tree() ... 
now that all other callers are gone Signed-off-by: Al Viro --- fs/libfs.c | 88 +++++++++++++++++++++--------------------------------- include/linux/fs.h | 13 -------- 2 files changed, 34 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index edef70d35438..7df3c9a85f6b 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -240,16 +240,43 @@ static const struct super_operations simple_super_operations = { static int pseudo_fs_get_tree(struct fs_context *fc) { struct pseudo_fs_context *ctx = fc->fs_private; - struct dentry *root; + struct super_block *s; + struct inode *root; - root = mount_pseudo_xattr(fc->fs_type, - ctx->ops, ctx->xattr, - ctx->dops, ctx->magic); - if (IS_ERR(root)) - return PTR_ERR(root); + s = sget_userns(fc->fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, + &init_user_ns, NULL); + if (IS_ERR(s)) + return PTR_ERR(s); - fc->root = root; + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = ctx->magic; + s->s_op = ctx->ops ?: &simple_super_operations; + s->s_xattr = ctx->xattr; + s->s_time_gran = 1; + root = new_inode(s); + if (!root) + goto Enomem; + /* + * since this is the first inode, make it number 1. New inodes created + * after this must take care not to collide with it (by passing + * max_reserved of 1 to iunique). + */ + root->i_ino = 1; + root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; + root->i_atime = root->i_mtime = root->i_ctime = current_time(root); + s->s_root = d_make_root(root); + if (!s->s_root) + goto Enomem; + s->s_d_op = ctx->dops; + s->s_flags |= SB_ACTIVE; + fc->root = dget(s->s_root); return 0; + +Enomem: + deactivate_locked_super(s); + return -ENOMEM; } static void pseudo_fs_free(struct fs_context *fc) @@ -281,53 +308,6 @@ struct pseudo_fs_context *init_pseudo(struct fs_context *fc, } EXPORT_SYMBOL(init_pseudo); -/* - * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that - * will never be mountable) - */ -struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, - const struct super_operations *ops, const struct xattr_handler **xattr, - const struct dentry_operations *dops, unsigned long magic) -{ - struct super_block *s; - struct inode *root; - - s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, - &init_user_ns, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - - s->s_maxbytes = MAX_LFS_FILESIZE; - s->s_blocksize = PAGE_SIZE; - s->s_blocksize_bits = PAGE_SHIFT; - s->s_magic = magic; - s->s_op = ops ? ops : &simple_super_operations; - s->s_xattr = xattr; - s->s_time_gran = 1; - root = new_inode(s); - if (!root) - goto Enomem; - /* - * since this is the first inode, make it number 1. New inodes created - * after this must take care not to collide with it (by passing - * max_reserved of 1 to iunique). 
- */ - root->i_ino = 1; - root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_atime = root->i_mtime = root->i_ctime = current_time(root); - s->s_root = d_make_root(root); - if (!s->s_root) - goto Enomem; - s->s_d_op = dops; - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); - -Enomem: - deactivate_locked_super(s); - return ERR_PTR(-ENOMEM); -} -EXPORT_SYMBOL(mount_pseudo_xattr); - int simple_open(struct inode *inode, struct file *file) { if (inode->i_private) diff --git a/include/linux/fs.h b/include/linux/fs.h index 790342cf4df9..d625acabbfcf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2254,19 +2254,6 @@ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); -extern struct dentry *mount_pseudo_xattr(struct file_system_type *, - const struct super_operations *ops, - const struct xattr_handler **xattr, - const struct dentry_operations *dops, - unsigned long); - -static inline struct dentry * -mount_pseudo(struct file_system_type *fs_type, - const struct super_operations *ops, - const struct dentry_operations *dops, unsigned long magic) -{ - return mount_pseudo_xattr(fs_type, ops, NULL, dops, magic); -} /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) \ -- cgit v1.2.3-59-g8ed1b From 023d066a0d0a87696c04b0de2ceae53063d0b655 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:28 +0000 Subject: vfs: Kill sget_userns() Kill sget_userns(), folding it into sget() as that's the only remaining user. Signed-off-by: David Howells cc: linux-fsdevel@vger.kernel.org --- fs/super.c | 54 ++++++++++++++++-------------------------------------- include/linux/fs.h | 5 ----- 2 files changed, 16 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index f836b67abffe..ca2302501d32 100644 --- a/fs/super.c +++ b/fs/super.c @@ -563,24 +563,31 @@ share_extant_sb: EXPORT_SYMBOL(sget_fc); /** - * sget_userns - find or create a superblock - * @type: filesystem type superblock should belong to - * @test: comparison callback - * @set: setup callback - * @flags: mount flags - * @user_ns: User namespace for the super_block - * @data: argument to each of them + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them */ -struct super_block *sget_userns(struct file_system_type *type, +struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - int flags, struct user_namespace *user_ns, + int flags, void *data) { + struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. 
+ */ + if (flags & SB_SUBMOUNT) + user_ns = &init_user_ns; + retry: spin_lock(&sb_lock); if (test) { @@ -621,35 +628,6 @@ retry: register_shrinker_prepared(&s->s_shrink); return s; } - -EXPORT_SYMBOL(sget_userns); - -/** - * sget - find or create a superblock - * @type: filesystem type superblock should belong to - * @test: comparison callback - * @set: setup callback - * @flags: mount flags - * @data: argument to each of them - */ -struct super_block *sget(struct file_system_type *type, - int (*test)(struct super_block *,void *), - int (*set)(struct super_block *,void *), - int flags, - void *data) -{ - struct user_namespace *user_ns = current_user_ns(); - - /* We don't yet pass the user namespace of the parent - * mount through to here so always use &init_user_ns - * until that changes. - */ - if (flags & SB_SUBMOUNT) - user_ns = &init_user_ns; - - return sget_userns(type, test, set, flags, user_ns, data); -} - EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) diff --git a/include/linux/fs.h b/include/linux/fs.h index d625acabbfcf..71421856ff2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2245,11 +2245,6 @@ void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); -struct super_block *sget_userns(struct file_system_type *type, - int (*test)(struct super_block *,void *), - int (*set)(struct super_block *,void *), - int flags, struct user_namespace *user_ns, - void *data); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), -- cgit v1.2.3-59-g8ed1b From 7375dca1647fa978310f2d706ddbff537f72110b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 20 May 2019 09:26:24 -0400 Subject: ftrace: Make enable and update parameters bool when applicable The code modification functions have "enable" and "update" variables that are sometimes "int" but used as "bool". Remove the ambiguity and make them "bool" when they are only used for true or false values. Link: http://lkml.kernel.org/r/e1429923d9eda92a3cf5ee9e33c7eacce539781d.1558115654.git.naveen.n.rao@linux.vnet.ibm.com Reported-by: "Naveen N. 
Rao" Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- kernel/trace/ftrace.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 25e2995d4a4c..8a8cb3c401b2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -427,8 +427,8 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); iter = ftrace_rec_iter_next(iter)) -int ftrace_update_record(struct dyn_ftrace *rec, int enable); -int ftrace_test_record(struct dyn_ftrace *rec, int enable); +int ftrace_update_record(struct dyn_ftrace *rec, bool enable); +int ftrace_test_record(struct dyn_ftrace *rec, bool enable); void ftrace_run_stop_machine(int command); unsigned long ftrace_location(unsigned long ip); unsigned long ftrace_location_range(unsigned long start, unsigned long end); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12aff849c04..4f2c26bebe2a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1768,7 +1768,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, count++; /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ - update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE; /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) @@ -2047,7 +2047,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } } -static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) { unsigned long flag = 0UL; @@ -2146,28 +2146,28 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /** * ftrace_update_record, set a record that now is tracing or not * @rec: the record to update - * @enable: set to 1 if the record is tracing, zero to force disable + * @enable: set to true if the record is tracing, false to force disable * * The records that represent all functions that can be traced need * to be updated when tracing has been enabled. */ -int ftrace_update_record(struct dyn_ftrace *rec, int enable) +int ftrace_update_record(struct dyn_ftrace *rec, bool enable) { - return ftrace_check_record(rec, enable, 1); + return ftrace_check_record(rec, enable, true); } /** * ftrace_test_record, check if the record has been enabled or not * @rec: the record to test - * @enable: set to 1 to check if enabled, 0 if it is disabled + * @enable: set to true to check if enabled, false if it is disabled * * The arch code may need to test if a record is already set to * tracing to determine how to modify the function code that it * represents. 
*/ -int ftrace_test_record(struct dyn_ftrace *rec, int enable) +int ftrace_test_record(struct dyn_ftrace *rec, bool enable) { - return ftrace_check_record(rec, enable, 0); + return ftrace_check_record(rec, enable, false); } static struct ftrace_ops * @@ -2356,7 +2356,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) } static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_old_addr; unsigned long ftrace_addr; @@ -2395,7 +2395,7 @@ void __weak ftrace_replace_code(int mod_flags) { struct dyn_ftrace *rec; struct ftrace_page *pg; - int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; + bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL; int failed; -- cgit v1.2.3-59-g8ed1b From 2d8d8fac3b4eab035dcd0068e1f5a746a697fbb3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:06 +0900 Subject: x86/uaccess: Allow access_ok() in irq context if pagefault_disabled WARN_ON_IN_IRQ() assumes that the access_ok() and following user memory access can sleep. But this assumption is not always correct: when pagefaults are disabled, the following memory access will simply return -EFAULT and never sleep. Add a pagefault_disabled() check in WARN_ON_ONCE() so that it ignores the case where it is called with pagefaults disabled. For this purpose, pagefault_disabled() is converted to an inline function. Link: http://lkml.kernel.org/r/155789868664.26965.7932665824135793317.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/uaccess.h | 4 +++- include/linux/uaccess.h | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c82abd6e4ca3..9c4435307ff8 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -66,7 +66,9 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un }) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +static inline bool pagefault_disabled(void); +# define WARN_ON_IN_IRQ() \ + WARN_ON_ONCE(!in_task() && !pagefault_disabled()) #else # define WARN_ON_IN_IRQ() #endif diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2b70130af585..5a43ef7db492 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -203,7 +203,10 @@ static inline void pagefault_enable(void) /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ -#define pagefault_disabled() (current->pagefault_disabled != 0) +static inline bool pagefault_disabled(void) +{ + return current->pagefault_disabled != 0; +} /* * The pagefault handler is in general disabled by pagefault_disable() or -- cgit v1.2.3-59-g8ed1b From 3d7081822f7f9eab867d9bcc8fd635208ec438e0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:18 +0900 Subject: uaccess: Add non-pagefault user-space read functions Add probe_user_read(), strncpy_from_unsafe_user() and strnlen_unsafe_user(), which allow callers to access user space in IRQ context. The current probe_kernel_read() and strncpy_from_unsafe() are not usable on user-space memory, because they set KERNEL_DS while accessing the data. On some architectures, user address space and kernel address space can coexist, but on others they cannot.
In that case, setting KERNEL_DS means given address is treated as a kernel address space. Also strnlen_user() is only available from user context since it can sleep if pagefault is enabled. To access user-space memory without pagefault, we need these new functions which sets USER_DS while accessing the data. Link: http://lkml.kernel.org/r/155789869802.26965.4940338412595759063.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/uaccess.h | 14 ++++++ mm/maccess.c | 122 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 130 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5a43ef7db492..9c435c3f2105 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -242,6 +242,17 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to, extern long probe_kernel_read(void *dst, const void *src, size_t size); extern long __probe_kernel_read(void *dst, const void *src, size_t size); +/* + * probe_user_read(): safely attempt to read from a location in user space + * @dst: pointer to the buffer that shall take the data + * @src: address to read from + * @size: size of the data chunk + * + * Safely read from address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ +extern long probe_user_read(void *dst, const void __user *src, size_t size); + /* * probe_kernel_write(): safely attempt to write to a location * @dst: address to write to @@ -255,6 +266,9 @@ extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); +extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, + long count); +extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); /** * probe_kernel_address(): safely attempt to read from a location diff --git a/mm/maccess.c b/mm/maccess.c index ec00be51a24f..19c8c3dc14df 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -5,8 +5,20 @@ #include #include +static __always_inline long +probe_read_common(void *dst, const void __user *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + /** - * probe_kernel_read(): safely attempt to read from a location + * probe_kernel_read(): safely attempt to read from a kernel-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from * @size: size of the data chunk @@ -29,16 +41,40 @@ long __probe_kernel_read(void *dst, const void *src, size_t size) mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, - (__force const void __user *)src, size); - pagefault_enable(); + ret = probe_read_common(dst, (__force const void __user *)src, size); set_fs(old_fs); - return ret ? -EFAULT : 0; + return ret; } EXPORT_SYMBOL_GPL(probe_kernel_read); +/** + * probe_user_read(): safely attempt to read from a user-space location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from. This must be a user address. + * @size: size of the data chunk + * + * Safely read from user address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. 
+ */ + +long __weak probe_user_read(void *dst, const void __user *src, size_t size) + __attribute__((alias("__probe_user_read"))); + +long __probe_user_read(void *dst, const void __user *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(src, size)) + ret = probe_read_common(dst, src, size); + set_fs(old_fs); + + return ret; +} +EXPORT_SYMBOL_GPL(probe_user_read); + /** * probe_kernel_write(): safely attempt to write to a location * @dst: address to write to @@ -66,6 +102,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) } EXPORT_SYMBOL_GPL(probe_kernel_write); + /** * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. * @dst: Destination address, in kernel space. This buffer must be at @@ -105,3 +142,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) return ret ? -EFAULT : src - unsafe_addr; } + +/** + * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user + * address. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @unsafe_addr: Unsafe user address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe user address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. + */ +long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, + long count) +{ + mm_segment_t old_fs = get_fs(); + long ret; + + if (unlikely(count <= 0)) + return 0; + + set_fs(USER_DS); + pagefault_disable(); + ret = strncpy_from_user(dst, unsafe_addr, count); + pagefault_enable(); + set_fs(old_fs); + + if (ret >= count) { + ret = count; + dst[ret - 1] = '\0'; + } else if (ret > 0) { + ret++; + } + + return ret; +} + +/** + * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL. + * @unsafe_addr: The string to measure. + * @count: Maximum count (including NUL) + * + * Get the size of a NUL-terminated string in user space without pagefault. + * + * Returns the size of the string INCLUDING the terminating NUL. + * + * If the string is too long, returns a number larger than @count. User + * has to check the return value against "> count". + * On exception (or invalid count), returns 0. + * + * Unlike strnlen_user, this can be used from IRQ handler etc. because + * it disables pagefaults. + */ +long strnlen_unsafe_user(const void __user *unsafe_addr, long count) +{ + mm_segment_t old_fs = get_fs(); + int ret; + + set_fs(USER_DS); + pagefault_disable(); + ret = strnlen_user(unsafe_addr, count); + pagefault_enable(); + set_fs(old_fs); + + return ret; +} -- cgit v1.2.3-59-g8ed1b From 87a90956eeab260a469a51897bfda27b28adf67d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2019 17:27:44 +0900 Subject: uaccess: Add a prototype of non-static __probe_user_read() Declare a prototype of non-static __probe_user_read() as same as __probe_kernel_read() at uaccess.h. 
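[ed. note: a minimal usage sketch for the non-pagefault helpers added above — not part of any patch in this series; the callback, buffer size, and message are hypothetical:

	#include <linux/uaccess.h>
	#include <linux/printk.h>

	/* Runs in IRQ/tracing context, where faulting in pages is not allowed. */
	static void sample_user_string(const char __user *uaddr)
	{
		char buf[64];
		long len;

		/* Copies at most sizeof(buf) bytes and never sleeps. */
		len = strncpy_from_unsafe_user(buf, uaddr, sizeof(buf));
		if (len < 0)
			return;	/* -EFAULT: page absent or bad pointer */
		pr_info("user string: %s\n", buf);	/* buf is NUL-terminated */
	}

probe_user_read() is used the same way for fixed-size data; both return -EFAULT instead of faulting in the page.]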
Reported-by: kbuild test robot Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/uaccess.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9c435c3f2105..34a038563d97 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -252,6 +252,7 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size); * happens, handle that and return -EFAULT. */ extern long probe_user_read(void *dst, const void __user *src, size_t size); +extern long __probe_user_read(void *dst, const void __user *src, size_t size); /* * probe_kernel_write(): safely attempt to write to a location -- cgit v1.2.3-59-g8ed1b From f5151311c3f37f6edc85b2253ccf6d3e2a4c4c26 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 20 May 2019 19:32:14 +0800 Subject: dmaengine: Add matching device node validation in __dma_request_channel() When a user requests a DMA channel via __dma_request_channel(), nothing validates that the channel comes from the intended DMA device, which forces each DMA engine driver to validate the device node in its filter function where necessary. Thus we can add the matching device node validation in the DMA engine core and remove all of the device node validation from the drivers. Tested-by: Peter Ujfalusi Signed-off-by: Baolin Wang Signed-off-by: Vinod Koul --- drivers/dma/dmaengine.c | 10 ++++++++-- drivers/dma/of-dma.c | 4 ++-- include/linux/dmaengine.h | 12 ++++++++---- 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 3a11b1092e80..610080c629bb 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -641,11 +641,13 @@ EXPORT_SYMBOL_GPL(dma_get_any_slave_channel); * @mask: capabilities that the channel must satisfy * @fn: optional callback to disposition available channels * @fn_param: opaque parameter to pass to dma_filter_fn + * @np: device node to look for DMA channels * * Returns pointer to appropriate DMA channel on success or NULL.
*/ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, - dma_filter_fn fn, void *fn_param) + dma_filter_fn fn, void *fn_param, + struct device_node *np) { struct dma_device *device, *_d; struct dma_chan *chan = NULL; @@ -653,6 +655,10 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, /* Find a channel */ mutex_lock(&dma_list_mutex); list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { + /* Finds a DMA controller with matching device node */ + if (np && device->dev->of_node && np != device->dev->of_node) + continue; + chan = find_candidate(device, mask, fn, fn_param); if (!IS_ERR(chan)) break; @@ -769,7 +775,7 @@ struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask) if (!mask) return ERR_PTR(-ENODEV); - chan = __dma_request_channel(mask, NULL, NULL); + chan = __dma_request_channel(mask, NULL, NULL, NULL); if (!chan) { mutex_lock(&dma_list_mutex); if (list_empty(&dma_device_list)) diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index 91fd395c90c4..6b43d04da05d 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -316,8 +316,8 @@ struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec, if (count != 1) return NULL; - return dma_request_channel(info->dma_cap, info->filter_fn, - &dma_spec->args[0]); + return __dma_request_channel(&info->dma_cap, info->filter_fn, + &dma_spec->args[0], dma_spec->np); } EXPORT_SYMBOL_GPL(of_dma_simple_xlate); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index d49ec5c31944..504085b2bf21 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1314,7 +1314,8 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); void dma_issue_pending_all(void); struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, - dma_filter_fn fn, void *fn_param); + dma_filter_fn fn, void *fn_param, + struct device_node *np); struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name); struct dma_chan *dma_request_chan(struct device *dev, const char *name); @@ -1339,7 +1340,9 @@ static inline void dma_issue_pending_all(void) { } static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, - dma_filter_fn fn, void *fn_param) + dma_filter_fn fn, + void *fn_param, + struct device_node *np) { return NULL; } @@ -1411,7 +1414,8 @@ void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); struct dma_chan *dma_get_slave_channel(struct dma_chan *chan); struct dma_chan *dma_get_any_slave_channel(struct dma_device *device); -#define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y) +#define dma_request_channel(mask, x, y) \ + __dma_request_channel(&(mask), x, y, NULL) #define dma_request_slave_channel_compat(mask, x, y, dev, name) \ __dma_request_slave_channel_compat(&(mask), x, y, dev, name) @@ -1429,6 +1433,6 @@ static inline struct dma_chan if (!fn || !fn_param) return NULL; - return __dma_request_channel(mask, fn, fn_param); + return __dma_request_channel(mask, fn, fn_param, NULL); } #endif /* DMAENGINE_H */ -- cgit v1.2.3-59-g8ed1b From 990c0b53bf6599a9c9c7df1529dde681dee6cf64 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 20 May 2019 19:32:16 +0800 Subject: dmaengine: imx-sdma: Let the core do the device node validation Let the DMA engine core do the device node validation instead of drivers. 
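[ed. note: existing users of the dma_request_channel() macro are unaffected by this series, since the macro now simply passes a NULL device node, which skips the new matching. A minimal consumer sketch (the capability choice is illustrative):

	#include <linux/dmaengine.h>

	static struct dma_chan *grab_memcpy_chan(void)
	{
		dma_cap_mask_t mask;

		dma_cap_zero(mask);
		dma_cap_set(DMA_MEMCPY, mask);

		/* expands to __dma_request_channel(&mask, NULL, NULL, NULL) */
		return dma_request_channel(mask, NULL, NULL);
	}

Only of_dma_simple_xlate() and drivers with their own xlate callback, such as the imx-sdma change below, pass a real device node.]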
Signed-off-by: Baolin Wang Signed-off-by: Vinod Koul --- drivers/dma/imx-sdma.c | 9 ++------- include/linux/platform_data/dma-imx.h | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 99d9f431ae2c..ca296f0849ef 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1934,16 +1934,11 @@ disable_clk_ipg: static bool sdma_filter_fn(struct dma_chan *chan, void *fn_param) { struct sdma_channel *sdmac = to_sdma_chan(chan); - struct sdma_engine *sdma = sdmac->sdma; struct imx_dma_data *data = fn_param; if (!imx_dma_is_general_purpose(chan)) return false; - /* return false if it's not the right device */ - if (sdma->dev->of_node != data->of_node) - return false; - sdmac->data = *data; chan->private = &sdmac->data; @@ -1971,9 +1966,9 @@ static struct dma_chan *sdma_xlate(struct of_phandle_args *dma_spec, * be set to sdmac->event_id1. */ data.dma_request2 = 0; - data.of_node = ofdma->of_node; - return dma_request_channel(mask, sdma_filter_fn, &data); + return __dma_request_channel(&mask, sdma_filter_fn, &data, + ofdma->of_node); } static int sdma_probe(struct platform_device *pdev) diff --git a/include/linux/platform_data/dma-imx.h b/include/linux/platform_data/dma-imx.h index 9daea8d42a10..7d964e787299 100644 --- a/include/linux/platform_data/dma-imx.h +++ b/include/linux/platform_data/dma-imx.h @@ -55,7 +55,6 @@ struct imx_dma_data { int dma_request2; /* secondary DMA request line */ enum sdma_peripheral_type peripheral_type; int priority; - struct device_node *of_node; }; static inline int imx_dma_is_ipu(struct dma_chan *chan) -- cgit v1.2.3-59-g8ed1b From d27ac2e02bf256d4e824e7c1e1e1afa2b96cefcc Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 27 May 2019 09:55:16 +0300 Subject: include: fpga: adi-axi-common.h: add common regs & defs header The AXI HDL cores provided for Analog Devices reference designs all share some common base registers (e.g. version register at address 0x00). To reduce duplication for this, a common header is added to define these registers as well as bitfields & macros to work with these registers. Signed-off-by: Alexandru Ardelean Signed-off-by: Vinod Koul --- include/linux/fpga/adi-axi-common.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/fpga/adi-axi-common.h (limited to 'include/linux') diff --git a/include/linux/fpga/adi-axi-common.h b/include/linux/fpga/adi-axi-common.h new file mode 100644 index 000000000000..7fc95d5c95bb --- /dev/null +++ b/include/linux/fpga/adi-axi-common.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Analog Devices AXI common registers & definitions + * + * Copyright 2019 Analog Devices Inc. + * + * https://wiki.analog.com/resources/fpga/docs/axi_ip + * https://wiki.analog.com/resources/fpga/docs/hdl/regmap + */ + +#ifndef ADI_AXI_COMMON_H_ +#define ADI_AXI_COMMON_H_ + +#define ADI_AXI_REG_VERSION 0x0000 + +#define ADI_AXI_PCORE_VER(major, minor, patch) \ + (((major) << 16) | ((minor) << 8) | (patch)) + +#endif /* ADI_AXI_COMMON_H_ */ -- cgit v1.2.3-59-g8ed1b From 153969fd952d81ab8f57574f9be1a90b0a0fa791 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 21 May 2019 03:38:25 +0200 Subject: ARM: versatile: Drop CLCD platform data The Versatile family no longer makes any use of the CLCD platform data, we have moved over all users to the DRM driver that has built-in handling of the displays. Delete the old auxdata. 
Signed-off-by: Linus Walleij --- arch/arm/mach-versatile/versatile_dt.c | 157 --------------------- include/linux/platform_data/video-clcd-versatile.h | 28 ---- 2 files changed, 185 deletions(-) delete mode 100644 include/linux/platform_data/video-clcd-versatile.h (limited to 'include/linux') diff --git a/arch/arm/mach-versatile/versatile_dt.c b/arch/arm/mach-versatile/versatile_dt.c index 028463af726d..b5ff1ea5a944 100644 --- a/arch/arm/mach-versatile/versatile_dt.c +++ b/arch/arm/mach-versatile/versatile_dt.c @@ -29,8 +29,6 @@ #include #include #include -#include -#include #include #include #include @@ -47,14 +45,12 @@ */ #define VERSATILE_SYS_PCICTL_OFFSET 0x44 #define VERSATILE_SYS_MCI_OFFSET 0x48 -#define VERSATILE_SYS_CLCD_OFFSET 0x50 /* * VERSATILE peripheral addresses */ #define VERSATILE_MMCI0_BASE 0x10005000 /* MMC interface */ #define VERSATILE_MMCI1_BASE 0x1000B000 /* MMC Interface */ -#define VERSATILE_CLCD_BASE 0x10120000 /* CLCD */ #define VERSATILE_SCTL_BASE 0x101E0000 /* System controller */ #define VERSATILE_IB2_BASE 0x24000000 /* IB2 module */ #define VERSATILE_IB2_CTL_BASE (VERSATILE_IB2_BASE + 0x03000000) @@ -96,158 +92,6 @@ static struct mmci_platform_data mmc1_plat_data = { .status = mmc_status, }; -/* - * CLCD support. - */ -#define SYS_CLCD_MODE_MASK (3 << 0) -#define SYS_CLCD_MODE_888 (0 << 0) -#define SYS_CLCD_MODE_5551 (1 << 0) -#define SYS_CLCD_MODE_565_RLSB (2 << 0) -#define SYS_CLCD_MODE_565_BLSB (3 << 0) -#define SYS_CLCD_NLCDIOON (1 << 2) -#define SYS_CLCD_VDDPOSSWITCH (1 << 3) -#define SYS_CLCD_PWR3V5SWITCH (1 << 4) -#define SYS_CLCD_ID_MASK (0x1f << 8) -#define SYS_CLCD_ID_SANYO_3_8 (0x00 << 8) -#define SYS_CLCD_ID_UNKNOWN_8_4 (0x01 << 8) -#define SYS_CLCD_ID_EPSON_2_2 (0x02 << 8) -#define SYS_CLCD_ID_SANYO_2_5 (0x07 << 8) -#define SYS_CLCD_ID_VGA (0x1f << 8) - -static bool is_sanyo_2_5_lcd; - -/* - * Disable all display connectors on the interface module. - */ -static void versatile_clcd_disable(struct clcd_fb *fb) -{ - void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET; - u32 val; - - val = readl(sys_clcd); - val &= ~SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH; - writel(val, sys_clcd); - - /* - * If the LCD is Sanyo 2x5 in on the IB2 board, turn the back-light off - */ - if (of_machine_is_compatible("arm,versatile-ab") && is_sanyo_2_5_lcd) { - unsigned long ctrl; - - ctrl = readl(versatile_ib2_ctrl); - ctrl &= ~0x01; - writel(ctrl, versatile_ib2_ctrl); - } -} - -/* - * Enable the relevant connector on the interface module. 
- */ -static void versatile_clcd_enable(struct clcd_fb *fb) -{ - struct fb_var_screeninfo *var = &fb->fb.var; - void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET; - u32 val; - - val = readl(sys_clcd); - val &= ~SYS_CLCD_MODE_MASK; - - switch (var->green.length) { - case 5: - val |= SYS_CLCD_MODE_5551; - break; - case 6: - if (var->red.offset == 0) - val |= SYS_CLCD_MODE_565_RLSB; - else - val |= SYS_CLCD_MODE_565_BLSB; - break; - case 8: - val |= SYS_CLCD_MODE_888; - break; - } - - /* - * Set the MUX - */ - writel(val, sys_clcd); - - /* - * And now enable the PSUs - */ - val |= SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH; - writel(val, sys_clcd); - - /* - * If the LCD is Sanyo 2x5 in on the IB2 board, turn the back-light on - */ - if (of_machine_is_compatible("arm,versatile-ab") && is_sanyo_2_5_lcd) { - unsigned long ctrl; - - ctrl = readl(versatile_ib2_ctrl); - ctrl |= 0x01; - writel(ctrl, versatile_ib2_ctrl); - } -} - -/* - * Detect which LCD panel is connected, and return the appropriate - * clcd_panel structure. Note: we do not have any information on - * the required timings for the 8.4in panel, so we presently assume - * VGA timings. - */ -static int versatile_clcd_setup(struct clcd_fb *fb) -{ - void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET; - const char *panel_name; - u32 val; - - is_sanyo_2_5_lcd = false; - - val = readl(sys_clcd) & SYS_CLCD_ID_MASK; - if (val == SYS_CLCD_ID_SANYO_3_8) - panel_name = "Sanyo TM38QV67A02A"; - else if (val == SYS_CLCD_ID_SANYO_2_5) { - panel_name = "Sanyo QVGA Portrait"; - is_sanyo_2_5_lcd = true; - } else if (val == SYS_CLCD_ID_EPSON_2_2) - panel_name = "Epson L2F50113T00"; - else if (val == SYS_CLCD_ID_VGA) - panel_name = "VGA"; - else { - printk(KERN_ERR "CLCD: unknown LCD panel ID 0x%08x, using VGA\n", - val); - panel_name = "VGA"; - } - - fb->panel = versatile_clcd_get_panel(panel_name); - if (!fb->panel) - return -EINVAL; - - return versatile_clcd_setup_dma(fb, SZ_1M); -} - -static void versatile_clcd_decode(struct clcd_fb *fb, struct clcd_regs *regs) -{ - clcdfb_decode(fb, regs); - - /* Always clear BGR for RGB565: we do the routing externally */ - if (fb->fb.var.green.length == 6) - regs->cntl &= ~CNTL_BGR; -} - -static struct clcd_board clcd_plat_data = { - .name = "Versatile", - .caps = CLCD_CAP_5551 | CLCD_CAP_565 | CLCD_CAP_888, - .check = clcdfb_check, - .decode = versatile_clcd_decode, - .disable = versatile_clcd_disable, - .enable = versatile_clcd_enable, - .setup = versatile_clcd_setup, - .mmap = versatile_clcd_mmap_dma, - .remove = versatile_clcd_remove_dma, -}; - /* * Lookup table for attaching a specific name and platform_data pointer to * devices as they get created by of_platform_populate(). 
Ideally this table @@ -257,7 +101,6 @@ static struct clcd_board clcd_plat_data = { struct of_dev_auxdata versatile_auxdata_lookup[] __initdata = { OF_DEV_AUXDATA("arm,primecell", VERSATILE_MMCI0_BASE, "fpga:05", &mmc0_plat_data), OF_DEV_AUXDATA("arm,primecell", VERSATILE_MMCI1_BASE, "fpga:0b", &mmc1_plat_data), - OF_DEV_AUXDATA("arm,primecell", VERSATILE_CLCD_BASE, "dev:20", &clcd_plat_data), {} }; diff --git a/include/linux/platform_data/video-clcd-versatile.h b/include/linux/platform_data/video-clcd-versatile.h deleted file mode 100644 index 305ebaec3afd..000000000000 --- a/include/linux/platform_data/video-clcd-versatile.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef PLAT_CLCD_H -#define PLAT_CLCD_H - -#ifdef CONFIG_PLAT_VERSATILE_CLCD -struct clcd_panel *versatile_clcd_get_panel(const char *); -int versatile_clcd_setup_dma(struct clcd_fb *, unsigned long); -int versatile_clcd_mmap_dma(struct clcd_fb *, struct vm_area_struct *); -void versatile_clcd_remove_dma(struct clcd_fb *); -#else -static inline struct clcd_panel *versatile_clcd_get_panel(const char *s) -{ - return NULL; -} -static inline int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize) -{ - return -ENODEV; -} -static inline int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vm) -{ - return -ENODEV; -} -static inline void versatile_clcd_remove_dma(struct clcd_fb *fb) -{ -} -#endif - -#endif -- cgit v1.2.3-59-g8ed1b From 2b2f7def058a5386838ef4dba70a860285f79e66 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 May 2019 04:51:53 -0700 Subject: bus: ti-sysc: Add support for missing clockdomain handling We need to let ti-sysc driver manage clockdomain autoidle for the duration of of reset, enable and idle. And we need to do it before we enable the clock and after we disable it. Currently we are still relying on platform callbacks indirectly managing clockdomain autoidle. But I noticed that for device tree only probed drivers it now happens only after we enabling the clocks and before we disable the clocks, while it should be the other way around. So far I have not noticed any issues with this though. Let's add new ti_sysc_clkdm_deny_idle() and ti_sysc_clkdm_allow_idle() functions for ti-sysc driver to use to manage clockdomains directly via platform data callbacks. Note that we can implement the clockdomain functions in pdata-quirks.c as for probing devices without "ti,hwmods" custom property we don't need to use the other platform data callbacks. Let's do this in one patch as there's is still an unlikely chance we may need to apply this as a fix for v5.2 for dropping legacy platform data for some devices. We also do have the option of adding back the platform data if needed in case of trouble. 
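[ed. note: the resulting call ordering, reduced to a sketch — the clockdomain is blocked from autoidle before the module clocks are touched and re-allowed only afterwards; error handling is omitted and names follow the patch below:

	/* in the ti-sysc runtime resume path */
	sysc_clkdm_deny_idle(ddata);	/* pdata->clkdm_deny_idle() */
	error = sysc_enable_main_clocks(ddata);
	if (!error)
		error = sysc_enable_module(ddata->dev);
	sysc_clkdm_allow_idle(ddata);	/* pdata->clkdm_allow_idle() */

The runtime suspend path brackets sysc_disable_module() and sysc_disable_main_clocks() the same way.]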
Tested-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap_hwmod.c | 39 ++--------- arch/arm/mach-omap2/pdata-quirks.c | 60 ++++++++++++++++ drivers/bus/ti-sysc.c | 127 +++++++++++++++++++++++++++------- include/linux/platform_data/ti-sysc.h | 8 +++ 4 files changed, 174 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 405ac24def05..932ba221e8e7 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -3445,6 +3445,7 @@ static int omap_hwmod_check_module(struct device *dev, * @dev: struct device * @oh: module * @sysc_fields: sysc register bits + * @clockdomain: clockdomain * @rev_offs: revision register offset * @sysc_offs: sysconfig register offset * @syss_offs: sysstatus register offset @@ -3456,6 +3457,7 @@ static int omap_hwmod_check_module(struct device *dev, static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh, const struct ti_sysc_module_data *data, struct sysc_regbits *sysc_fields, + struct clockdomain *clkdm, s32 rev_offs, s32 sysc_offs, s32 syss_offs, u32 sysc_flags, u32 idlemodes) @@ -3463,8 +3465,6 @@ static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh, struct omap_hwmod_class_sysconfig *sysc; struct omap_hwmod_class *class = NULL; struct omap_hwmod_ocp_if *oi = NULL; - struct clockdomain *clkdm = NULL; - struct clk *clk = NULL; void __iomem *regs = NULL; unsigned long flags; @@ -3511,36 +3511,6 @@ static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh, oi->user = OCP_USER_MPU | OCP_USER_SDMA; } - if (!oh->_clk) { - struct clk_hw_omap *hwclk; - - clk = of_clk_get_by_name(dev->of_node, "fck"); - if (!IS_ERR(clk)) - clk_prepare(clk); - else - clk = NULL; - - /* - * Populate clockdomain based on dts clock. It is needed for - * clkdm_deny_idle() and clkdm_allow_idle() until we have have - * interconnect driver and reset driver capable of blocking - * clockdomain idle during reset, enable and idle. - */ - if (clk) { - hwclk = to_clk_hw_omap(__clk_get_hw(clk)); - if (hwclk && hwclk->clkdm_name) - clkdm = clkdm_lookup(hwclk->clkdm_name); - } - - /* - * Note that we assume interconnect driver manages the clocks - * and do not need to populate oh->_clk for dynamically - * allocated modules. 
- */ - clk_unprepare(clk); - clk_put(clk); - } - spin_lock_irqsave(&oh->_lock, flags); if (regs) oh->_mpu_rt_va = regs; @@ -3626,7 +3596,7 @@ int omap_hwmod_init_module(struct device *dev, u32 sysc_flags, idlemodes; int error; - if (!dev || !data) + if (!dev || !data || !data->name || !cookie) return -EINVAL; oh = _lookup(data->name); @@ -3697,7 +3667,8 @@ int omap_hwmod_init_module(struct device *dev, return error; return omap_hwmod_allocate_module(dev, oh, data, sysc_fields, - rev_offs, sysc_offs, syss_offs, + cookie->clkdm, rev_offs, + sysc_offs, syss_offs, sysc_flags, idlemodes); } diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index a2ecc5e69abb..b09cc4e8d240 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -29,6 +29,7 @@ #include #include +#include "clockdomain.h" #include "common.h" #include "common-board-devices.h" #include "control.h" @@ -463,6 +464,62 @@ static void __init dra7x_evm_mmc_quirk(void) } #endif +static struct clockdomain *ti_sysc_find_one_clockdomain(struct clk *clk) +{ + struct clockdomain *clkdm = NULL; + struct clk_hw_omap *hwclk; + + hwclk = to_clk_hw_omap(__clk_get_hw(clk)); + if (hwclk && hwclk->clkdm_name) + clkdm = clkdm_lookup(hwclk->clkdm_name); + + return clkdm; +} + +/** + * ti_sysc_clkdm_init - find clockdomain based on clock + * @fck: device functional clock + * @ick: device interface clock + * @dev: struct device + * + * Populate clockdomain based on clock. It is needed for + * clkdm_deny_idle() and clkdm_allow_idle() for blocking clockdomain + * clockdomain idle during reset, enable and idle. + * + * Note that we assume interconnect driver manages the clocks + * and do not need to populate oh->_clk for dynamically + * allocated modules. 
+ */ +static int ti_sysc_clkdm_init(struct device *dev, + struct clk *fck, struct clk *ick, + struct ti_sysc_cookie *cookie) +{ + if (fck) + cookie->clkdm = ti_sysc_find_one_clockdomain(fck); + if (cookie->clkdm) + return 0; + if (ick) + cookie->clkdm = ti_sysc_find_one_clockdomain(ick); + if (cookie->clkdm) + return 0; + + return -ENODEV; +} + +static void ti_sysc_clkdm_deny_idle(struct device *dev, + const struct ti_sysc_cookie *cookie) +{ + if (cookie->clkdm) + clkdm_deny_idle(cookie->clkdm); +} + +static void ti_sysc_clkdm_allow_idle(struct device *dev, + const struct ti_sysc_cookie *cookie) +{ + if (cookie->clkdm) + clkdm_allow_idle(cookie->clkdm); +} + static int ti_sysc_enable_module(struct device *dev, const struct ti_sysc_cookie *cookie) { @@ -494,6 +551,9 @@ static struct of_dev_auxdata omap_auxdata_lookup[]; static struct ti_sysc_platform_data ti_sysc_pdata = { .auxdata = omap_auxdata_lookup, + .init_clockdomain = ti_sysc_clkdm_init, + .clkdm_deny_idle = ti_sysc_clkdm_deny_idle, + .clkdm_allow_idle = ti_sysc_clkdm_allow_idle, .init_module = omap_hwmod_init_module, .enable_module = ti_sysc_enable_module, .idle_module = ti_sysc_idle_module, diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index b72741668c92..e86f7850206a 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -422,6 +422,30 @@ static void sysc_disable_opt_clocks(struct sysc *ddata) } } +static void sysc_clkdm_deny_idle(struct sysc *ddata) +{ + struct ti_sysc_platform_data *pdata; + + if (ddata->legacy_mode) + return; + + pdata = dev_get_platdata(ddata->dev); + if (pdata && pdata->clkdm_deny_idle) + pdata->clkdm_deny_idle(ddata->dev, &ddata->cookie); +} + +static void sysc_clkdm_allow_idle(struct sysc *ddata) +{ + struct ti_sysc_platform_data *pdata; + + if (ddata->legacy_mode) + return; + + pdata = dev_get_platdata(ddata->dev); + if (pdata && pdata->clkdm_allow_idle) + pdata->clkdm_allow_idle(ddata->dev, &ddata->cookie); +} + /** * sysc_init_resets - init rstctrl reset line if configured * @ddata: device driver data @@ -795,6 +819,7 @@ static void sysc_show_registers(struct sysc *ddata) #define SYSC_IDLE_MASK (SYSC_NR_IDLEMODES - 1) +/* Caller needs to manage sysc_clkdm_deny_idle() and sysc_clkdm_allow_idle() */ static int sysc_enable_module(struct device *dev) { struct sysc *ddata; @@ -805,11 +830,6 @@ static int sysc_enable_module(struct device *dev) if (ddata->offsets[SYSC_SYSCONFIG] == -ENODEV) return 0; - /* - * TODO: Need to prevent clockdomain autoidle? - * See clkdm_deny_idle() in arch/mach-omap2/omap_hwmod.c - */ - regbits = ddata->cap->regbits; reg = sysc_read(ddata, ddata->offsets[SYSC_SYSCONFIG]); @@ -861,6 +881,7 @@ static int sysc_best_idle_mode(u32 idlemodes, u32 *best_mode) return 0; } +/* Caller needs to manage sysc_clkdm_deny_idle() and sysc_clkdm_allow_idle() */ static int sysc_disable_module(struct device *dev) { struct sysc *ddata; @@ -872,11 +893,6 @@ static int sysc_disable_module(struct device *dev) if (ddata->offsets[SYSC_SYSCONFIG] == -ENODEV) return 0; - /* - * TODO: Need to prevent clockdomain autoidle? 
- * See clkdm_deny_idle() in arch/mach-omap2/omap_hwmod.c - */ - regbits = ddata->cap->regbits; reg = sysc_read(ddata, ddata->offsets[SYSC_SYSCONFIG]); @@ -966,14 +982,16 @@ static int __maybe_unused sysc_runtime_suspend(struct device *dev) if (!ddata->enabled) return 0; + sysc_clkdm_deny_idle(ddata); + if (ddata->legacy_mode) { error = sysc_runtime_suspend_legacy(dev, ddata); if (error) - return error; + goto err_allow_idle; } else { error = sysc_disable_module(dev); if (error) - return error; + goto err_allow_idle; } sysc_disable_main_clocks(ddata); @@ -983,6 +1001,9 @@ static int __maybe_unused sysc_runtime_suspend(struct device *dev) ddata->enabled = false; +err_allow_idle: + sysc_clkdm_allow_idle(ddata); + return error; } @@ -996,10 +1017,12 @@ static int __maybe_unused sysc_runtime_resume(struct device *dev) if (ddata->enabled) return 0; + sysc_clkdm_deny_idle(ddata); + if (sysc_opt_clks_needed(ddata)) { error = sysc_enable_opt_clocks(ddata); if (error) - return error; + goto err_allow_idle; } error = sysc_enable_main_clocks(ddata); @@ -1018,6 +1041,8 @@ static int __maybe_unused sysc_runtime_resume(struct device *dev) ddata->enabled = true; + sysc_clkdm_allow_idle(ddata); + return 0; err_main_clocks: @@ -1025,6 +1050,8 @@ err_main_clocks: err_opt_clocks: if (sysc_opt_clks_needed(ddata)) sysc_disable_opt_clocks(ddata); +err_allow_idle: + sysc_clkdm_allow_idle(ddata); return error; } @@ -1245,6 +1272,33 @@ static void sysc_init_revision_quirks(struct sysc *ddata) } } +static int sysc_clockdomain_init(struct sysc *ddata) +{ + struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev); + struct clk *fck = NULL, *ick = NULL; + int error; + + if (!pdata || !pdata->init_clockdomain) + return 0; + + switch (ddata->nr_clocks) { + case 2: + ick = ddata->clocks[SYSC_ICK]; + /* fallthrough */ + case 1: + fck = ddata->clocks[SYSC_FCK]; + break; + case 0: + return 0; + } + + error = pdata->init_clockdomain(ddata->dev, fck, ick, &ddata->cookie); + if (!error || error == -ENODEV) + return 0; + + return error; +} + /* * Note that pdata->init_module() typically does a reset first. 
After * pdata->init_module() is done, PM runtime can be used for the interconnect @@ -1255,7 +1309,7 @@ static int sysc_legacy_init(struct sysc *ddata) struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev); int error; - if (!ddata->legacy_mode || !pdata || !pdata->init_module) + if (!pdata || !pdata->init_module) return 0; error = pdata->init_module(ddata->dev, ddata->mdata, &ddata->cookie); @@ -1347,7 +1401,13 @@ static int sysc_init_module(struct sysc *ddata) (SYSC_QUIRK_NO_IDLE | SYSC_QUIRK_NO_IDLE_ON_INIT)) manage_clocks = false; + error = sysc_clockdomain_init(ddata); + if (error) + return error; + if (manage_clocks) { + sysc_clkdm_deny_idle(ddata); + error = sysc_enable_opt_clocks(ddata); if (error) return error; @@ -1360,20 +1420,33 @@ static int sysc_init_module(struct sysc *ddata) ddata->revision = sysc_read_revision(ddata); sysc_init_revision_quirks(ddata); - error = sysc_legacy_init(ddata); - if (error) - goto err_main_clocks; + if (ddata->legacy_mode) { + error = sysc_legacy_init(ddata); + if (error) + goto err_main_clocks; + } + + if (!ddata->legacy_mode && manage_clocks) { + error = sysc_enable_module(ddata->dev); + if (error) + goto err_main_clocks; + } error = sysc_reset(ddata); if (error) dev_err(ddata->dev, "Reset failed with %d\n", error); + if (!ddata->legacy_mode && manage_clocks) + sysc_disable_module(ddata->dev); + err_main_clocks: if (manage_clocks) sysc_disable_main_clocks(ddata); err_opt_clocks: - if (manage_clocks) + if (manage_clocks) { sysc_disable_opt_clocks(ddata); + sysc_clkdm_allow_idle(ddata); + } return error; } @@ -2012,20 +2085,22 @@ static int sysc_init_pdata(struct sysc *ddata) struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev); struct ti_sysc_module_data *mdata; - if (!pdata || !ddata->legacy_mode) + if (!pdata) return 0; mdata = devm_kzalloc(ddata->dev, sizeof(*mdata), GFP_KERNEL); if (!mdata) return -ENOMEM; - mdata->name = ddata->legacy_mode; - mdata->module_pa = ddata->module_pa; - mdata->module_size = ddata->module_size; - mdata->offsets = ddata->offsets; - mdata->nr_offsets = SYSC_MAX_REGS; - mdata->cap = ddata->cap; - mdata->cfg = &ddata->cfg; + if (ddata->legacy_mode) { + mdata->name = ddata->legacy_mode; + mdata->module_pa = ddata->module_pa; + mdata->module_size = ddata->module_size; + mdata->offsets = ddata->offsets; + mdata->nr_offsets = SYSC_MAX_REGS; + mdata->cap = ddata->cap; + mdata->cfg = &ddata->cfg; + } ddata->mdata = mdata; diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 9256c0305968..6626fd31e309 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -19,6 +19,7 @@ enum ti_sysc_module_type { struct ti_sysc_cookie { void *data; + void *clkdm; }; /** @@ -125,9 +126,16 @@ struct ti_sysc_module_data { }; struct device; +struct clk; struct ti_sysc_platform_data { struct of_dev_auxdata *auxdata; + int (*init_clockdomain)(struct device *dev, struct clk *fck, + struct clk *ick, struct ti_sysc_cookie *cookie); + void (*clkdm_deny_idle)(struct device *dev, + const struct ti_sysc_cookie *cookie); + void (*clkdm_allow_idle)(struct device *dev, + const struct ti_sysc_cookie *cookie); int (*init_module)(struct device *dev, const struct ti_sysc_module_data *data, struct ti_sysc_cookie *cookie); -- cgit v1.2.3-59-g8ed1b From e0db94fe87dacd72be0699adcc29e321db7f1689 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 May 2019 04:51:53 -0700 Subject: bus: ti-sysc: Make OCP reset work for sysstatus and sysconfig reset bits 
We've had minimal OCP softreset support in ti-sysc interconnect target module driver only used for MCAN driver so far. But it turns out that MCAN has the sysstatus register resetdone bit inverted compared to most other modules. Let's make OCP softreset work for other typical cases with reset status in sysstatus or sysconfig register so we can use the new functions for sysc_enable_module() and sysc_disable_module() without "ti,hwmods" property in the following patches. Tested-by: Keerthy Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 72 ++++++++++++++++++++++++++--------- include/linux/platform_data/ti-sysc.h | 1 + 2 files changed, 55 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index f00997eea207..f4a048430cd1 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -153,6 +153,26 @@ static u32 sysc_read_revision(struct sysc *ddata) return sysc_read(ddata, offset); } +static u32 sysc_read_sysconfig(struct sysc *ddata) +{ + int offset = ddata->offsets[SYSC_SYSCONFIG]; + + if (offset < 0) + return 0; + + return sysc_read(ddata, offset); +} + +static u32 sysc_read_sysstatus(struct sysc *ddata) +{ + int offset = ddata->offsets[SYSC_SYSSTATUS]; + + if (offset < 0) + return 0; + + return sysc_read(ddata, offset); +} + static int sysc_add_named_clock_from_child(struct sysc *ddata, const char *name, const char *optfck_name) @@ -1369,34 +1389,49 @@ static int sysc_rstctrl_reset_deassert(struct sysc *ddata, bool reset) return reset_control_deassert(ddata->rsts); } +/* + * Note that the caller must ensure the interconnect target module is enabled + * before calling reset. Otherwise reset will not complete. + */ static int sysc_reset(struct sysc *ddata) { - int offset = ddata->offsets[SYSC_SYSCONFIG]; - int val; + int sysc_offset, syss_offset, sysc_val, rstval, quirks, error = 0; + u32 sysc_mask, syss_done; + + sysc_offset = ddata->offsets[SYSC_SYSCONFIG]; + syss_offset = ddata->offsets[SYSC_SYSSTATUS]; + quirks = ddata->cfg.quirks; - if (ddata->legacy_mode || offset < 0 || + if (ddata->legacy_mode || sysc_offset < 0 || + ddata->cap->regbits->srst_shift < 0 || ddata->cfg.quirks & SYSC_QUIRK_NO_RESET_ON_INIT) return 0; - /* - * Currently only support reset status in sysstatus. - * Warn and return error in all other cases - */ - if (!ddata->cfg.syss_mask) { - dev_err(ddata->dev, "No ti,syss-mask. 
Reset failed\n"); - return -EINVAL; - } + sysc_mask = BIT(ddata->cap->regbits->srst_shift); - val = sysc_read(ddata, offset); - val |= (0x1 << ddata->cap->regbits->srst_shift); - sysc_write(ddata, offset, val); + if (ddata->cfg.quirks & SYSS_QUIRK_RESETDONE_INVERTED) + syss_done = 0; + else + syss_done = ddata->cfg.syss_mask; + + sysc_val = sysc_read_sysconfig(ddata); + sysc_val |= sysc_mask; + sysc_write(ddata, sysc_offset, sysc_val); /* Poll on reset status */ - offset = ddata->offsets[SYSC_SYSSTATUS]; + if (syss_offset >= 0) { + error = readx_poll_timeout(sysc_read_sysstatus, ddata, rstval, + (rstval & ddata->cfg.syss_mask) == + syss_done, + 100, MAX_MODULE_SOFTRESET_WAIT); + + } else if (ddata->cfg.quirks & SYSC_QUIRK_RESET_STATUS) { + error = readx_poll_timeout(sysc_read_sysconfig, ddata, rstval, + !(rstval & sysc_mask), + 100, MAX_MODULE_SOFTRESET_WAIT); + } - return readl_poll_timeout(ddata->module_va + offset, val, - (val & ddata->cfg.syss_mask) == 0x0, - 100, MAX_MODULE_SOFTRESET_WAIT); + return error; } /* @@ -2099,6 +2134,7 @@ static const struct sysc_capabilities sysc_dra7_mcan = { .type = TI_SYSC_DRA7_MCAN, .sysc_mask = SYSC_DRA7_MCAN_ENAWAKEUP | SYSC_OMAP4_SOFTRESET, .regbits = &sysc_regbits_dra7_mcan, + .mod_quirks = SYSS_QUIRK_RESETDONE_INVERTED, }; static int sysc_init_pdata(struct sysc *ddata) diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 6626fd31e309..8822e99ff813 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -47,6 +47,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSS_QUIRK_RESETDONE_INVERTED BIT(14) #define SYSC_QUIRK_SWSUP_MSTANDBY BIT(13) #define SYSC_QUIRK_SWSUP_SIDLE_ACT BIT(12) #define SYSC_QUIRK_SWSUP_SIDLE BIT(11) -- cgit v1.2.3-59-g8ed1b From 229b4e0728e0a6ddca2645e73696d5b104fbbbfb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:24 +0800 Subject: Documentation: PCI: convert pci.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Move the description of struct pci_driver and struct pci_device_id into in-source comments. Signed-off-by: Changbin Du [bhelgaas: fix kernel-doc warnings related to moving descriptions to linux/pci.h, fix "space tab" whitespace errors in mod_devicetable.h] Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/index.rst | 2 + Documentation/PCI/pci.rst | 578 ++++++++++++++++++++++++++++++++++++ Documentation/PCI/pci.txt | 636 ---------------------------------------- include/linux/mod_devicetable.h | 29 +- include/linux/pci.h | 48 ++- 5 files changed, 651 insertions(+), 642 deletions(-) create mode 100644 Documentation/PCI/pci.rst delete mode 100644 Documentation/PCI/pci.txt (limited to 'include/linux') diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index c2f8728d11cf..7babf43709b0 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -7,3 +7,5 @@ Linux PCI Bus Subsystem .. toctree:: :maxdepth: 2 :numbered: + + pci diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst new file mode 100644 index 000000000000..6864f9a70f5f --- /dev/null +++ b/Documentation/PCI/pci.rst @@ -0,0 +1,578 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +How To Write Linux PCI Drivers +============================== + +:Authors: - Martin Mares + - Grant Grundler + +The world of PCI is vast and full of (mostly unpleasant) surprises. 
+Since each CPU architecture implements different chip-sets and PCI devices +have different requirements (erm, "features"), the result is the PCI support +in the Linux kernel is not as trivial as one would wish. This short paper +tries to introduce all potential driver authors to Linux APIs for +PCI device drivers. + +A more complete resource is the third edition of "Linux Device Drivers" +by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman. +LDD3 is available for free (under Creative Commons License) from: +http://lwn.net/Kernel/LDD3/. + +However, keep in mind that all documents are subject to "bit rot". +Refer to the source code if things are not working as described here. + +Please send questions/comments/patches about Linux PCI API to the +"Linux PCI" mailing list. + + +Structure of PCI drivers +======================== +PCI drivers "discover" PCI devices in a system via pci_register_driver(). +Actually, it's the other way around. When the PCI generic code discovers +a new device, the driver with a matching "description" will be notified. +Details on this below. + +pci_register_driver() leaves most of the probing for devices to +the PCI layer and supports online insertion/removal of devices [thus +supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver]. +pci_register_driver() call requires passing in a table of function +pointers and thus dictates the high level structure of a driver. + +Once the driver knows about a PCI device and takes ownership, the +driver generally needs to perform the following initialization: + + - Enable the device + - Request MMIO/IOP resources + - Set the DMA mask size (for both coherent and streaming DMA) + - Allocate and initialize shared control data (pci_allocate_coherent()) + - Access device configuration space (if needed) + - Register IRQ handler (request_irq()) + - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip) + - Enable DMA/processing engines + +When done using the device, and perhaps the module needs to be unloaded, +the driver needs to take the follow steps: + + - Disable the device from generating IRQs + - Release the IRQ (free_irq()) + - Stop all DMA activity + - Release DMA buffers (both streaming and coherent) + - Unregister from other subsystems (e.g. scsi or netdev) + - Release MMIO/IOP resources + - Disable the device + +Most of these topics are covered in the following sections. +For the rest look at LDD3 or . + +If the PCI subsystem is not configured (CONFIG_PCI is not set), most of +the PCI functions described below are defined as inline functions either +completely empty or just returning an appropriate error codes to avoid +lots of ifdefs in the drivers. + + +pci_register_driver() call +========================== + +PCI device drivers call ``pci_register_driver()`` during their +initialization with a pointer to a structure describing the driver +(``struct pci_driver``): + +.. kernel-doc:: include/linux/pci.h + :functions: pci_driver + +The ID table is an array of ``struct pci_device_id`` entries ending with an +all-zero entry. Definitions with static const are generally preferred. + +.. kernel-doc:: include/linux/mod_devicetable.h + :functions: pci_device_id + +Most drivers only need ``PCI_DEVICE()`` or ``PCI_DEVICE_CLASS()`` to set up +a pci_device_id table. 
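+For example, a hypothetical driver claiming a single vendor/device
+pair (the IDs below are made up) might declare::
+
+	static const struct pci_device_id foo_ids[] = {
+		{ PCI_DEVICE(0x1234, 0x5678) },
+		{ 0, }
+	};
+	MODULE_DEVICE_TABLE(pci, foo_ids);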
+
+New PCI IDs may be added to a device driver pci_ids table at runtime
+as shown below::
+
+  echo "vendor device subvendor subdevice class class_mask driver_data" > \
+  /sys/bus/pci/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The vendor and device fields are mandatory, the others are optional. Users
+need to pass only as many optional fields as necessary:
+
+  - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
+  - class and classmask fields default to 0
+  - driver_data defaults to 0UL.
+
+Note that driver_data must match the value used by any of the pci_device_id
+entries defined in the driver. This makes the driver_data field mandatory
+if all the pci_device_id entries have a non-zero driver_data value.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCI devices listed in its (newly updated) pci_ids list.
+
+When the driver exits, it just calls pci_unregister_driver() and the PCI layer
+automatically calls the remove hook for all devices handled by the driver.
+
+
+"Attributes" for driver functions/data
+--------------------------------------
+
+Please mark the initialization and cleanup functions where appropriate
+(the corresponding macros are defined in <linux/init.h>):
+
+	====== =================================================
+	__init Initialization code. Thrown away after the driver
+	       initializes.
+	__exit Exit code. Ignored for non-modular drivers.
+	====== =================================================
+
+Tips on when/where to use the above attributes:
+  - The module_init()/module_exit() functions (and all
+    initialization functions called _only_ from these)
+    should be marked __init/__exit.
+
+  - Do not mark the struct pci_driver.
+
+  - Do NOT mark a function if you are not sure which mark to use.
+    Better to not mark the function than mark the function wrong.
+
+
+How to find PCI devices manually
+================================
+
+PCI drivers should have a really good reason for not using the
+pci_register_driver() interface to search for PCI devices.
+The main reason PCI devices are controlled by multiple drivers
+is that one PCI device implements several different HW services.
+E.g. combined serial/parallel port/floppy controller.
+
+A manual search may be performed using the following constructs:
+
+Searching by vendor and device ID::
+
+	struct pci_dev *dev = NULL;
+	while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
+		configure_device(dev);
+
+Searching by class ID (iterate in a similar way)::
+
+	pci_get_class(CLASS_ID, dev)
+
+Searching by both vendor/device and subsystem vendor/device ID::
+
+	pci_get_subsys(VENDOR_ID, DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev)
+
+You can use the constant PCI_ANY_ID as a wildcard replacement for
+VENDOR_ID or DEVICE_ID. This allows searching for any device from a
+specific vendor, for example.
+
+These functions are hotplug-safe. They increment the reference count on
+the pci_dev that they return. You must eventually (possibly at module unload)
+decrement the reference count on these devices by calling pci_dev_put().
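+
+To make the reference counting explicit, here is a hedged sketch of the
+same loop (``configure_device()`` and the IDs are placeholders)::
+
+	struct pci_dev *dev = NULL;
+
+	while ((dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev)) != NULL) {
+		configure_device(dev);	/* dev was returned with a reference held */
+	}
+	/* pci_get_device() drops the reference on the "from" argument on each
+	 * iteration, so only an early break out of the loop needs an explicit
+	 * pci_dev_put(dev). */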
+
+
+Device Initialization Steps
+===========================
+
+As noted in the introduction, most PCI drivers need the following steps
+for device initialization:
+
+  - Enable the device
+  - Request MMIO/IOP resources
+  - Set the DMA mask size (for both coherent and streaming DMA)
+  - Allocate and initialize shared control data (pci_allocate_coherent())
+  - Access device configuration space (if needed)
+  - Register IRQ handler (request_irq())
+  - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+  - Enable DMA/processing engines.
+
+The driver can access PCI config space registers at any time.
+(Well, almost. When running BIST, config space can go away...but
+that will just result in a PCI Bus Master Abort and config reads
+will return garbage).
+
+
+Enable the PCI device
+---------------------
+Before touching any device registers, the driver needs to enable
+the PCI device by calling pci_enable_device(). This will:
+
+  - wake up the device if it was in suspended state,
+  - allocate I/O and memory regions of the device (if BIOS did not),
+  - allocate an IRQ (if BIOS did not).
+
+.. note::
+   pci_enable_device() can fail! Check the return value.
+
+.. warning::
+   OS BUG: we don't check resource allocations before enabling those
+   resources. The sequence would make more sense if we called
+   pci_request_resources() before calling pci_enable_device().
+   Currently, the device drivers can't detect the bug when two
+   devices have been allocated the same range. This is not a common
+   problem and unlikely to get fixed soon.
+
+   This has been discussed before but not changed as of 2.6.19:
+   http://lkml.org/lkml/2006/3/2/194
+
+
+pci_set_master() will enable DMA by setting the bus master bit
+in the PCI_COMMAND register. It also fixes the latency timer value if
+it's set to something bogus by the BIOS. pci_clear_master() will
+disable DMA by clearing the bus master bit.
+
+If the PCI device can use the PCI Memory-Write-Invalidate transaction,
+call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval
+and also ensures that the cache line size register is set correctly.
+Check the return value of pci_set_mwi() as not all architectures
+or chip-sets may support Memory-Write-Invalidate. Alternatively,
+if Mem-Wr-Inval would be nice to have but is not required, call
+pci_try_set_mwi() to have the system do its best effort at enabling
+Mem-Wr-Inval.
+
+
+Request MMIO/IOP resources
+--------------------------
+Memory (MMIO) and I/O port addresses should NOT be read directly
+from the PCI device config space. Use the values in the pci_dev structure
+as the PCI "bus address" might have been remapped to a "host physical"
+address by the arch/chip-set specific kernel support.
+
+See Documentation/io-mapping.txt for how to access device registers
+or device memory.
+
+The device driver needs to call pci_request_region() to verify
+no other device is already using the same address resource.
+Conversely, drivers should call pci_release_region() AFTER
+calling pci_disable_device().
+The idea is to prevent two devices colliding on the same address range.
+
+.. tip::
+   See OS BUG comment above. Currently (2.6.19), the driver can only
+   determine MMIO and IO Port resource availability _after_ calling
+   pci_enable_device().
+
+Generic flavors of pci_request_region() are request_mem_region()
+(for MMIO ranges) and request_region() (for IO Port ranges).
+Use these for address resources that are not described by "normal" PCI
+BARs.
+
+Also see pci_request_selected_regions() below.
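+
+Putting the above together, a hedged probe() sketch (error handling only;
+the "foo" names and the use of BAR 0 are assumptions for illustration)::
+
+	static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+	{
+		void __iomem *regs;
+		int err;
+
+		err = pci_enable_device(pdev);
+		if (err)
+			return err;
+
+		err = pci_request_region(pdev, 0, "foo");	/* claim BAR 0 */
+		if (err)
+			goto err_disable;
+
+		regs = pci_iomap(pdev, 0, 0);	/* map all of BAR 0 */
+		if (!regs) {
+			err = -ENOMEM;
+			goto err_release;
+		}
+
+		/* device-specific setup would continue here */
+		return 0;
+
+	err_release:
+		pci_release_region(pdev, 0);
+	err_disable:
+		pci_disable_device(pdev);
+		return err;
+	}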
+
+
+Set the DMA mask size
+---------------------
+.. note::
+   If anything below doesn't make sense, please refer to
+   Documentation/DMA-API.txt. This section is just a reminder that
+   drivers need to indicate DMA capabilities of the device and is not
+   an authoritative source for DMA interfaces.
+
+While all drivers should explicitly indicate the DMA capability
+(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
+32-bit bus master capability for streaming data need the driver
+to "register" this capability by calling pci_set_dma_mask() with
+appropriate parameters. In general this allows more efficient DMA
+on systems where System RAM exists above 4G _physical_ address.
+
+Drivers for all PCI-X and PCIe compliant devices must call
+pci_set_dma_mask() as they are 64-bit DMA devices.
+
+Similarly, drivers must also "register" this capability if the device
+can directly address "consistent memory" in System RAM above 4G physical
+address by calling pci_set_consistent_dma_mask().
+Again, this includes drivers for all PCI-X and PCIe compliant devices.
+Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
+64-bit DMA capable for payload ("streaming") data but not control
+("consistent") data.
+
+
+Setup shared control data
+-------------------------
+Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
+memory. See Documentation/DMA-API.txt for a full description of
+the DMA APIs. This section is just a reminder that it needs to be done
+before enabling DMA on the device.
+
+
+Initialize device registers
+---------------------------
+Some drivers will need specific "capability" fields programmed
+or other "vendor specific" registers initialized or reset.
+E.g. clearing pending interrupts.
+
+
+Register IRQ handler
+--------------------
+While calling request_irq() is the last step described here,
+this is often just another intermediate step to initialize a device.
+This step can often be deferred until the device is opened for use.
+
+All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
+and use the devid to map IRQs to devices (remember that all PCI IRQ lines
+can be shared).
+
+request_irq() will associate an interrupt handler and device handle
+with an interrupt number. Historically interrupt numbers represent
+IRQ lines which run from the PCI device to the Interrupt controller.
+With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
+
+request_irq() also enables the interrupt. Make sure the device is
+quiesced and does not have any interrupts pending before registering
+the interrupt handler.
+
+MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
+which deliver interrupts to the CPU via a DMA write to a Local APIC.
+The fundamental difference between MSI and MSI-X is how multiple
+"vectors" get allocated. MSI requires contiguous blocks of vectors
+while MSI-X can allocate several individual ones.
+
+MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
+PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
+causes the PCI support to program CPU vector data into the PCI device
+capability registers. Many architectures, chip-sets, or BIOSes do NOT
+support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
+the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
+specify PCI_IRQ_LEGACY as well.
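+
+As a hedged sketch of that allocation (the handler name, cookie, and the
+single-vector request are illustrative assumptions)::
+
+	int nvec, err;
+
+	nvec = pci_alloc_irq_vectors(pdev, 1, 1,
+				     PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_LEGACY);
+	if (nvec < 0)
+		return nvec;
+
+	err = request_irq(pci_irq_vector(pdev, 0), foo_handler,
+			  IRQF_SHARED, "foo", foo_dev);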
+
+Drivers that have different interrupt handlers for MSI/MSI-X and
+legacy INTx should choose the right one based on the msi_enabled
+and msix_enabled flags in the pci_dev structure after calling
+pci_alloc_irq_vectors.
+
+There are (at least) two really good reasons for using MSI:
+
+1) MSI is an exclusive interrupt vector by definition.
+   This means the interrupt handler doesn't have to verify
+   its device caused the interrupt.
+
+2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
+   to be visible to the host CPU(s) when the MSI is delivered. This
+   is important for both data coherency and avoiding stale control data.
+   This guarantee allows the driver to omit MMIO reads to flush
+   the DMA stream.
+
+See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
+of MSI/MSI-X usage.
+
+
+PCI device shutdown
+===================
+
+When a PCI device driver is being unloaded, most of the following
+steps need to be performed:
+
+  - Disable the device from generating IRQs
+  - Release the IRQ (free_irq())
+  - Stop all DMA activity
+  - Release DMA buffers (both streaming and consistent)
+  - Unregister from other subsystems (e.g. scsi or netdev)
+  - Disable device from responding to MMIO/IO Port addresses
+  - Release MMIO/IO Port resource(s)
+
+
+Stop IRQs on the device
+-----------------------
+How to do this is chip/device specific. If it's not done, it opens
+the possibility of a "screaming interrupt" if (and only if)
+the IRQ is shared with another device.
+
+When the shared IRQ handler is "unhooked", the remaining devices
+using the same IRQ line will still need the IRQ enabled. Thus if the
+"unhooked" device asserts the IRQ line, the system will respond assuming
+it was one of the remaining devices that asserted the IRQ line. Since none
+of the other devices will handle the IRQ, the system will "hang" until
+it decides the IRQ isn't going to get handled and masks the IRQ (100,000
+iterations later). Once the shared IRQ is masked, the remaining devices
+will stop functioning properly. Not a nice situation.
+
+This is another reason to use MSI or MSI-X if it's available.
+MSI and MSI-X are defined to be exclusive interrupts and thus
+are not susceptible to the "screaming interrupt" problem.
+
+
+Release the IRQ
+---------------
+Once the device is quiesced (no more IRQs), one can call free_irq().
+This function will return control once any pending IRQs are handled,
+"unhook" the driver's IRQ handler from that IRQ, and finally release
+the IRQ if no one else is using it.
+
+
+Stop all DMA activity
+---------------------
+It's extremely important to stop all DMA operations BEFORE attempting
+to deallocate DMA control data. Failure to do so can result in memory
+corruption, hangs, and on some chip-sets a hard crash.
+
+Stopping DMA after stopping the IRQs can avoid races where the
+IRQ handler might restart DMA engines.
+
+While this step sounds obvious and trivial, several "mature" drivers
+didn't get this step right in the past.
+
+
+Release DMA buffers
+-------------------
+Once DMA is stopped, clean up streaming DMA first.
+I.e. unmap data buffers and return buffers to their "upstream"
+owner, if there is one.
+
+Then clean up "consistent" buffers which contain the control data.
+
+See Documentation/DMA-API.txt for details on unmapping interfaces.
+
+
+Unregister from other subsystems
+--------------------------------
+Most low level PCI device drivers support some other subsystem
+like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
+driver isn't losing resources from that other subsystem.
+If this happens, typically the symptom is an Oops (panic) when
+the subsystem attempts to call into a driver that has been unloaded.
+
+
+Disable Device from responding to MMIO/IO Port addresses
+--------------------------------------------------------
+iounmap() MMIO or IO Port resources and then call pci_disable_device().
+This is the symmetric opposite of pci_enable_device().
+Do not access device registers after calling pci_disable_device().
+
+
+Release MMIO/IO Port Resource(s)
+--------------------------------
+Call pci_release_region() to mark the MMIO or IO Port range as available.
+Failure to do so usually results in the inability to reload the driver.
+
+
+How to access PCI config space
+==============================
+
+You can use `pci_(read|write)_config_(byte|word|dword)` to access the config
+space of a device represented by `struct pci_dev *`. All these functions return
+0 when successful or an error code (`PCIBIOS_...`) which can be translated to a
+text string by pcibios_strerror. Most drivers expect that accesses to valid PCI
+devices don't fail.
+
+If you don't have a struct pci_dev available, you can call
+`pci_bus_(read|write)_config_(byte|word|dword)` to access a given device
+and function on that bus.
+
+If you access fields in the standard portion of the config header, please
+use symbolic names of locations and bits declared in <linux/pci_regs.h>.
+
+If you need to access Extended PCI Capability registers, just call
+pci_find_capability() for the particular capability and it will find the
+corresponding register block for you.
+
+
+Other interesting functions
+===========================
+
+============================= ================================================
+pci_get_domain_bus_and_slot() Find pci_dev corresponding to given domain,
+                              bus, and slot number. If the device is
+                              found, its reference count is increased.
+pci_set_power_state()         Set PCI Power Management state (0=D0 ... 3=D3)
+pci_find_capability()         Find specified capability in device's capability
+                              list.
+pci_resource_start()          Returns bus start address for a given PCI region
+pci_resource_end()            Returns bus end address for a given PCI region
+pci_resource_len()            Returns the byte length of a PCI region
+pci_set_drvdata()             Set private driver data pointer for a pci_dev
+pci_get_drvdata()             Return private driver data pointer for a pci_dev
+pci_set_mwi()                 Enable Memory-Write-Invalidate transactions.
+pci_clear_mwi()               Disable Memory-Write-Invalidate transactions.
+============================= ================================================
+
+
+Miscellaneous hints
+===================
+
+When displaying PCI device names to the user (for example when a driver wants
+to tell the user what card it has found), please use pci_name(pci_dev).
+
+Always refer to the PCI devices by a pointer to the pci_dev structure.
+All PCI layer functions use this identification and it's the only
+reasonable one. Don't use bus/slot/function numbers except for very
+special purposes -- on systems with multiple primary buses their semantics
+can be pretty complex.
+
+Don't try to turn on Fast Back to Back writes in your driver. All devices
+on the bus need to be capable of doing it, so this is something which needs
+to be handled by platform and generic code, not individual drivers.
+
+
+Vendor and device identifications
+=================================
+
+Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
+are shared across multiple drivers. You can add private definitions in
+your driver if they're helpful, or just use plain hex constants.
+
+The device IDs are arbitrary hex numbers (vendor controlled) and normally used
+only in a single location, the pci_device_id table.
+
+Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
+There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
+and https://github.com/pciutils/pciids.
+
+
+Obsolete functions
+==================
+
+There are several functions which you might come across when trying to
+port an old driver to the new PCI interface. They are no longer present
+in the kernel as they aren't compatible with hotplug or PCI domains or
+having sane locking.
+
+================= ===========================================
+pci_find_device() Superseded by pci_get_device()
+pci_find_subsys() Superseded by pci_get_subsys()
+pci_find_slot()   Superseded by pci_get_domain_bus_and_slot()
+pci_get_slot()    Superseded by pci_get_domain_bus_and_slot()
+================= ===========================================
+
+The alternative is the traditional PCI device driver that walks PCI
+device lists. This is still possible but discouraged.
+
+
+MMIO Space and "Write Posting"
+==============================
+
+Converting a driver from using I/O Port space to using MMIO space
+often requires some additional changes. Specifically, "write posting"
+needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
+already do this. I/O Port space guarantees write transactions reach the PCI
+device before the CPU can continue. Writes to MMIO space allow the CPU
+to continue before the transaction reaches the PCI device. HW weenies
+call this "Write Posting" because the write completion is "posted" to
+the CPU before the transaction has reached its destination.
+
+Thus, timing sensitive code should add readl() where the CPU is
+expected to wait before doing other work. The classic "bit banging"
+sequence works fine for I/O Port space::
+
+	for (i = 8; --i; val >>= 1) {
+		outb(val & 1, ioport_reg); /* write bit */
+		udelay(10);
+	}
+
+The same sequence for MMIO space should be::
+
+	for (i = 8; --i; val >>= 1) {
+		writeb(val & 1, mmio_reg); /* write bit */
+		readb(safe_mmio_reg); /* flush posted write */
+		udelay(10);
+	}
+
+It is important that "safe_mmio_reg" not have any side effects that
+interfere with the correct operation of the device.
+
+Another case to watch out for is when resetting a PCI device. Use PCI
+Configuration space reads to flush the writel(). This will gracefully
+handle the PCI master abort on all platforms if the PCI device is
+expected to not respond to a readl(). Most x86 platforms will allow
+MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
+(e.g. ~0). But many RISC platforms will crash (a.k.a. "Hard Fail").
diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt
deleted file mode 100644
index badb26ac33dc..000000000000
--- a/Documentation/PCI/pci.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-
-		How To Write Linux PCI Drivers
-
-		by Martin Mares <mj@ucw.cz> on 07-Feb-2000
-		updated by Grant Grundler <grundler@parisc-linux.org> on 23-Dec-2006
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The world of PCI is vast and full of (mostly unpleasant) surprises.
-Since each CPU architecture implements different chip-sets and PCI devices
-have different requirements (erm, "features"), the result is the PCI support
-in the Linux kernel is not as trivial as one would wish. This short paper
-tries to introduce all potential driver authors to Linux APIs for
-PCI device drivers.
-
-A more complete resource is the third edition of "Linux Device Drivers"
-by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
-LDD3 is available for free (under Creative Commons License) from:
-
-	http://lwn.net/Kernel/LDD3/
-
-However, keep in mind that all documents are subject to "bit rot".
-Refer to the source code if things are not working as described here.
-
-Please send questions/comments/patches about Linux PCI API to the
-"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
-
-
-
-0. Structure of PCI drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PCI drivers "discover" PCI devices in a system via pci_register_driver().
-Actually, it's the other way around. When the PCI generic code discovers
-a new device, the driver with a matching "description" will be notified.
-Details on this below.
-
-pci_register_driver() leaves most of the probing for devices to
-the PCI layer and supports online insertion/removal of devices [thus
-supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
-pci_register_driver() call requires passing in a table of function
-pointers and thus dictates the high level structure of a driver.
-
-Once the driver knows about a PCI device and takes ownership, the
-driver generally needs to perform the following initialization:
-
-	Enable the device
-	Request MMIO/IOP resources
-	Set the DMA mask size (for both coherent and streaming DMA)
-	Allocate and initialize shared control data (pci_allocate_coherent())
-	Access device configuration space (if needed)
-	Register IRQ handler (request_irq())
-	Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
-	Enable DMA/processing engines
-
-When done using the device, and perhaps the module needs to be unloaded,
-the driver needs to take the follow steps:
-	Disable the device from generating IRQs
-	Release the IRQ (free_irq())
-	Stop all DMA activity
-	Release DMA buffers (both streaming and coherent)
-	Unregister from other subsystems (e.g. scsi or netdev)
-	Release MMIO/IOP resources
-	Disable the device
-
-Most of these topics are covered in the following sections.
-For the rest look at LDD3 or <linux/pci.h>.
-
-If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
-the PCI functions described below are defined as inline functions either
-completely empty or just returning an appropriate error codes to avoid
-lots of ifdefs in the drivers.
-
-
-
-1. pci_register_driver() call
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI device drivers call pci_register_driver() during their
-initialization with a pointer to a structure describing the driver
-(struct pci_driver):
-
-	field name	Description
-	----------	------------------------------------------------------
-	id_table	Pointer to table of device ID's the driver is
-			interested in.  Most drivers should export this
-			table using MODULE_DEVICE_TABLE(pci,...).
-
-	probe		This probing function gets called (during execution
-			of pci_register_driver() for already existing
-			devices or later if a new device gets inserted) for
-			all PCI devices which match the ID table and are not
-			"owned" by the other drivers yet. This function gets
-			passed a "struct pci_dev *" for each device whose
-			entry in the ID table matches the device. The probe
-			function returns zero when the driver chooses to
-			take "ownership" of the device or an error code
-			(negative number) otherwise.
-			The probe function always gets called from process
-			context, so it can sleep.
-
-	remove		The remove() function gets called whenever a device
-			being handled by this driver is removed (either during
-			deregistration of the driver or when it's manually
-			pulled out of a hot-pluggable slot).
-			The remove function always gets called from process
-			context, so it can sleep.
-
-	suspend		Put device into low power state.
-	suspend_late	Put device into low power state.
-
-	resume_early	Wake device from low power state.
-	resume		Wake device from low power state.
-
-		(Please see Documentation/power/pci.txt for descriptions
-		of PCI Power Management and the related functions.)
-
-	shutdown	Hook into reboot_notifier_list (kernel/sys.c).
-			Intended to stop any idling DMA operations.
-			Useful for enabling wake-on-lan (NIC) or changing
-			the power state of a device before reboot.
-			e.g. drivers/net/e100.c.
-
-	err_handler	See Documentation/PCI/pci-error-recovery.txt
-
-
-The ID table is an array of struct pci_device_id entries ending with an
-all-zero entry.  Definitions with static const are generally preferred.
-
-Each entry consists of:
-
-	vendor,device	Vendor and device ID to match (or PCI_ANY_ID)
-
-	subvendor,	Subsystem vendor and device ID to match (or PCI_ANY_ID)
-	subdevice,
-
-	class		Device class, subclass, and "interface" to match.
-			See Appendix D of the PCI Local Bus Spec or
-			include/linux/pci_ids.h for a full list of classes.
-			Most drivers do not need to specify class/class_mask
-			as vendor/device is normally sufficient.
-
-	class_mask	limit which sub-fields of the class field are compared.
-			See drivers/scsi/sym53c8xx_2/ for example of usage.
-
-	driver_data	Data private to the driver.
-			Most drivers don't need to use driver_data field.
-			Best practice is to use driver_data as an index
-			into a static list of equivalent device types,
-			instead of using it as a pointer.
-
-
-Most drivers only need PCI_DEVICE() or PCI_DEVICE_CLASS() to set up
-a pci_device_id table.
-
-New PCI IDs may be added to a device driver pci_ids table at runtime
-as shown below:
-
-echo "vendor device subvendor subdevice class class_mask driver_data" > \
-/sys/bus/pci/drivers/{driver}/new_id
-
-All fields are passed in as hexadecimal values (no leading 0x).
-The vendor and device fields are mandatory, the others are optional. Users
-need pass only as many optional fields as necessary:
-	o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
-	o class and classmask fields default to 0
-	o driver_data defaults to 0UL.
-
-Note that driver_data must match the value used by any of the pci_device_id
-entries defined in the driver. This makes the driver_data field mandatory
-if all the pci_device_id entries have a non-zero driver_data value.
-
-Once added, the driver probe routine will be invoked for any unclaimed
-PCI devices listed in its (newly updated) pci_ids list.
-
-When the driver exits, it just calls pci_unregister_driver() and the PCI layer
-automatically calls the remove hook for all devices handled by the driver.
-
-
-1.1 "Attributes" for driver functions/data
-
-Please mark the initialization and cleanup functions where appropriate
-(the corresponding macros are defined in <linux/init.h>):
-
-	__init		Initialization code. Thrown away after the driver
-			initializes.
-	__exit		Exit code. Ignored for non-modular drivers.
-
-Tips on when/where to use the above attributes:
-	o The module_init()/module_exit() functions (and all
-	  initialization functions called _only_ from these)
-	  should be marked __init/__exit.
-
-	o Do not mark the struct pci_driver.
- - o Do NOT mark a function if you are not sure which mark to use. - Better to not mark the function than mark the function wrong. - - - -2. How to find PCI devices manually -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PCI drivers should have a really good reason for not using the -pci_register_driver() interface to search for PCI devices. -The main reason PCI devices are controlled by multiple drivers -is because one PCI device implements several different HW services. -E.g. combined serial/parallel port/floppy controller. - -A manual search may be performed using the following constructs: - -Searching by vendor and device ID: - - struct pci_dev *dev = NULL; - while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev)) - configure_device(dev); - -Searching by class ID (iterate in a similar way): - - pci_get_class(CLASS_ID, dev) - -Searching by both vendor/device and subsystem vendor/device ID: - - pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev). - -You can use the constant PCI_ANY_ID as a wildcard replacement for -VENDOR_ID or DEVICE_ID. This allows searching for any device from a -specific vendor, for example. - -These functions are hotplug-safe. They increment the reference count on -the pci_dev that they return. You must eventually (possibly at module unload) -decrement the reference count on these devices by calling pci_dev_put(). - - - -3. Device Initialization Steps -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -As noted in the introduction, most PCI drivers need the following steps -for device initialization: - - Enable the device - Request MMIO/IOP resources - Set the DMA mask size (for both coherent and streaming DMA) - Allocate and initialize shared control data (pci_allocate_coherent()) - Access device configuration space (if needed) - Register IRQ handler (request_irq()) - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip) - Enable DMA/processing engines. - -The driver can access PCI config space registers at any time. -(Well, almost. When running BIST, config space can go away...but -that will just result in a PCI Bus Master Abort and config reads -will return garbage). - - -3.1 Enable the PCI device -~~~~~~~~~~~~~~~~~~~~~~~~~ -Before touching any device registers, the driver needs to enable -the PCI device by calling pci_enable_device(). This will: - o wake up the device if it was in suspended state, - o allocate I/O and memory regions of the device (if BIOS did not), - o allocate an IRQ (if BIOS did not). - -NOTE: pci_enable_device() can fail! Check the return value. - -[ OS BUG: we don't check resource allocations before enabling those - resources. The sequence would make more sense if we called - pci_request_resources() before calling pci_enable_device(). - Currently, the device drivers can't detect the bug when when two - devices have been allocated the same range. This is not a common - problem and unlikely to get fixed soon. - - This has been discussed before but not changed as of 2.6.19: - http://lkml.org/lkml/2006/3/2/194 -] - -pci_set_master() will enable DMA by setting the bus master bit -in the PCI_COMMAND register. It also fixes the latency timer value if -it's set to something bogus by the BIOS. pci_clear_master() will -disable DMA by clearing the bus master bit. - -If the PCI device can use the PCI Memory-Write-Invalidate transaction, -call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval -and also ensures that the cache line size register is set correctly. 
-Check the return value of pci_set_mwi() as not all architectures -or chip-sets may support Memory-Write-Invalidate. Alternatively, -if Mem-Wr-Inval would be nice to have but is not required, call -pci_try_set_mwi() to have the system do its best effort at enabling -Mem-Wr-Inval. - - -3.2 Request MMIO/IOP resources -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Memory (MMIO), and I/O port addresses should NOT be read directly -from the PCI device config space. Use the values in the pci_dev structure -as the PCI "bus address" might have been remapped to a "host physical" -address by the arch/chip-set specific kernel support. - -See Documentation/io-mapping.txt for how to access device registers -or device memory. - -The device driver needs to call pci_request_region() to verify -no other device is already using the same address resource. -Conversely, drivers should call pci_release_region() AFTER -calling pci_disable_device(). -The idea is to prevent two devices colliding on the same address range. - -[ See OS BUG comment above. Currently (2.6.19), The driver can only - determine MMIO and IO Port resource availability _after_ calling - pci_enable_device(). ] - -Generic flavors of pci_request_region() are request_mem_region() -(for MMIO ranges) and request_region() (for IO Port ranges). -Use these for address resources that are not described by "normal" PCI -BARs. - -Also see pci_request_selected_regions() below. - - -3.3 Set the DMA mask size -~~~~~~~~~~~~~~~~~~~~~~~~~ -[ If anything below doesn't make sense, please refer to - Documentation/DMA-API.txt. This section is just a reminder that - drivers need to indicate DMA capabilities of the device and is not - an authoritative source for DMA interfaces. ] - -While all drivers should explicitly indicate the DMA capability -(e.g. 32 or 64 bit) of the PCI bus master, devices with more than -32-bit bus master capability for streaming data need the driver -to "register" this capability by calling pci_set_dma_mask() with -appropriate parameters. In general this allows more efficient DMA -on systems where System RAM exists above 4G _physical_ address. - -Drivers for all PCI-X and PCIe compliant devices must call -pci_set_dma_mask() as they are 64-bit DMA devices. - -Similarly, drivers must also "register" this capability if the device -can directly address "consistent memory" in System RAM above 4G physical -address by calling pci_set_consistent_dma_mask(). -Again, this includes drivers for all PCI-X and PCIe compliant devices. -Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are -64-bit DMA capable for payload ("streaming") data but not control -("consistent") data. - - -3.4 Setup shared control data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared) -memory. See Documentation/DMA-API.txt for a full description of -the DMA APIs. This section is just a reminder that it needs to be done -before enabling DMA on the device. - - -3.5 Initialize device registers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Some drivers will need specific "capability" fields programmed -or other "vendor specific" register initialized or reset. -E.g. clearing pending interrupts. - - -3.6 Register IRQ handler -~~~~~~~~~~~~~~~~~~~~~~~~ -While calling request_irq() is the last step described here, -this is often just another intermediate step to initialize a device. -This step can often be deferred until the device is opened for use. 
- -All interrupt handlers for IRQ lines should be registered with IRQF_SHARED -and use the devid to map IRQs to devices (remember that all PCI IRQ lines -can be shared). - -request_irq() will associate an interrupt handler and device handle -with an interrupt number. Historically interrupt numbers represent -IRQ lines which run from the PCI device to the Interrupt controller. -With MSI and MSI-X (more below) the interrupt number is a CPU "vector". - -request_irq() also enables the interrupt. Make sure the device is -quiesced and does not have any interrupts pending before registering -the interrupt handler. - -MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts" -which deliver interrupts to the CPU via a DMA write to a Local APIC. -The fundamental difference between MSI and MSI-X is how multiple -"vectors" get allocated. MSI requires contiguous blocks of vectors -while MSI-X can allocate several individual ones. - -MSI capability can be enabled by calling pci_alloc_irq_vectors() with the -PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This -causes the PCI support to program CPU vector data into the PCI device -capability registers. Many architectures, chip-sets, or BIOSes do NOT -support MSI or MSI-X and a call to pci_alloc_irq_vectors with just -the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always -specify PCI_IRQ_LEGACY as well. - -Drivers that have different interrupt handlers for MSI/MSI-X and -legacy INTx should chose the right one based on the msi_enabled -and msix_enabled flags in the pci_dev structure after calling -pci_alloc_irq_vectors. - -There are (at least) two really good reasons for using MSI: -1) MSI is an exclusive interrupt vector by definition. - This means the interrupt handler doesn't have to verify - its device caused the interrupt. - -2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed - to be visible to the host CPU(s) when the MSI is delivered. This - is important for both data coherency and avoiding stale control data. - This guarantee allows the driver to omit MMIO reads to flush - the DMA stream. - -See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples -of MSI/MSI-X usage. - - - -4. PCI device shutdown -~~~~~~~~~~~~~~~~~~~~~~~ - -When a PCI device driver is being unloaded, most of the following -steps need to be performed: - - Disable the device from generating IRQs - Release the IRQ (free_irq()) - Stop all DMA activity - Release DMA buffers (both streaming and consistent) - Unregister from other subsystems (e.g. scsi or netdev) - Disable device from responding to MMIO/IO Port addresses - Release MMIO/IO Port resource(s) - - -4.1 Stop IRQs on the device -~~~~~~~~~~~~~~~~~~~~~~~~~~~ -How to do this is chip/device specific. If it's not done, it opens -the possibility of a "screaming interrupt" if (and only if) -the IRQ is shared with another device. - -When the shared IRQ handler is "unhooked", the remaining devices -using the same IRQ line will still need the IRQ enabled. Thus if the -"unhooked" device asserts IRQ line, the system will respond assuming -it was one of the remaining devices asserted the IRQ line. Since none -of the other devices will handle the IRQ, the system will "hang" until -it decides the IRQ isn't going to get handled and masks the IRQ (100,000 -iterations later). Once the shared IRQ is masked, the remaining devices -will stop functioning properly. Not a nice situation. - -This is another reason to use MSI or MSI-X if it's available. 
-MSI and MSI-X are defined to be exclusive interrupts and thus
-are not susceptible to the "screaming interrupt" problem.
-
-
-4.2 Release the IRQ
-~~~~~~~~~~~~~~~~~~~
-Once the device is quiesced (no more IRQs), one can call free_irq().
-This function will return control once any pending IRQs are handled,
-"unhook" the drivers IRQ handler from that IRQ, and finally release
-the IRQ if no one else is using it.
-
-
-4.3 Stop all DMA activity
-~~~~~~~~~~~~~~~~~~~~~~~~~
-It's extremely important to stop all DMA operations BEFORE attempting
-to deallocate DMA control data. Failure to do so can result in memory
-corruption, hangs, and on some chip-sets a hard crash.
-
-Stopping DMA after stopping the IRQs can avoid races where the
-IRQ handler might restart DMA engines.
-
-While this step sounds obvious and trivial, several "mature" drivers
-didn't get this step right in the past.
-
-
-4.4 Release DMA buffers
-~~~~~~~~~~~~~~~~~~~~~~~
-Once DMA is stopped, clean up streaming DMA first.
-I.e. unmap data buffers and return buffers to "upstream"
-owners if there is one.
-
-Then clean up "consistent" buffers which contain the control data.
-
-See Documentation/DMA-API.txt for details on unmapping interfaces.
-
-
-4.5 Unregister from other subsystems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Most low level PCI device drivers support some other subsystem
-like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
-driver isn't losing resources from that other subsystem.
-If this happens, typically the symptom is an Oops (panic) when
-the subsystem attempts to call into a driver that has been unloaded.
-
-
-4.6 Disable Device from responding to MMIO/IO Port addresses
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-io_unmap() MMIO or IO Port resources and then call pci_disable_device().
-This is the symmetric opposite of pci_enable_device().
-Do not access device registers after calling pci_disable_device().
-
-
-4.7 Release MMIO/IO Port Resource(s)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Call pci_release_region() to mark the MMIO or IO Port range as available.
-Failure to do so usually results in the inability to reload the driver.
-
-
-
-5. How to access PCI config space
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You can use pci_(read|write)_config_(byte|word|dword) to access the config
-space of a device represented by struct pci_dev *. All these functions return 0
-when successful or an error code (PCIBIOS_...) which can be translated to a text
-string by pcibios_strerror. Most drivers expect that accesses to valid PCI
-devices don't fail.
-
-If you don't have a struct pci_dev available, you can call
-pci_bus_(read|write)_config_(byte|word|dword) to access a given device
-and function on that bus.
-
-If you access fields in the standard portion of the config header, please
-use symbolic names of locations and bits declared in <linux/pci_regs.h>.
-
-If you need to access Extended PCI Capability registers, just call
-pci_find_capability() for the particular capability and it will find the
-corresponding register block for you.
-
-
-
-6. Other interesting functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-pci_get_domain_bus_and_slot()	Find pci_dev corresponding to given domain,
-				bus and slot and number. If the device is
-				found, its reference count is increased.
-pci_set_power_state()		Set PCI Power Management state (0=D0 ... 3=D3)
-pci_find_capability()		Find specified capability in device's capability
-				list.
-pci_resource_start() Returns bus start address for a given PCI region -pci_resource_end() Returns bus end address for a given PCI region -pci_resource_len() Returns the byte length of a PCI region -pci_set_drvdata() Set private driver data pointer for a pci_dev -pci_get_drvdata() Return private driver data pointer for a pci_dev -pci_set_mwi() Enable Memory-Write-Invalidate transactions. -pci_clear_mwi() Disable Memory-Write-Invalidate transactions. - - - -7. Miscellaneous hints -~~~~~~~~~~~~~~~~~~~~~~ - -When displaying PCI device names to the user (for example when a driver wants -to tell the user what card has it found), please use pci_name(pci_dev). - -Always refer to the PCI devices by a pointer to the pci_dev structure. -All PCI layer functions use this identification and it's the only -reasonable one. Don't use bus/slot/function numbers except for very -special purposes -- on systems with multiple primary buses their semantics -can be pretty complex. - -Don't try to turn on Fast Back to Back writes in your driver. All devices -on the bus need to be capable of doing it, so this is something which needs -to be handled by platform and generic code, not individual drivers. - - - -8. Vendor and device identifications -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Do not add new device or vendor IDs to include/linux/pci_ids.h unless they -are shared across multiple drivers. You can add private definitions in -your driver if they're helpful, or just use plain hex constants. - -The device IDs are arbitrary hex numbers (vendor controlled) and normally used -only in a single location, the pci_device_id table. - -Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/. -There are mirrors of the pci.ids file at http://pciids.sourceforge.net/ -and https://github.com/pciutils/pciids. - - - -9. Obsolete functions -~~~~~~~~~~~~~~~~~~~~~ - -There are several functions which you might come across when trying to -port an old driver to the new PCI interface. They are no longer present -in the kernel as they aren't compatible with hotplug or PCI domains or -having sane locking. - -pci_find_device() Superseded by pci_get_device() -pci_find_subsys() Superseded by pci_get_subsys() -pci_find_slot() Superseded by pci_get_domain_bus_and_slot() -pci_get_slot() Superseded by pci_get_domain_bus_and_slot() - - -The alternative is the traditional PCI device driver that walks PCI -device lists. This is still possible but discouraged. - - - -10. MMIO Space and "Write Posting" -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Converting a driver from using I/O Port space to using MMIO space -often requires some additional changes. Specifically, "write posting" -needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2) -already do this. I/O Port space guarantees write transactions reach the PCI -device before the CPU can continue. Writes to MMIO space allow the CPU -to continue before the transaction reaches the PCI device. HW weenies -call this "Write Posting" because the write completion is "posted" to -the CPU before the transaction has reached its destination. - -Thus, timing sensitive code should add readl() where the CPU is -expected to wait before doing other work. 
The classic "bit banging" -sequence works fine for I/O Port space: - - for (i = 8; --i; val >>= 1) { - outb(val & 1, ioport_reg); /* write bit */ - udelay(10); - } - -The same sequence for MMIO space should be: - - for (i = 8; --i; val >>= 1) { - writeb(val & 1, mmio_reg); /* write bit */ - readb(safe_mmio_reg); /* flush posted write */ - udelay(10); - } - -It is important that "safe_mmio_reg" not have any side effects that -interferes with the correct operation of the device. - -Another case to watch out for is when resetting a PCI device. Use PCI -Configuration space reads to flush the writel(). This will gracefully -handle the PCI master abort on all platforms if the PCI device is -expected to not respond to a readl(). Most x86 platforms will allow -MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage -(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail"). - diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 448621c32e4d..664c0fb1d53d 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -16,6 +16,25 @@ typedef unsigned long kernel_ulong_t; #define PCI_ANY_ID (~0) +/** + * struct pci_device_id - PCI device ID structure + * @vendor: Vendor ID to match (or PCI_ANY_ID) + * @device: Device ID to match (or PCI_ANY_ID) + * @subvendor: Subsystem vendor ID to match (or PCI_ANY_ID) + * @subdevice: Subsystem device ID to match (or PCI_ANY_ID) + * @class: Device class, subclass, and "interface" to match. + * See Appendix D of the PCI Local Bus Spec or + * include/linux/pci_ids.h for a full list of classes. + * Most drivers do not need to specify class/class_mask + * as vendor/device is normally sufficient. + * @class_mask: Limit which sub-fields of the class field are compared. + * See drivers/scsi/sym53c8xx_2/ for example of usage. + * @driver_data: Data private to the driver. + * Most drivers don't need to use driver_data field. + * Best practice is to use driver_data as an index + * into a static list of equivalent device types, + * instead of using it as a pointer. + */ struct pci_device_id { __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ @@ -257,17 +276,17 @@ struct pcmcia_device_id { __u16 match_flags; __u16 manf_id; - __u16 card_id; + __u16 card_id; - __u8 func_id; + __u8 func_id; /* for real multi-function devices */ - __u8 function; + __u8 function; /* for pseudo multi-function devices */ - __u8 device_no; + __u8 device_no; - __u32 prod_id_hash[4]; + __u32 prod_id_hash[4]; /* not matched against in kernelspace */ const char * prod_id[4]; diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..b74b2a4e6df2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -151,6 +151,8 @@ static inline const char *pci_power_name(pci_power_t state) #define PCI_PM_BUS_WAIT 50 /** + * typedef pci_channel_state_t + * * The pci_channel state describes connectivity between the CPU and * the PCI device. If some PCI bus between here and the PCI device * has crashed or locked up, this info is reflected here. @@ -775,6 +777,50 @@ struct pci_error_handlers { struct module; + +/** + * struct pci_driver - PCI driver structure + * @node: List of driver structures. + * @name: Driver name. + * @id_table: Pointer to table of device IDs the driver is + * interested in. Most drivers should export this + * table using MODULE_DEVICE_TABLE(pci,...). 
+ * @probe:		This probing function gets called (during execution
+ *			of pci_register_driver() for already existing
+ *			devices or later if a new device gets inserted) for
+ *			all PCI devices which match the ID table and are not
+ *			"owned" by the other drivers yet. This function gets
+ *			passed a "struct pci_dev \*" for each device whose
+ *			entry in the ID table matches the device. The probe
+ *			function returns zero when the driver chooses to
+ *			take "ownership" of the device or an error code
+ *			(negative number) otherwise.
+ *			The probe function always gets called from process
+ *			context, so it can sleep.
+ * @remove:		The remove() function gets called whenever a device
+ *			being handled by this driver is removed (either during
+ *			deregistration of the driver or when it's manually
+ *			pulled out of a hot-pluggable slot).
+ *			The remove function always gets called from process
+ *			context, so it can sleep.
+ * @suspend:		Put device into low power state.
+ * @suspend_late:	Put device into low power state.
+ * @resume_early:	Wake device from low power state.
+ * @resume:		Wake device from low power state.
+ *			(Please see Documentation/power/pci.txt for descriptions
+ *			of PCI Power Management and the related functions.)
+ * @shutdown:		Hook into reboot_notifier_list (kernel/sys.c).
+ *			Intended to stop any idling DMA operations.
+ *			Useful for enabling wake-on-lan (NIC) or changing
+ *			the power state of a device before reboot.
+ *			e.g. drivers/net/e100.c.
+ * @sriov_configure:	Optional driver callback to allow configuration of
+ *			number of VFs to enable via sysfs "sriov_numvfs" file.
+ * @err_handler:	See Documentation/PCI/pci-error-recovery.rst
+ * @groups:		Sysfs attribute groups.
+ * @driver:		Driver model structure.
+ * @dynids:		List of dynamically added device IDs.
+ */
 struct pci_driver {
 	struct list_head	node;
 	const char		*name;
@@ -2206,7 +2252,7 @@ static inline u8 pci_vpd_srdt_tag(const u8 *srdt)
 
 /**
  * pci_vpd_info_field_size - Extracts the information field length
- * @lrdt: Pointer to the beginning of an information field header
+ * @info_field: Pointer to the beginning of an information field header
  *
  * Returns the extracted information field length.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 87e5e6dab6c2a21fab2620f37786276d202e2ce0 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 14 May 2019 16:02:22 -0600
Subject: uio: make import_iovec()/compat_import_iovec() return bytes on success

Currently these functions return < 0 on error, and 0 for success.
Change that so that we return < 0 on error, and the number of bytes
on success.

Some callers already treat the return value that way, others need a
slight tweak.
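
For example, a read-side caller now does (sketch only; variable names as
used in the diff below):

	ret = import_iovec(READ, uvec, nr_segs, UIO_FASTIOV, &iov, &iter);
	if (ret < 0)
		return ret;
	/* ret is the number of bytes now described by iter */
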
Signed-off-by: Jens Axboe --- fs/aio.c | 9 +++++---- fs/io_uring.c | 16 ++++++++-------- fs/splice.c | 8 ++++---- include/linux/uio.h | 4 ++-- lib/iov_iter.c | 15 ++++++++------- net/compat.c | 3 ++- net/socket.c | 3 ++- 7 files changed, 31 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/aio.c b/fs/aio.c index 3490d1fa0e16..41824c710b36 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1479,8 +1479,9 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) return 0; } -static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, - bool vectored, bool compat, struct iov_iter *iter) +static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, + struct iovec **iovec, bool vectored, bool compat, + struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; size_t len = iocb->aio_nbytes; @@ -1537,7 +1538,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, return -EINVAL; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); - if (ret) + if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) @@ -1565,7 +1566,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, return -EINVAL; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); - if (ret) + if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { diff --git a/fs/io_uring.c b/fs/io_uring.c index 0fbb486a320e..23e08c10f486 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1003,9 +1003,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, return 0; } -static int io_import_iovec(struct io_ring_ctx *ctx, int rw, - const struct sqe_submit *s, struct iovec **iovec, - struct iov_iter *iter) +static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, + const struct sqe_submit *s, struct iovec **iovec, + struct iov_iter *iter) { const struct io_uring_sqe *sqe = s->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -1023,7 +1023,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw, opcode = READ_ONCE(sqe->opcode); if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - int ret = io_import_fixed(ctx, rw, sqe, iter); + ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); *iovec = NULL; return ret; } @@ -1089,7 +1089,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, struct iov_iter iter; struct file *file; size_t iov_count; - int ret; + ssize_t ret; ret = io_prep_rw(req, s, force_nonblock); if (ret) @@ -1102,7 +1102,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); - if (ret) + if (ret < 0) return ret; iov_count = iov_iter_count(&iter); @@ -1136,7 +1136,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, struct iov_iter iter; struct file *file; size_t iov_count; - int ret; + ssize_t ret; ret = io_prep_rw(req, s, force_nonblock); if (ret) @@ -1149,7 +1149,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); - if (ret) + if (ret < 0) return ret; iov_count = iov_iter_count(&iter); diff --git a/fs/splice.c b/fs/splice.c index 14cb602d9a2f..98412721f056 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1356,7 +1356,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, struct iovec iovstack[UIO_FASTIOV]; struct iovec 
*iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1367,7 +1367,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 
 	error = import_iovec(type, uiov, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
@@ -1382,7 +1382,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1393,7 +1393,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 	error = compat_import_iovec(type, iov32, nr_segs,
 				    ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2d0131ad4604..a61ceb6575ab 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -279,13 +279,13 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
 
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 
 #ifdef CONFIG_COMPAT
 struct compat_iovec;
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
+ssize_t compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f99c41d4eb54..f1e0569b4539 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1634,9 +1634,9 @@ EXPORT_SYMBOL(dup_iter);
  * on-stack array was used or not (and regardless of whether this function
  * returns an error or not).
  *
- * Return: 0 on success or negative error code on error.
+ * Return: Negative error code on error, bytes imported on success
  */
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i)
 {
@@ -1652,16 +1652,17 @@ int import_iovec(int type, const struct iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 EXPORT_SYMBOL(import_iovec);
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
-		 unsigned nr_segs, unsigned fast_segs,
-		 struct iovec **iov, struct iov_iter *i)
+ssize_t compat_import_iovec(int type,
+		 const struct compat_iovec __user * uvector,
+		 unsigned nr_segs, unsigned fast_segs,
+		 struct iovec **iov, struct iov_iter *i)
 {
 	ssize_t n;
 	struct iovec *p;
@@ -1675,7 +1676,7 @@ int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 #endif
diff --git a/net/compat.c b/net/compat.c
index 3f9ce609397f..0f7ded26059e 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -80,9 +80,10 @@ int get_compat_msghdr(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return compat_import_iovec(save_addr ? READ : WRITE,
+	err = compat_import_iovec(save_addr ? READ : WRITE,
READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); + return err < 0 ? err : 0; } /* Bleech... */ diff --git a/net/socket.c b/net/socket.c index 72372dc5dd70..bffec466b4f1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2208,9 +2208,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_iocb = NULL; - return import_iovec(save_addr ? READ : WRITE, + err = import_iovec(save_addr ? READ : WRITE, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); + return err < 0 ? err : 0; } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, -- cgit v1.2.3-59-g8ed1b From 5213d7efc8ec26ed8938dce75427eff9275a62d9 Mon Sep 17 00:00:00 2001 From: Ruslan Babayev Date: Tue, 28 May 2019 16:02:32 -0700 Subject: i2c: acpi: export i2c_acpi_find_adapter_by_handle This allows drivers to look up I2C adapters on ACPI-based systems, similar to of_get_i2c_adapter_by_node() on DT-based systems. Signed-off-by: Ruslan Babayev Reviewed-by: Andrew Lunn Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-acpi.c | 3 ++- include/linux/i2c.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index 272800692088..964687534754 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -337,7 +337,7 @@ static int i2c_acpi_find_match_device(struct device *dev, void *data) return ACPI_COMPANION(dev) == data; } -static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) +struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) { struct device *dev; @@ -345,6 +345,7 @@ static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) i2c_acpi_find_match_adapter); return dev ?
i2c_verify_adapter(dev) : NULL; } +EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle); static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev) { diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 1308126fc384..e982b8913b73 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -14,6 +14,7 @@ #ifndef _LINUX_I2C_H #define _LINUX_I2C_H +#include <linux/acpi.h> /* for acpi_handle */ #include <linux/mod_devicetable.h> #include <linux/device.h> /* for struct device */ #include <linux/sched.h> /* for completion */ @@ -981,6 +982,7 @@ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, u32 i2c_acpi_find_bus_speed(struct device *dev); struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, struct i2c_board_info *info); +struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle); #else static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c) @@ -996,6 +998,10 @@ static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, { return NULL; } +static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) +{ + return NULL; +} #endif /* CONFIG_ACPI */ #endif /* _LINUX_I2C_H */ -- cgit v1.2.3-59-g8ed1b From a09db883e5d938b525a86a4630fc04f98ff1063d Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Tue, 4 Jun 2019 16:47:02 +0530 Subject: drm: Fix docbook warnings in hdr metadata helper structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following warnings: ./include/drm/drm_mode_config.h:841: warning: Incorrect use of kernel-doc format: * hdr_output_metadata_property: Connector property containing hdr ./include/drm/drm_mode_config.h:918: warning: Function parameter or member 'hdr_output_metadata_property' not described in 'drm_mode_config' ./include/drm/drm_connector.h:1251: warning: Function parameter or member 'hdr_output_metadata' not described in 'drm_connector' ./include/drm/drm_connector.h:1251: warning: Function parameter or member 'hdr_sink_metadata' not described in 'drm_connector' Also adds property documentation for the HDR Metadata connector property in the connector property creation function. v2: Fixed Sean Paul's review comments. v3: Fixed Daniel Vetter's review comments, added the UAPI structure definition section in kernel docs. v4: Fixed Daniel Vetter's review comments. v5: Added structure member references as per Daniel's suggestion. Cc: Shashank Sharma Cc: Ville Syrjälä Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Sean Paul Cc: David Airlie Cc: Daniel Vetter Cc: Bartlomiej Zolnierkiewicz Cc: "Ville Syrjälä" Cc: Hans Verkuil Cc: dri-devel@lists.freedesktop.org Cc: linux-fbdev@vger.kernel.org Reviewed-by: Sean Paul (v1) Signed-off-by: Uma Shankar [danvet: Fix up markup: () for functions, & for structs. Style guide also recommends to prepend struct for structures.]
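For context, the userspace flow described in the new property documentation can be exercised roughly as follows. This is a minimal libdrm-based sketch, assuming a libdrm new enough to ship the struct hdr_output_metadata UAPI added by this patch; fd, connector_id and hdr_prop_id are hypothetical values a real compositor would look up first via drmModeObjectGetProperties():

#include <stdint.h>
#include <xf86drm.h>
#include <xf86drmMode.h>	/* pulls in the drm_mode.h UAPI structs */

/* Push static HDR metadata to a connector via the
 * HDR_OUTPUT_METADATA blob property. */
static int set_hdr_metadata(int fd, uint32_t connector_id, uint32_t hdr_prop_id)
{
	struct hdr_output_metadata meta = { 0 };
	drmModeAtomicReq *req;
	uint32_t blob_id = 0;
	int ret;

	meta.metadata_type = 0;				/* Static Metadata Type 1 */
	meta.hdmi_metadata_type1.metadata_type = 0;
	meta.hdmi_metadata_type1.eotf = 2;		/* SMPTE ST 2084 (PQ), per CTA-861-G */
	meta.hdmi_metadata_type1.max_cll = 1000;	/* 1000 cd/m2 */
	meta.hdmi_metadata_type1.max_fall = 400;	/* 400 cd/m2 */

	/* Wrap the metadata in a blob and attach it in an atomic commit. */
	ret = drmModeCreatePropertyBlob(fd, &meta, sizeof(meta), &blob_id);
	if (ret)
		return ret;

	req = drmModeAtomicAlloc();
	drmModeAtomicAddProperty(req, connector_id, hdr_prop_id, blob_id);
	ret = drmModeAtomicCommit(fd, req, DRM_MODE_ATOMIC_ALLOW_MODESET, NULL);
	drmModeAtomicFree(req);
	return ret;
}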
Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/1559647022-7336-1-git-send-email-uma.shankar@intel.com --- drivers/gpu/drm/drm_connector.c | 40 ++++++++++++++++++++++ include/drm/drm_connector.h | 1 + include/drm/drm_mode_config.h | 4 +-- include/linux/hdmi.h | 12 +++++++ include/uapi/drm/drm_mode.h | 74 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 128 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index c9ac8b9e83ea..e17586aaa80f 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -956,6 +956,46 @@ static const struct drm_prop_enum_list hdmi_colorspaces[] = { * is no longer protected and userspace should take appropriate action * (whatever that might be). * + * HDR_OUTPUT_METADATA: + * Connector property to enable userspace to send HDR Metadata to + * driver. This metadata is based on the composition and blending + * policies decided by user, taking into account the hardware and + * sink capabilities. The driver gets this metadata and creates a + * Dynamic Range and Mastering Infoframe (DRM) in case of HDMI, + * SDP packet (Non-audio INFOFRAME SDP v1.3) for DP. This is then + * sent to sink. This notifies the sink of the upcoming frame's Color + * Encoding and Luminance parameters. + * + * Userspace first need to detect the HDR capabilities of sink by + * reading and parsing the EDID. Details of HDR metadata for HDMI + * are added in CTA 861.G spec. For DP , its defined in VESA DP + * Standard v1.4. It needs to then get the metadata information + * of the video/game/app content which are encoded in HDR (basically + * using HDR transfer functions). With this information it needs to + * decide on a blending policy and compose the relevant + * layers/overlays into a common format. Once this blending is done, + * userspace will be aware of the metadata of the composed frame to + * be send to sink. It then uses this property to communicate this + * metadata to driver which then make a Infoframe packet and sends + * to sink based on the type of encoder connected. + * + * Userspace will be responsible to do Tone mapping operation in case: + * - Some layers are HDR and others are SDR + * - HDR layers luminance is not same as sink + * It will even need to do colorspace conversion and get all layers + * to one common colorspace for blending. It can use either GL, Media + * or display engine to get this done based on the capabilties of the + * associated hardware. + * + * Driver expects metadata to be put in &struct hdr_output_metadata + * structure from userspace. This is received as blob and stored in + * &drm_connector_state.hdr_output_metadata. It parses EDID and saves the + * sink metadata in &struct hdr_sink_metadata, as + * &drm_connector.hdr_sink_metadata. Driver uses + * drm_hdmi_infoframe_set_hdr_metadata() helper to set the HDR metadata, + * hdmi_drm_infoframe_pack() to pack the infoframe as per spec, in case of + * HDMI encoder. + * * max bpc: * This range property is used by userspace to limit the bit depth. 
When * used the driver would limit the bpc in accordance with the valid range diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index 547656173c74..47e749b74e5f 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -1244,6 +1244,7 @@ struct drm_connector { */ struct llist_node free_node; + /** @hdr_sink_metadata: HDR Metadata Information read from sink */ struct hdr_sink_metadata hdr_sink_metadata; }; diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h index 4f88cc972407..759d462d028b 100644 --- a/include/drm/drm_mode_config.h +++ b/include/drm/drm_mode_config.h @@ -837,8 +837,8 @@ struct drm_mode_config { struct drm_property *writeback_out_fence_ptr_property; /** - * hdr_output_metadata_property: Connector property containing hdr - * metatda. This will be provided by userspace compositors based + * @hdr_output_metadata_property: Connector property containing hdr + * metatada. This will be provided by userspace compositors based * on HDR content */ struct drm_property *hdr_output_metadata_property; diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index ee55ba589cdc..9918a6c910c5 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -367,8 +367,19 @@ struct hdr_static_metadata { __u16 min_cll; }; +/** + * struct hdr_sink_metadata - HDR sink metadata + * + * Metadata Information read from Sink's EDID + */ struct hdr_sink_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ __u32 metadata_type; + /** + * @hdmi_type1: HDR Metadata Infoframe. + */ union { struct hdr_static_metadata hdmi_type1; }; @@ -398,6 +409,7 @@ union hdmi_vendor_any_infoframe { * @spd: spd infoframe * @vendor: union of all vendor infoframes * @audio: audio infoframe + * @drm: Dynamic Range and Mastering infoframe * * This is used by the generic pack function. This works since all infoframes * have the same header which also indicates which type of infoframe should be diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 19b5cf368cff..5ab331e5dc23 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -33,6 +33,15 @@ extern "C" { #endif +/** + * DOC: overview + * + * DRM exposes many UAPI and structure definition to have a consistent + * and standardized interface with user. + * Userspace can refer to these structure definitions and UAPI formats + * to communicate to driver + */ + #define DRM_CONNECTOR_NAME_LEN 32 #define DRM_DISPLAY_MODE_LEN 32 #define DRM_PROP_NAME_LEN 32 @@ -630,24 +639,87 @@ struct drm_color_lut { __u16 reserved; }; -/* HDR Metadata Infoframe as per 861.G spec */ +/** + * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. + * + * HDR Metadata Infoframe as per CTA 861.G spec. This is expected + * to match exactly with the spec. + * + * Userspace is expected to pass the metadata information as per + * the format described in this structure. + */ struct hdr_metadata_infoframe { + /** + * @eotf: Electro-Optical Transfer Function (EOTF) + * used in the stream. + */ __u8 eotf; + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ __u8 metadata_type; + /** + * @display_primaries: Color Primaries of the Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @display_primaries.x: X cordinate of color primary. + * @display_primaries.y: Y cordinate of color primary. 
+ */ struct { __u16 x, y; } display_primaries[3]; + /** + * @white_point: White Point of Colorspace Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @white_point.x: X cordinate of whitepoint of color primary. + * @white_point.y: Y cordinate of whitepoint of color primary. + */ struct { __u16 x, y; } white_point; + /** + * @max_display_mastering_luminance: Max Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ __u16 max_display_mastering_luminance; + /** + * @min_display_mastering_luminance: Min Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of + * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF + * represents 6.5535 cd/m2. + */ __u16 min_display_mastering_luminance; + /** + * @max_cll: Max Content Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ __u16 max_cll; + /** + * @max_fall: Max Frame Average Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ __u16 max_fall; }; +/** + * struct hdr_output_metadata - HDR output metadata + * + * Metadata Information to be passed from userspace + */ struct hdr_output_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ __u32 metadata_type; + /** + * @hdmi_metadata_type1: HDR Metadata Infoframe. + */ union { struct hdr_metadata_infoframe hdmi_metadata_type1; }; -- cgit v1.2.3-59-g8ed1b From 1e390478cfb527e34c9ab89ba57212cb05c33c51 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 5 Jun 2019 10:46:05 +0200 Subject: gpu: host1x: Increase maximum DMA segment size Recent versions of the DMA API debug code have started to warn about violations of the maximum DMA segment size. This is because the segment size defaults to 64 KiB, which can easily be exceeded in large buffer allocations such as used in DRM/KMS for framebuffers. Technically the Tegra SMMU and ARM SMMU don't have a maximum segment size (they map individual pages irrespective of whether they are contiguous or not), so the choice of 4 MiB is a bit arbitrary here. The maximum segment size is a 32-bit unsigned integer, though, so we can't set it to the correct maximum size, which would be the size of the aperture. 
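The diff below applies a common driver-side pattern: back dev->dma_parms with storage owned by the driver, then raise the maximum segment size. As a standalone sketch (struct my_device and my_device_setup_dma() are hypothetical stand-ins for struct host1x_device and host1x_device_add()):

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/sizes.h>

struct my_device {
	struct device dev;
	struct device_dma_parameters dma_parms;
};

static void my_device_setup_dma(struct my_device *mdev)
{
	/* dma_set_max_seg_size() stores the limit in dev->dma_parms,
	 * so that pointer must be populated first or the call fails. */
	mdev->dev.dma_parms = &mdev->dma_parms;
	dma_set_max_seg_size(&mdev->dev, SZ_4M);
}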
Signed-off-by: Thierry Reding --- drivers/gpu/host1x/bus.c | 3 +++ include/linux/host1x.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index 103fffc1904b..c9a637d9417e 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -425,6 +425,9 @@ static int host1x_device_add(struct host1x *host1x, of_dma_configure(&device->dev, host1x->dev->of_node, true); + device->dev.dma_parms = &device->dma_parms; + dma_set_max_seg_size(&device->dev, SZ_4M); + err = host1x_device_parse_dt(device, driver); if (err < 0) { kfree(device); diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 89110d896d72..aef6e2f73802 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -310,6 +310,8 @@ struct host1x_device { struct list_head clients; bool registered; + + struct device_dma_parameters dma_parms; }; static inline struct host1x_device *to_host1x_device(struct device *dev) -- cgit v1.2.3-59-g8ed1b From 2076e5c0451ca943ff8ecc6def7239c84c77e070 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 6 May 2019 16:29:38 -0700 Subject: mm/hmm: update HMM documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the HMM documentation to reflect the latest API and make a few minor wording changes. Cc: John Hubbard Cc: Ira Weiny Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Cc: Andrew Morton Signed-off-by: Ralph Campbell Reviewed-by: Jérôme Glisse Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 141 +++++++++++++++++++++++++---------------------- include/linux/hmm.h | 7 ++- 2 files changed, 78 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 7cdf7282e022..7b6eeda5a7c0 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -10,7 +10,7 @@ of this being specialized struct page for such memory (see sections 5 to 7 of this document). HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., -allowing a device to transparently access program address coherently with +allowing a device to transparently access program addresses coherently with the CPU meaning that any valid pointer on the CPU is also a valid pointer for the device. This is becoming mandatory to simplify the use of advanced heterogeneous computing where GPU, DSP, or FPGA are used to perform various @@ -22,8 +22,8 @@ expose the hardware limitations that are inherent to many platforms. The third section gives an overview of the HMM design. The fourth section explains how CPU page-table mirroring works and the purpose of HMM in this context. The fifth section deals with how device memory is represented inside the kernel. -Finally, the last section presents a new migration helper that allows lever- -aging the device DMA engine. +Finally, the last section presents a new migration helper that allows +leveraging the device DMA engine. .. contents:: :local: @@ -39,20 +39,20 @@ address space. I use shared address space to refer to the opposite situation: i.e., one in which any application memory region can be used by a device transparently. -Split address space happens because device can only access memory allocated -through device specific API. This implies that all memory objects in a program +Split address space happens because devices can only access memory allocated +through a device specific API. 
This implies that all memory objects in a program are not equal from the device point of view which complicates large programs that rely on a wide set of libraries. -Concretely this means that code that wants to leverage devices like GPUs needs -to copy object between generically allocated memory (malloc, mmap private, mmap +Concretely, this means that code that wants to leverage devices like GPUs needs +to copy objects between generically allocated memory (malloc, mmap private, mmap share) and memory allocated through the device driver API (this still ends up with an mmap but of the device file). For flat data sets (array, grid, image, ...) this isn't too hard to achieve but -complex data sets (list, tree, ...) are hard to get right. Duplicating a +for complex data sets (list, tree, ...) it's hard to get right. Duplicating a complex data set needs to re-map all the pointer relations between each of its -elements. This is error prone and program gets harder to debug because of the +elements. This is error prone and programs get harder to debug because of the duplicate data set and addresses. Split address space also means that libraries cannot transparently use data @@ -77,12 +77,12 @@ I/O bus, device memory characteristics I/O buses cripple shared address spaces due to a few limitations. Most I/O buses only allow basic memory access from device to main memory; even cache -coherency is often optional. Access to device memory from CPU is even more +coherency is often optional. Access to device memory from a CPU is even more limited. More often than not, it is not cache coherent. If we only consider the PCIE bus, then a device can access main memory (often through an IOMMU) and be cache coherent with the CPUs. However, it only allows -a limited set of atomic operations from device on main memory. This is worse +a limited set of atomic operations from the device on main memory. This is worse in the other direction: the CPU can only access a limited range of the device memory and cannot perform atomic operations on it. Thus device memory cannot be considered the same as regular memory from the kernel point of view. @@ -93,20 +93,20 @@ The final limitation is latency. Access to main memory from the device has an order of magnitude higher latency than when the device accesses its own memory. Some platforms are developing new I/O buses or additions/modifications to PCIE -to address some of these limitations (OpenCAPI, CCIX). They mainly allow two- -way cache coherency between CPU and device and allow all atomic operations the +to address some of these limitations (OpenCAPI, CCIX). They mainly allow +two-way cache coherency between CPU and device and allow all atomic operations the architecture supports. Sadly, not all platforms are following this trend and some major architectures are left without hardware solutions to these problems. So for shared address space to make sense, not only must we allow devices to access any memory but we must also permit any memory to be migrated to device -memory while device is using it (blocking CPU access while it happens). +memory while the device is using it (blocking CPU access while it happens). Shared address space and migration ================================== -HMM intends to provide two main features. First one is to share the address +HMM intends to provide two main features. 
The first one is to share the address space by duplicating the CPU page table in the device page table so the same address points to the same physical memory for any valid main memory address in the process address space. @@ -121,14 +121,14 @@ why HMM provides helpers to factor out everything that can be while leaving the hardware specific details to the device driver. The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that -allows allocating a struct page for each page of the device memory. Those pages +allows allocating a struct page for each page of device memory. Those pages are special because the CPU cannot map them. However, they allow migrating main memory to device memory using existing migration mechanisms and everything -looks like a page is swapped out to disk from the CPU point of view. Using a -struct page gives the easiest and cleanest integration with existing mm mech- -anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +looks like a page that is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm +mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE memory for the device memory and second to perform migration. Policy decisions -of what and when to migrate things is left to the device driver. +of what and when to migrate is left to the device driver. Note that any CPU access to a device page triggers a page fault and a migration back to main memory. For example, when a page backing a given CPU address A is @@ -136,8 +136,8 @@ migrated from a main memory page to a device page, then any CPU access to address A triggers a page fault and initiates a migration back to main memory. With these two features, HMM not only allows a device to mirror process address -space and keeping both CPU and device page table synchronized, but also lever- -ages device memory by migrating the part of the data set that is actively being +space and keeps both CPU and device page tables synchronized, but also +leverages device memory by migrating the part of the data set that is actively being used by the device. @@ -151,21 +151,28 @@ registration of an hmm_mirror struct:: int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); - int hmm_mirror_register_locked(struct hmm_mirror *mirror, - struct mm_struct *mm); - -The locked variant is to be used when the driver is already holding mmap_sem -of the mm in write mode. The mirror struct has a set of callbacks that are used +The mirror struct has a set of callbacks that are used to propagate CPU page tables:: struct hmm_mirror_ops { + /* release() - release hmm_mirror + * + * @mirror: pointer to struct hmm_mirror + * + * This is called when the mm_struct is being released. The callback + * must ensure that all access to any pages obtained from this mirror + * is halted before the callback returns. All future access should + * fault. + */ + void (*release)(struct hmm_mirror *mirror); + /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update_type: type of update that occurred to the CPU page table - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update + * @update: update information (see struct mmu_notifier_range) + * Return: -EAGAIN if update.blockable false and callback need to + * block, 0 otherwise. 
* * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -176,14 +183,12 @@ to propagate CPU page tables:: * page tables are completely updated (TLBs flushed, etc); this is a * synchronous call. */ - void (*update)(struct hmm_mirror *mirror, - enum hmm_update action, - unsigned long start, - unsigned long end); + int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, + const struct hmm_update *update); }; The device driver must perform the update action to the range (mark range -read only, or fully unmap, ...). The device must be done with the update before +read only, or fully unmap, etc.). The device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can @@ -194,17 +199,18 @@ use either:: The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. -The second one does trigger a page fault on missing or read-only entry if the -write parameter is true. Page faults use the generic mm page fault code path -just like a CPU page fault. +The second one does trigger a page fault on missing or read-only entries if +write access is requested (see below). Page faults use the generic mm page +fault code path just like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each entry in that array corresponds to an address in the virtual range. HMM provides a set of flags to help the driver identify special CPU page table entries. -Locking with the update() callback is the most important aspect the driver must -respect in order to keep things properly synchronized. The usage pattern is:: +Locking within the sync_cpu_device_pagetables() callback is the most important +aspect the driver must respect in order to keep things properly synchronized. +The usage pattern is:: int driver_populate_range(...) { @@ -239,11 +245,11 @@ respect in order to keep things properly synchronized. The usage pattern is:: hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); goto again; } - hmm_mirror_unregister(&range); + hmm_range_unregister(&range); return ret; } take_lock(driver->update); - if (!range.valid) { + if (!hmm_range_valid(&range)) { release_lock(driver->update); up_read(&mm->mmap_sem); goto again; @@ -251,15 +257,15 @@ respect in order to keep things properly synchronized. The usage pattern is:: // Use pfns array content to update device page table - hmm_mirror_unregister(&range); + hmm_range_unregister(&range); release_lock(driver->update); up_read(&mm->mmap_sem); return 0; } The driver->update lock is the same lock that the driver takes inside its -update() callback. That lock must be held before checking the range.valid -field to avoid any race with a concurrent CPU page table update. +sync_cpu_device_pagetables() callback. That lock must be held before calling +hmm_range_valid() to avoid any race with a concurrent CPU page table update. HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing @@ -279,46 +285,47 @@ concurrently). Leverage default_flags and pfn_flags_mask ========================================= -The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows -to set fault or snapshot policy for a whole range instead of having to set them -for each entries in the range. 
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify +fault or snapshot policy for the whole range instead of having to set them +for each entry in the pfns array. + +For instance, if the device flags for range.flags are:: -For instance if the device flags for device entries are: - VALID (1 << 63) - WRITE (1 << 62) + range.flags[HMM_PFN_VALID] = (1 << 63); + range.flags[HMM_PFN_WRITE] = (1 << 62); -Now let say that device driver wants to fault with at least read a range then -it does set:: +and the device driver wants pages for a range with at least read permission, +it sets:: range->default_flags = (1 << 63); range->pfn_flags_mask = 0; -and calls hmm_range_fault() as described above. This will fill fault all page +and calls hmm_range_fault() as described above. This will fill fault all pages in the range with at least read permission. -Now let say driver wants to do the same except for one page in the range for -which its want to have write. Now driver set:: +Now let's say the driver wants to do the same except for one page in the range for +which it wants to have write permission. Now driver set:: range->default_flags = (1 << 63); range->pfn_flags_mask = (1 << 62); range->pfns[index_of_write] = (1 << 62); -With this HMM will fault in all page with at least read (ie valid) and for the +With this, HMM will fault in all pages with at least read (i.e., valid) and for the address == range->start + (index_of_write << PAGE_SHIFT) it will fault with -write permission ie if the CPU pte does not have write permission set then HMM +write permission i.e., if the CPU pte does not have write permission set then HMM will call handle_mm_fault(). -Note that HMM will populate the pfns array with write permission for any entry -that have write permission within the CPU pte no matter what are the values set +Note that HMM will populate the pfns array with write permission for any page +that is mapped with CPU write permission no matter what values are set in default_flags or pfn_flags_mask. Represent and manage device memory from core kernel point of view ================================================================= -Several different designs were tried to support device memory. First one used -a device specific data structure to keep information about migrated memory and -HMM hooked itself in various places of mm code to handle any access to +Several different designs were tried to support device memory. The first one +used a device specific data structure to keep information about migrated memory +and HMM hooked itself in various places of mm code to handle any access to addresses that were backed by device memory. It turns out that this ended up replicating most of the fields of struct page and also needed many kernel code paths to be updated to understand this new kind of memory. @@ -341,7 +348,7 @@ The hmm_devmem_ops is where most of the important things are:: struct hmm_devmem_ops { void (*free)(struct hmm_devmem *devmem, struct page *page); - int (*fault)(struct hmm_devmem *devmem, + vm_fault_t (*fault)(struct hmm_devmem *devmem, struct vm_area_struct *vma, unsigned long addr, struct page *page, @@ -417,9 +424,9 @@ willing to pay to keep all the code simpler. 
Memory cgroup (memcg) and rss accounting ======================================== -For now device memory is accounted as any regular page in rss counters (either +For now, device memory is accounted as any regular page in rss counters (either anonymous if device page is used for anonymous, file if device page is used for -file backed page or shmem if device page is used for shared memory). This is a +file backed page, or shmem if device page is used for shared memory). This is a deliberate choice to keep existing applications, that might start using device memory without knowing about it, running unimpacted. @@ -439,6 +446,6 @@ get more experience in how device memory is used and its impact on memory resource control. -Note that device memory can never be pinned by device driver nor through GUP +Note that device memory can never be pinned by a device driver nor through GUP and thus such memory is always free upon process exit. Or when last reference is dropped in case of shared memory or file backed memory. diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 044a36d7c3f8..740bb00853f5 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -418,9 +418,10 @@ struct hmm_mirror_ops { * * @mirror: pointer to struct hmm_mirror * - * This is called when the mm_struct is being released. - * The callback should make sure no references to the mirror occur - * after the callback returns. + * This is called when the mm_struct is being released. The callback + * must ensure that all access to any pages obtained from this mirror + * is halted before the callback returns. All future access should + * fault. */ void (*release)(struct hmm_mirror *mirror); -- cgit v1.2.3-59-g8ed1b From 085ea25064a9169eba5f2ed6484c111ab0f3ee79 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 6 May 2019 16:29:39 -0700 Subject: mm/hmm: clean up some coding style and comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no functional changes, just some coding style clean ups and minor comment changes. Cc: John Hubbard Cc: Ira Weiny Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Cc: Andrew Morton Signed-off-by: Ralph Campbell Reviewed-by: Jérôme Glisse Signed-off-by: Jason Gunthorpe --- include/linux/hmm.h | 71 +++++++++++++++++++++++++++-------------------------- mm/hmm.c | 62 ++++++++++++++++++++++++---------------------- 2 files changed, 68 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 740bb00853f5..7007123842ba 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -21,8 +21,8 @@ * * HMM address space mirroring API: * - * Use HMM address space mirroring if you want to mirror range of the CPU page - * table of a process into a device page table. Here, "mirror" means "keep + * Use HMM address space mirroring if you want to mirror a range of the CPU + * page tables of a process into a device page table. Here, "mirror" means "keep * synchronized". Prerequisites: the device must provide the ability to write- * protect its page tables (at PAGE_SIZE granularity), and must be able to * recover from the resulting potential page faults. 
@@ -105,10 +105,11 @@ struct hmm { * HMM_PFN_WRITE: CPU page table has write permission set * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) * - * The driver provide a flags array, if driver valid bit for an entry is bit - * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide + * The driver provides a flags array for mapping page protections to device + * PTE bits. If the driver valid bit for an entry is bit 3, + * i.e., (entry & (1 << 3)), then the driver must provide * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3. - * Same logic apply to all flags. This is same idea as vm_page_prot in vma + * Same logic apply to all flags. This is the same idea as vm_page_prot in vma * except that this is per device driver rather than per architecture. */ enum hmm_pfn_flag_e { @@ -129,13 +130,13 @@ enum hmm_pfn_flag_e { * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. * - * Driver provide entry value for none entry, error entry and special entry, - * driver can alias (ie use same value for error and special for instance). It - * should not alias none and error or special. + * Driver provides values for none entry, error entry, and special entry. + * Driver can alias (i.e., use same value) error and special, but + * it should not alias none with error or special. * * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, - * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table + * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry, * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one */ enum hmm_pfn_value_e { @@ -158,6 +159,7 @@ enum hmm_pfn_value_e { * @values: pfn value for some special case (none, special, error, ...) * @default_flags: default flags for the range (write, read, ... see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter + * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT) * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -180,7 +182,7 @@ struct hmm_range { /* * hmm_range_page_shift() - return the page shift for the range * @range: range being queried - * Returns: page shift (page size = 1 << page shift) for the range + * Return: page shift (page size = 1 << page shift) for the range */ static inline unsigned hmm_range_page_shift(const struct hmm_range *range) { @@ -190,7 +192,7 @@ static inline unsigned hmm_range_page_shift(const struct hmm_range *range) /* * hmm_range_page_size() - return the page size for the range * @range: range being queried - * Returns: page size for the range in bytes + * Return: page size for the range in bytes */ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) { @@ -201,7 +203,7 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on * @timeout: time out for wait in ms (ie abort wait after that period of time) - * Returns: true if the range is valid, false otherwise. + * Return: true if the range is valid, false otherwise. 
*/ static inline bool hmm_range_wait_until_valid(struct hmm_range *range, unsigned long timeout) @@ -222,7 +224,7 @@ static inline bool hmm_range_wait_until_valid(struct hmm_range *range, /* * hmm_range_valid() - test if a range is valid or not * @range: range - * Returns: true if the range is valid, false otherwise. + * Return: true if the range is valid, false otherwise. */ static inline bool hmm_range_valid(struct hmm_range *range) { @@ -233,7 +235,7 @@ static inline bool hmm_range_valid(struct hmm_range *range) * hmm_device_entry_to_page() - return struct page pointed to by a device entry * @range: range use to decode device entry value * @entry: device entry value to get corresponding struct page from - * Returns: struct page pointer if entry is a valid, NULL otherwise + * Return: struct page pointer if entry is a valid, NULL otherwise * * If the device entry is valid (ie valid flag set) then return the struct page * matching the entry value. Otherwise return NULL. @@ -256,7 +258,7 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang * hmm_device_entry_to_pfn() - return pfn value store in a device entry * @range: range use to decode device entry value * @entry: device entry to extract pfn from - * Returns: pfn value if device entry is valid, -1UL otherwise + * Return: pfn value if device entry is valid, -1UL otherwise */ static inline unsigned long hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) @@ -276,7 +278,7 @@ hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) * hmm_device_entry_from_page() - create a valid device entry for a page * @range: range use to encode HMM pfn value * @page: page for which to create the device entry - * Returns: valid device entry for the page + * Return: valid device entry for the page */ static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, struct page *page) @@ -289,7 +291,7 @@ static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, * hmm_device_entry_from_pfn() - create a valid device entry value from pfn * @range: range use to encode HMM pfn value * @pfn: pfn value for which to create the device entry - * Returns: valid device entry for the pfn + * Return: valid device entry for the pfn */ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, unsigned long pfn) @@ -394,7 +396,7 @@ enum hmm_update_event { }; /* - * struct hmm_update - HMM update informations for callback + * struct hmm_update - HMM update information for callback * * @start: virtual start address of the range to update * @end: virtual end address of the range to update @@ -428,8 +430,8 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update: update informations (see struct hmm_update) - * Returns: -EAGAIN if update.blockable false and callback need to + * @update: update information (see struct hmm_update) + * Return: -EAGAIN if update.blockable false and callback need to * block, 0 otherwise. 
* * This callback ultimately originates from mmu_notifiers when the CPU @@ -468,13 +470,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* * hmm_mirror_mm_is_alive() - test if mm is still alive * @mirror: the HMM mm mirror for which we want to lock the mmap_sem - * Returns: false if the mm is dead, true otherwise + * Return: false if the mm is dead, true otherwise * - * This is an optimization it will not accurately always return -EINVAL if the - * mm is dead ie there can be false negative (process is being kill but HMM is - * not yet inform of that). It is only intented to be use to optimize out case - * where driver is about to do something time consuming and it would be better - * to skip it if the mm is dead. + * This is an optimization, it will not always accurately return false if the + * mm is dead; i.e., there can be false negatives (process is being killed but + * HMM is not yet informed of that). It is only intended to be used to optimize + * out cases where the driver is about to do something time consuming and it + * would be better to skip it if the mm is dead. */ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) { @@ -489,7 +491,6 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) return true; } - /* * Please see Documentation/vm/hmm.rst for how to use the range API. */ @@ -562,7 +563,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) ret = hmm_range_fault(range, block); if (ret <= 0) { if (ret == -EBUSY || !ret) { - /* Same as above drop mmap_sem to match old API. */ + /* Same as above, drop mmap_sem to match old API. */ up_read(&range->vma->vm_mm->mmap_sem); ret = -EBUSY; } else if (ret == -EAGAIN) @@ -629,7 +630,7 @@ struct hmm_devmem_ops { * @page: pointer to struct page backing virtual address (unreliable) * @flags: FAULT_FLAG_* (see include/linux/mm.h) * @pmdp: page middle directory - * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR + * Return: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR * on error * * The callback occurs whenever there is a CPU page fault or GUP on a @@ -637,14 +638,14 @@ struct hmm_devmem_ops { * page back to regular memory (CPU accessible). * * The device driver is free to migrate more than one page from the - * fault() callback as an optimization. However if device decide to - * migrate more than one page it must always priotirize the faulting + * fault() callback as an optimization. However if the device decides + * to migrate more than one page it must always priotirize the faulting * address over the others. * - * The struct page pointer is only given as an hint to allow quick + * The struct page pointer is only given as a hint to allow quick * lookup of internal device driver data. A concurrent migration - * might have already free that page and the virtual address might - * not longer be back by it. So it should not be modified by the + * might have already freed that page and the virtual address might + * no longer be backed by it. So it should not be modified by the * callback. * * Note that mmap semaphore is held in read mode at least when this @@ -671,7 +672,7 @@ struct hmm_devmem_ops { * @ref: per CPU refcount * @page_fault: callback when CPU fault on an unaddressable device page * - * This an helper structure for device drivers that do not wish to implement + * This is a helper structure for device drivers that do not wish to implement * the gory details related to hotplugging new memoy and allocating struct * pages. 
* diff --git a/mm/hmm.c b/mm/hmm.c index c62ae414a3a2..4db5dcf110ba 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -153,9 +153,8 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) /* Wake-up everyone waiting on any range. */ mutex_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) { + list_for_each_entry(range, &hmm->ranges, list) range->valid = false; - } wake_up_all(&hmm->wq); mutex_unlock(&hmm->lock); @@ -166,9 +165,10 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) list_del_init(&mirror->list); if (mirror->ops->release) { /* - * Drop mirrors_sem so callback can wait on any pending - * work that might itself trigger mmu_notifier callback - * and thus would deadlock with us. + * Drop mirrors_sem so the release callback can wait + * on any pending work that might itself trigger a + * mmu_notifier callback and thus would deadlock with + * us. */ up_write(&hmm->mirrors_sem); mirror->ops->release(mirror); @@ -223,11 +223,8 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, int ret; ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); - if (!update.blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); - ret = -EAGAIN; - goto out; - } + if (!update.blockable && ret == -EAGAIN) + break; } up_read(&hmm->mirrors_sem); @@ -271,6 +268,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { * * @mirror: new mirror struct to register * @mm: mm to register against + * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments * * To start mirroring a process address space, the device driver must register * an HMM mirror struct. @@ -298,7 +296,7 @@ EXPORT_SYMBOL(hmm_mirror_register); /* * hmm_mirror_unregister() - unregister a mirror * - * @mirror: new mirror struct to register + * @mirror: mirror struct to unregister * * Stop mirroring a process address space, and cleanup. */ @@ -372,7 +370,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EBUSY after page fault, or page fault error + * Return: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -911,6 +909,7 @@ int hmm_range_register(struct hmm_range *range, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); + struct hmm *hmm; range->valid = false; range->hmm = NULL; @@ -924,28 +923,29 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - range->hmm = hmm_get_or_create(mm); - if (!range->hmm) + hmm = hmm_get_or_create(mm); + if (!hmm) return -EFAULT; /* Check if hmm_mm_destroy() was call. */ - if (range->hmm->mm == NULL || range->hmm->dead) { - hmm_put(range->hmm); + if (hmm->mm == NULL || hmm->dead) { + hmm_put(hmm); return -EFAULT; } - /* Initialize range to track CPU page table update */ - mutex_lock(&range->hmm->lock); + /* Initialize range to track CPU page table updates. */ + mutex_lock(&hmm->lock); - list_add_rcu(&range->list, &range->hmm->ranges); + range->hmm = hmm; + list_add_rcu(&range->list, &hmm->ranges); /* * If there are any concurrent notifiers we have to wait for them for * the range to be valid (see hmm_range_wait_until_valid()). 
*/ - if (!range->hmm->notifiers) + if (!hmm->notifiers) range->valid = true; - mutex_unlock(&range->hmm->lock); + mutex_unlock(&hmm->lock); return 0; } @@ -960,17 +960,19 @@ EXPORT_SYMBOL(hmm_range_register); */ void hmm_range_unregister(struct hmm_range *range) { + struct hmm *hmm = range->hmm; + /* Sanity check this really should not happen. */ - if (range->hmm == NULL || range->end <= range->start) + if (hmm == NULL || range->end <= range->start) return; - mutex_lock(&range->hmm->lock); + mutex_lock(&hmm->lock); list_del_rcu(&range->list); - mutex_unlock(&range->hmm->lock); + mutex_unlock(&hmm->lock); /* Drop reference taken by hmm_range_register() */ range->valid = false; - hmm_put(range->hmm); + hmm_put(hmm); range->hmm = NULL; } EXPORT_SYMBOL(hmm_range_unregister); @@ -978,7 +980,7 @@ EXPORT_SYMBOL(hmm_range_unregister); /* * hmm_range_snapshot() - snapshot CPU page table for a range * @range: range - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid * permission (for instance asking for write and range is read only), * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid * vma or it is illegal to access that range), number of valid pages @@ -1061,7 +1063,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of valid pages in range->pfns[] (from range start + * Return: number of valid pages in range->pfns[] (from range start * address). This may be zero. If the return value is negative, * then one of the following values may be returned: * @@ -1179,7 +1181,7 @@ EXPORT_SYMBOL(hmm_range_fault); * @device: device against to dma map page to * @daddrs: dma address of mapped pages * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been * drop and you need to try again, some other error value otherwise * * Note same usage pattern as hmm_range_fault(). 
@@ -1267,7 +1269,7 @@ EXPORT_SYMBOL(hmm_range_dma_map); * @device: device against which dma map was done * @daddrs: dma address of mapped pages * @dirty: dirty page if it had the write flag set - * Returns: number of page unmapped on success, -EINVAL otherwise + * Return: number of page unmapped on success, -EINVAL otherwise * * Note that caller MUST abide by mmu notifier or use HMM mirror and abide * to the sync_cpu_device_pagetables() callback so that it is safe here to @@ -1390,7 +1392,7 @@ static void hmm_devmem_free(struct page *page, void *data) * @ops: memory event device driver callback (see struct hmm_devmem_ops) * @device: device struct to bind the resource too * @size: size in bytes of the device memory to add - * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise + * Return: pointer to new hmm_devmem struct ERR_PTR otherwise * * This function first finds an empty range of physical address big enough to * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which -- cgit v1.2.3-59-g8ed1b From 6d7c3cde93c1d9ac0b37f78ec3f2ff052159a242 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 22 May 2019 16:52:52 -0300 Subject: mm/hmm: fix use after free with struct hmm in the mmu notifiers mmu_notifier_unregister_no_release() is not a fence and the mmu_notifier system will continue to reference hmm->mn until the srcu grace period expires, resulting in use-after-free races like this:

     CPU0                                   CPU1
                                       __mmu_notifier_invalidate_range_start()
                                         srcu_read_lock
                                           hlist_for_each ()
                                             // mn == hmm->mn
hmm_mirror_unregister()
  hmm_put()
    hmm_free()
      mmu_notifier_unregister_no_release()
        hlist_del_init_rcu(hmm-mn->list)
                                             mn->ops->invalidate_range_start(mn, range);
                                               mm_get_hmm()
                                                 mm->hmm = NULL;
                                                 kfree(hmm)
                                                 mutex_lock(&hmm->lock);

Use SRCU to kfree the hmm memory so that the notifiers can rely on hmm existing. Get the now-safe hmm struct through container_of and directly check kref_get_unless_zero to lock it against free.
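Reduced to a standalone sketch, the pattern the fix uses looks like this; struct foo, foo_free() and foo_notifier_release() are hypothetical stand-ins for struct hmm and its notifier ops:

#include <linux/kref.h>
#include <linux/mmu_notifier.h>
#include <linux/slab.h>

struct foo {
	struct kref kref;
	struct mmu_notifier mmu_notifier;
};

static void foo_free(struct kref *kref)
{
	/* In the actual patch this kfree() is deferred via
	 * mmu_notifier_call_srcu() so notifier callbacks may keep
	 * dereferencing the object until the SRCU grace period ends. */
	kfree(container_of(kref, struct foo, kref));
}

static void foo_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* Recover the owner directly from the notifier instead of
	 * through a back pointer that may already be cleared ... */
	struct foo *foo = container_of(mn, struct foo, mmu_notifier);

	/* ... and bail out if the object is already being freed. */
	if (!kref_get_unless_zero(&foo->kref))
		return;

	/* ... safe to use foo here ... */

	kref_put(&foo->kref, foo_free);
}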
Signed-off-by: Jason Gunthorpe Reviewed-by: Ira Weiny Reviewed-by: John Hubbard Reviewed-by: Ralph Campbell Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 1 + mm/hmm.c | 23 +++++++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7007123842ba..cb01cf1fa3c0 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -93,6 +93,7 @@ struct hmm { struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; wait_queue_head_t wq; + struct rcu_head rcu; long notifiers; bool dead; }; diff --git a/mm/hmm.c b/mm/hmm.c index 826816ab2377..f6956d78e3cb 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -104,6 +104,11 @@ error: return NULL; } +static void hmm_free_rcu(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct hmm, rcu)); +} + static void hmm_free(struct kref *kref) { struct hmm *hmm = container_of(kref, struct hmm, kref); @@ -116,7 +121,7 @@ static void hmm_free(struct kref *kref) mm->hmm = NULL; spin_unlock(&mm->page_table_lock); - kfree(hmm); + mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); } static inline void hmm_put(struct hmm *hmm) @@ -144,10 +149,14 @@ void hmm_mm_destroy(struct mm_struct *mm) static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct hmm *hmm = mm_get_hmm(mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_range *range; + /* Bail out if hmm is in the process of being freed */ + if (!kref_get_unless_zero(&hmm->kref)) + return; + /* Report this HMM as dying. */ hmm->dead = true; @@ -185,13 +194,14 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_update update; struct hmm_range *range; int ret = 0; - VM_BUG_ON(!hmm); + if (!kref_get_unless_zero(&hmm->kref)) + return 0; update.start = nrange->start; update.end = nrange->end; @@ -236,9 +246,10 @@ out: static void hmm_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - VM_BUG_ON(!hmm); + if (!kref_get_unless_zero(&hmm->kref)) + return; mutex_lock(&hmm->lock); hmm->notifiers--; -- cgit v1.2.3-59-g8ed1b From f652e66fcca07e59f207bcca27c5566193feabd5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 9 Jun 2019 23:43:13 +0900 Subject: pinctrl: add include guard to pinctrl-state.h Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl-state.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl-state.h b/include/linux/pinctrl/pinctrl-state.h index a0e785815a64..635d97e9285e 100644 --- a/include/linux/pinctrl/pinctrl-state.h +++ b/include/linux/pinctrl/pinctrl-state.h @@ -3,6 +3,9 @@ * Standard pin control state definitions */ +#ifndef __LINUX_PINCTRL_PINCTRL_STATE_H +#define __LINUX_PINCTRL_PINCTRL_STATE_H + /** * @PINCTRL_STATE_DEFAULT: the state the pinctrl handle shall be put * into as default, usually this means the pins are up and ready to @@ -31,3 +34,5 @@ #define PINCTRL_STATE_INIT "init" #define PINCTRL_STATE_IDLE "idle" #define PINCTRL_STATE_SLEEP "sleep" + +#endif /* 
__LINUX_PINCTRL_PINCTRL_STATE_H */ -- cgit v1.2.3-59-g8ed1b From e63d79d1ffcd2201a2dbff1d7a1184b8f3ec74cf Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Tue, 4 Jun 2019 15:29:22 +0200 Subject: dmaengine: Add Synopsys eDMA IP core driver Add the Synopsys PCIe Endpoint eDMA IP core driver to the kernel. This IP is generally distributed with the Synopsys PCIe Endpoint IP (depending on the use case and licensing agreement). This core driver initializes and configures the eDMA IP using vma-helper functions and the dma-engine subsystem. The driver can be compiled as a built-in or as an external module. To enable it, select the DW_EDMA option in the kernel configuration; it automatically selects the DMA_ENGINE and DMA_VIRTUAL_CHANNELS options it requires. In order to transfer data from point A to B as fast as possible this IP requires a dedicated memory space containing a linked list of elements. All elements of this linked list are contiguous and each one describes a data transfer (source and destination addresses, length and a control variable). For the sake of simplicity, let's assume a memory space for write channel 0 which holds about 42 elements.

+---------+
| Desc #0 |-+
+---------+ |
            V
+----------+
| Chunk #0 |-+
| CB = 1   | |
+----------+ |  +----------+  +-----+  +-----------+  +-----+
             +->| Burst #0 |->| ... |->| Burst #41 |->| llp |
             |  +----------+  +-----+  +-----------+  +-----+
             V
+----------+
| Chunk #1 |-+
| CB = 0   | |
+----------+ |  +-----------+  +-----+  +-----------+  +-----+
             +->| Burst #42 |->| ... |->| Burst #83 |->| llp |
             |  +-----------+  +-----+  +-----------+  +-----+
             V
+----------+
| Chunk #2 |-+
| CB = 1   | |
+----------+ |  +-----------+  +-----+  +------------+  +-----+
             +->| Burst #84 |->| ... |->| Burst #125 |->| llp |
             |  +-----------+  +-----+  +------------+  +-----+
             V
+----------+
| Chunk #3 |-+
| CB = 0   | |
+----------+ |  +------------+  +-----+  +------------+  +-----+
             +->| Burst #126 |->| ... |->| Burst #129 |->| llp |
                +------------+  +-----+  +------------+  +-----+

Legend:
- Linked list, also known as a Chunk.
- Linked list element, also known as a Burst.
- CB, also known as the Change Bit, is a control bit (typically toggled) that allows the current linked list to be easily identified and differentiated from the previous or the next one.
- LLP is a special element that indicates the end of the linked list element stream and also informs that the next CB should be toggled.

On the last Burst of each Chunk (Burst #41, Burst #83, Burst #125, and Burst #129) some flags are set in its control variable (the RIE and LIE bits) that trigger a "done" interrupt. In the interrupt callback it is decided whether to recycle the linked list memory space by writing a new set of Burst elements (if there are still Chunks to transfer) or to consider the transfer completed (if there are no Chunks left to transfer). In scatter-gather transfer mode, the client submits a scatter-gather list of n (in this case 130) elements, which is divided into multiple Chunks; each Chunk has a limited number of Bursts (in this case 42). After all Bursts of a Chunk are transferred, an interrupt is triggered, which allows the dedicated linked list memory to be recycled with the information for the next Chunk and its associated Bursts, and the whole cycle repeats. In cyclic transfer mode, the client submits a buffer pointer, its length and a number of repetitions; in this case each Burst corresponds directly to one repetition.
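For orientation, this is roughly how a dmaengine client would drive a scatter-gather transfer like the one described above, using the generic dmaengine API. The channel name "tx", the already-prepared sgl/nents, and the omitted dmaengine_slave_config() step are assumptions for the sketch, not part of this patch; the eDMA core splits the resulting sg list into Chunks and Bursts internally:

#include <linux/dmaengine.h>
#include <linux/scatterlist.h>

static int submit_sg(struct device *dev, struct scatterlist *sgl, int nents)
{
	struct dma_async_tx_descriptor *desc;
	struct dma_chan *chan;
	dma_cookie_t cookie;

	/* Ask the dmaengine framework for the channel named "tx". */
	chan = dma_request_chan(dev, "tx");
	if (IS_ERR(chan))
		return PTR_ERR(chan);

	/* Map the sg list onto a hardware descriptor chain. */
	desc = dmaengine_prep_slave_sg(chan, sgl, nents, DMA_MEM_TO_DEV,
				       DMA_PREP_INTERRUPT);
	if (!desc) {
		dma_release_channel(chan);
		return -EIO;
	}

	/* Queue the descriptor and kick off the transfer. */
	cookie = dmaengine_submit(desc);
	dma_async_issue_pending(chan);
	return dma_submit_error(cookie);
}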
Each Burst describes a data transfer from point A (source) to point B
(destination) with a length that can range from 1 byte up to 4 GB.
Since the dedicated memory space where the linked list resides is
limited, the n Burst elements are organized into several Chunks, which
are used later to recycle the dedicated memory space and initiate a new
sequence of data transfers. The whole transfer is considered complete
once all Bursts have been transferred.

Currently this IP has a well-known register map, which includes support
for legacy and unroll modes. Legacy mode is the version of this
register map that has a multiplexer register for switching between the
registers of all write and read channels, while unroll mode repeats the
registers of all write and read channels with an offset between them.
This register map is called v0.

The IP team is creating a new register map better suited to the latest
PCIe features, which will very likely change the register map; that
version will be called v1. As soon as this new version is released by
the IP team, support for it will be added to this driver.

Logically, patches 1, 2 and 3 should be squashed into a single patch,
but they were split into these 3 patch files for ease of review.

Signed-off-by: Gustavo Pimentel
Cc: Vinod Koul
Cc: Dan Williams
Cc: Andy Shevchenko
Cc: Russell King
Cc: Joao Pinto
Signed-off-by: Vinod Koul
---
 drivers/dma/Kconfig                |   2 +
 drivers/dma/Makefile               |   1 +
 drivers/dma/dw-edma/Kconfig        |   9 +
 drivers/dma/dw-edma/Makefile       |   4 +
 drivers/dma/dw-edma/dw-edma-core.c | 936 +++++++++++++++++++++++++++++++
 drivers/dma/dw-edma/dw-edma-core.h | 165 ++++++
 include/linux/dma/edma.h           |  47 ++
 7 files changed, 1164 insertions(+)
 create mode 100644 drivers/dma/dw-edma/Kconfig
 create mode 100644 drivers/dma/dw-edma/Makefile
 create mode 100644 drivers/dma/dw-edma/dw-edma-core.c
 create mode 100644 drivers/dma/dw-edma/dw-edma-core.h
 create mode 100644 include/linux/dma/edma.h

(limited to 'include/linux')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index eaf78f4e07ce..76859aa2688c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -665,6 +665,8 @@ source "drivers/dma/qcom/Kconfig"

 source "drivers/dma/dw/Kconfig"

+source "drivers/dma/dw-edma/Kconfig"
+
 source "drivers/dma/hsu/Kconfig"

 source "drivers/dma/sh/Kconfig"
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 6126e1c3a875..5bddf6f8790f 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_DMA_SUN4I) += sun4i-dma.o
 obj-$(CONFIG_DMA_SUN6I) += sun6i-dma.o
 obj-$(CONFIG_DW_AXI_DMAC) += dw-axi-dmac/
 obj-$(CONFIG_DW_DMAC_CORE) += dw/
+obj-$(CONFIG_DW_EDMA) += dw-edma/
 obj-$(CONFIG_EP93XX_DMA) += ep93xx_dma.o
 obj-$(CONFIG_FSL_DMA) += fsldma.o
 obj-$(CONFIG_FSL_EDMA) += fsl-edma.o fsl-edma-common.o
diff --git a/drivers/dma/dw-edma/Kconfig b/drivers/dma/dw-edma/Kconfig
new file mode 100644
index 000000000000..3016bed63589
--- /dev/null
+++ b/drivers/dma/dw-edma/Kconfig
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config DW_EDMA
+	tristate "Synopsys DesignWare eDMA controller driver"
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Support the Synopsys DesignWare eDMA controller, normally
+	  implemented on endpoints SoCs.
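For context between the files: the chip-level API that a glue driver
uses to hand an eDMA instance to this core is declared further below in
include/linux/dma/edma.h. A minimal sketch of such glue (hypothetical
code, assuming only the dw_edma_chip structure and dw_edma_probe() from
this patch; the real PCIe glue driver is a separate patch):

/* Hypothetical glue: register one eDMA instance with the core. */
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/dma/edma.h>

static int example_edma_attach(struct device *dev, int irq)
{
	struct dw_edma_chip *chip;

	chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
	if (!chip)
		return -ENOMEM;

	chip->dev = dev;	/* device providing the eDMA registers */
	chip->id = 0;		/* instance ID */
	chip->irq = irq;	/* base IRQ line */
	/* chip->dw (struct dw_edma) must be set up by the glue with the
	 * register, linked-list and data regions plus nr_irqs before
	 * probing; that wiring is omitted here. */

	return dw_edma_probe(chip);
}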
diff --git a/drivers/dma/dw-edma/Makefile b/drivers/dma/dw-edma/Makefile new file mode 100644 index 000000000000..322401089891 --- /dev/null +++ b/drivers/dma/dw-edma/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_DW_EDMA) += dw-edma.o +dw-edma-objs := dw-edma-core.o diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c new file mode 100644 index 000000000000..c9d032f49dc3 --- /dev/null +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -0,0 +1,936 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates. + * Synopsys DesignWare eDMA core driver + * + * Author: Gustavo Pimentel + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dw-edma-core.h" +#include "../dmaengine.h" +#include "../virt-dma.h" + +static inline +struct device *dchan2dev(struct dma_chan *dchan) +{ + return &dchan->dev->device; +} + +static inline +struct device *chan2dev(struct dw_edma_chan *chan) +{ + return &chan->vc.chan.dev->device; +} + +static inline +struct dw_edma_desc *vd2dw_edma_desc(struct virt_dma_desc *vd) +{ + return container_of(vd, struct dw_edma_desc, vd); +} + +static struct dw_edma_burst *dw_edma_alloc_burst(struct dw_edma_chunk *chunk) +{ + struct dw_edma_burst *burst; + + burst = kzalloc(sizeof(*burst), GFP_NOWAIT); + if (unlikely(!burst)) + return NULL; + + INIT_LIST_HEAD(&burst->list); + if (chunk->burst) { + /* Create and add new element into the linked list */ + chunk->bursts_alloc++; + list_add_tail(&burst->list, &chunk->burst->list); + } else { + /* List head */ + chunk->bursts_alloc = 0; + chunk->burst = burst; + } + + return burst; +} + +static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc) +{ + struct dw_edma_chan *chan = desc->chan; + struct dw_edma *dw = chan->chip->dw; + struct dw_edma_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT); + if (unlikely(!chunk)) + return NULL; + + INIT_LIST_HEAD(&chunk->list); + chunk->chan = chan; + /* Toggling change bit (CB) in each chunk, this is a mechanism to + * inform the eDMA HW block that this is a new linked list ready + * to be consumed. 
+ * - Odd chunks originate CB equal to 0 + * - Even chunks originate CB equal to 1 + */ + chunk->cb = !(desc->chunks_alloc % 2); + chunk->ll_region.paddr = dw->ll_region.paddr + chan->ll_off; + chunk->ll_region.vaddr = dw->ll_region.vaddr + chan->ll_off; + + if (desc->chunk) { + /* Create and add new element into the linked list */ + desc->chunks_alloc++; + list_add_tail(&chunk->list, &desc->chunk->list); + if (!dw_edma_alloc_burst(chunk)) { + kfree(chunk); + return NULL; + } + } else { + /* List head */ + chunk->burst = NULL; + desc->chunks_alloc = 0; + desc->chunk = chunk; + } + + return chunk; +} + +static struct dw_edma_desc *dw_edma_alloc_desc(struct dw_edma_chan *chan) +{ + struct dw_edma_desc *desc; + + desc = kzalloc(sizeof(*desc), GFP_NOWAIT); + if (unlikely(!desc)) + return NULL; + + desc->chan = chan; + if (!dw_edma_alloc_chunk(desc)) { + kfree(desc); + return NULL; + } + + return desc; +} + +static void dw_edma_free_burst(struct dw_edma_chunk *chunk) +{ + struct dw_edma_burst *child, *_next; + + /* Remove all the list elements */ + list_for_each_entry_safe(child, _next, &chunk->burst->list, list) { + list_del(&child->list); + kfree(child); + chunk->bursts_alloc--; + } + + /* Remove the list head */ + kfree(child); + chunk->burst = NULL; +} + +static void dw_edma_free_chunk(struct dw_edma_desc *desc) +{ + struct dw_edma_chunk *child, *_next; + + if (!desc->chunk) + return; + + /* Remove all the list elements */ + list_for_each_entry_safe(child, _next, &desc->chunk->list, list) { + dw_edma_free_burst(child); + list_del(&child->list); + kfree(child); + desc->chunks_alloc--; + } + + /* Remove the list head */ + kfree(child); + desc->chunk = NULL; +} + +static void dw_edma_free_desc(struct dw_edma_desc *desc) +{ + dw_edma_free_chunk(desc); + kfree(desc); +} + +static void vchan_free_desc(struct virt_dma_desc *vdesc) +{ + dw_edma_free_desc(vd2dw_edma_desc(vdesc)); +} + +static void dw_edma_start_transfer(struct dw_edma_chan *chan) +{ + struct dw_edma_chunk *child; + struct dw_edma_desc *desc; + struct virt_dma_desc *vd; + + vd = vchan_next_desc(&chan->vc); + if (!vd) + return; + + desc = vd2dw_edma_desc(vd); + if (!desc) + return; + + child = list_first_entry_or_null(&desc->chunk->list, + struct dw_edma_chunk, list); + if (!child) + return; + + dw_edma_v0_core_start(child, !desc->xfer_sz); + desc->xfer_sz += child->ll_region.sz; + dw_edma_free_burst(child); + list_del(&child->list); + kfree(child); + desc->chunks_alloc--; +} + +static int dw_edma_device_config(struct dma_chan *dchan, + struct dma_slave_config *config) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + + memcpy(&chan->config, config, sizeof(*config)); + chan->configured = true; + + return 0; +} + +static int dw_edma_device_pause(struct dma_chan *dchan) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + int err = 0; + + if (!chan->configured) + err = -EPERM; + else if (chan->status != EDMA_ST_BUSY) + err = -EPERM; + else if (chan->request != EDMA_REQ_NONE) + err = -EPERM; + else + chan->request = EDMA_REQ_PAUSE; + + return err; +} + +static int dw_edma_device_resume(struct dma_chan *dchan) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + int err = 0; + + if (!chan->configured) { + err = -EPERM; + } else if (chan->status != EDMA_ST_PAUSE) { + err = -EPERM; + } else if (chan->request != EDMA_REQ_NONE) { + err = -EPERM; + } else { + chan->status = EDMA_ST_BUSY; + dw_edma_start_transfer(chan); + } + + return err; +} + +static int dw_edma_device_terminate_all(struct dma_chan *dchan) +{ + 
struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + int err = 0; + LIST_HEAD(head); + + if (!chan->configured) { + /* Do nothing */ + } else if (chan->status == EDMA_ST_PAUSE) { + chan->status = EDMA_ST_IDLE; + chan->configured = false; + } else if (chan->status == EDMA_ST_IDLE) { + chan->configured = false; + } else if (dw_edma_v0_core_ch_status(chan) == DMA_COMPLETE) { + /* + * The channel is in a false BUSY state, probably didn't + * receive or lost an interrupt + */ + chan->status = EDMA_ST_IDLE; + chan->configured = false; + } else if (chan->request > EDMA_REQ_PAUSE) { + err = -EPERM; + } else { + chan->request = EDMA_REQ_STOP; + } + + return err; +} + +static void dw_edma_device_issue_pending(struct dma_chan *dchan) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + unsigned long flags; + + spin_lock_irqsave(&chan->vc.lock, flags); + if (chan->configured && chan->request == EDMA_REQ_NONE && + chan->status == EDMA_ST_IDLE && vchan_issue_pending(&chan->vc)) { + chan->status = EDMA_ST_BUSY; + dw_edma_start_transfer(chan); + } + spin_unlock_irqrestore(&chan->vc.lock, flags); +} + +static enum dma_status +dw_edma_device_tx_status(struct dma_chan *dchan, dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + struct dw_edma_desc *desc; + struct virt_dma_desc *vd; + unsigned long flags; + enum dma_status ret; + u32 residue = 0; + + ret = dma_cookie_status(dchan, cookie, txstate); + if (ret == DMA_COMPLETE) + return ret; + + if (ret == DMA_IN_PROGRESS && chan->status == EDMA_ST_PAUSE) + ret = DMA_PAUSED; + + if (!txstate) + goto ret_residue; + + spin_lock_irqsave(&chan->vc.lock, flags); + vd = vchan_find_desc(&chan->vc, cookie); + if (vd) { + desc = vd2dw_edma_desc(vd); + if (desc) + residue = desc->alloc_sz - desc->xfer_sz; + } + spin_unlock_irqrestore(&chan->vc.lock, flags); + +ret_residue: + dma_set_residue(txstate, residue); + + return ret; +} + +static struct dma_async_tx_descriptor * +dw_edma_device_transfer(struct dw_edma_transfer *xfer) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(xfer->dchan); + enum dma_transfer_direction direction = xfer->direction; + phys_addr_t src_addr, dst_addr; + struct scatterlist *sg = NULL; + struct dw_edma_chunk *chunk; + struct dw_edma_burst *burst; + struct dw_edma_desc *desc; + u32 cnt; + int i; + + if ((direction == DMA_MEM_TO_DEV && chan->dir == EDMA_DIR_WRITE) || + (direction == DMA_DEV_TO_MEM && chan->dir == EDMA_DIR_READ)) + return NULL; + + if (xfer->cyclic) { + if (!xfer->xfer.cyclic.len || !xfer->xfer.cyclic.cnt) + return NULL; + } else { + if (xfer->xfer.sg.len < 1) + return NULL; + } + + if (!chan->configured) + return NULL; + + desc = dw_edma_alloc_desc(chan); + if (unlikely(!desc)) + goto err_alloc; + + chunk = dw_edma_alloc_chunk(desc); + if (unlikely(!chunk)) + goto err_alloc; + + src_addr = chan->config.src_addr; + dst_addr = chan->config.dst_addr; + + if (xfer->cyclic) { + cnt = xfer->xfer.cyclic.cnt; + } else { + cnt = xfer->xfer.sg.len; + sg = xfer->xfer.sg.sgl; + } + + for (i = 0; i < cnt; i++) { + if (!xfer->cyclic && !sg) + break; + + if (chunk->bursts_alloc == chan->ll_max) { + chunk = dw_edma_alloc_chunk(desc); + if (unlikely(!chunk)) + goto err_alloc; + } + + burst = dw_edma_alloc_burst(chunk); + if (unlikely(!burst)) + goto err_alloc; + + if (xfer->cyclic) + burst->sz = xfer->xfer.cyclic.len; + else + burst->sz = sg_dma_len(sg); + + chunk->ll_region.sz += burst->sz; + desc->alloc_sz += burst->sz; + + if (direction == DMA_DEV_TO_MEM) { + burst->sar 
= src_addr; + if (xfer->cyclic) { + burst->dar = xfer->xfer.cyclic.paddr; + } else { + burst->dar = sg_dma_address(sg); + /* Unlike the typical assumption by other + * drivers/IPs the peripheral memory isn't + * a FIFO memory, in this case, it's a + * linear memory and that why the source + * and destination addresses are increased + * by the same portion (data length) + */ + src_addr += sg_dma_len(sg); + } + } else { + burst->dar = dst_addr; + if (xfer->cyclic) { + burst->sar = xfer->xfer.cyclic.paddr; + } else { + burst->sar = sg_dma_address(sg); + /* Unlike the typical assumption by other + * drivers/IPs the peripheral memory isn't + * a FIFO memory, in this case, it's a + * linear memory and that why the source + * and destination addresses are increased + * by the same portion (data length) + */ + dst_addr += sg_dma_len(sg); + } + } + + if (!xfer->cyclic) + sg = sg_next(sg); + } + + return vchan_tx_prep(&chan->vc, &desc->vd, xfer->flags); + +err_alloc: + if (desc) + dw_edma_free_desc(desc); + + return NULL; +} + +static struct dma_async_tx_descriptor * +dw_edma_device_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl, + unsigned int len, + enum dma_transfer_direction direction, + unsigned long flags, void *context) +{ + struct dw_edma_transfer xfer; + + xfer.dchan = dchan; + xfer.direction = direction; + xfer.xfer.sg.sgl = sgl; + xfer.xfer.sg.len = len; + xfer.flags = flags; + xfer.cyclic = false; + + return dw_edma_device_transfer(&xfer); +} + +static struct dma_async_tx_descriptor * +dw_edma_device_prep_dma_cyclic(struct dma_chan *dchan, dma_addr_t paddr, + size_t len, size_t count, + enum dma_transfer_direction direction, + unsigned long flags) +{ + struct dw_edma_transfer xfer; + + xfer.dchan = dchan; + xfer.direction = direction; + xfer.xfer.cyclic.paddr = paddr; + xfer.xfer.cyclic.len = len; + xfer.xfer.cyclic.cnt = count; + xfer.flags = flags; + xfer.cyclic = true; + + return dw_edma_device_transfer(&xfer); +} + +static void dw_edma_done_interrupt(struct dw_edma_chan *chan) +{ + struct dw_edma_desc *desc; + struct virt_dma_desc *vd; + unsigned long flags; + + dw_edma_v0_core_clear_done_int(chan); + + spin_lock_irqsave(&chan->vc.lock, flags); + vd = vchan_next_desc(&chan->vc); + if (vd) { + switch (chan->request) { + case EDMA_REQ_NONE: + desc = vd2dw_edma_desc(vd); + if (desc->chunks_alloc) { + chan->status = EDMA_ST_BUSY; + dw_edma_start_transfer(chan); + } else { + list_del(&vd->node); + vchan_cookie_complete(vd); + chan->status = EDMA_ST_IDLE; + } + break; + + case EDMA_REQ_STOP: + list_del(&vd->node); + vchan_cookie_complete(vd); + chan->request = EDMA_REQ_NONE; + chan->status = EDMA_ST_IDLE; + break; + + case EDMA_REQ_PAUSE: + chan->request = EDMA_REQ_NONE; + chan->status = EDMA_ST_PAUSE; + break; + + default: + break; + } + } + spin_unlock_irqrestore(&chan->vc.lock, flags); +} + +static void dw_edma_abort_interrupt(struct dw_edma_chan *chan) +{ + struct virt_dma_desc *vd; + unsigned long flags; + + dw_edma_v0_core_clear_abort_int(chan); + + spin_lock_irqsave(&chan->vc.lock, flags); + vd = vchan_next_desc(&chan->vc); + if (vd) { + list_del(&vd->node); + vchan_cookie_complete(vd); + } + spin_unlock_irqrestore(&chan->vc.lock, flags); + chan->request = EDMA_REQ_NONE; + chan->status = EDMA_ST_IDLE; +} + +static irqreturn_t dw_edma_interrupt(int irq, void *data, bool write) +{ + struct dw_edma_irq *dw_irq = data; + struct dw_edma *dw = dw_irq->dw; + unsigned long total, pos, val; + unsigned long off; + u32 mask; + + if (write) { + total = dw->wr_ch_cnt; + off = 0; + 
mask = dw_irq->wr_mask; + } else { + total = dw->rd_ch_cnt; + off = dw->wr_ch_cnt; + mask = dw_irq->rd_mask; + } + + val = dw_edma_v0_core_status_done_int(dw, write ? + EDMA_DIR_WRITE : + EDMA_DIR_READ); + val &= mask; + for_each_set_bit(pos, &val, total) { + struct dw_edma_chan *chan = &dw->chan[pos + off]; + + dw_edma_done_interrupt(chan); + } + + val = dw_edma_v0_core_status_abort_int(dw, write ? + EDMA_DIR_WRITE : + EDMA_DIR_READ); + val &= mask; + for_each_set_bit(pos, &val, total) { + struct dw_edma_chan *chan = &dw->chan[pos + off]; + + dw_edma_abort_interrupt(chan); + } + + return IRQ_HANDLED; +} + +static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data) +{ + return dw_edma_interrupt(irq, data, true); +} + +static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data) +{ + return dw_edma_interrupt(irq, data, false); +} + +static irqreturn_t dw_edma_interrupt_common(int irq, void *data) +{ + dw_edma_interrupt(irq, data, true); + dw_edma_interrupt(irq, data, false); + + return IRQ_HANDLED; +} + +static int dw_edma_alloc_chan_resources(struct dma_chan *dchan) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + + if (chan->status != EDMA_ST_IDLE) + return -EBUSY; + + pm_runtime_get(chan->chip->dev); + + return 0; +} + +static void dw_edma_free_chan_resources(struct dma_chan *dchan) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(5000); + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + int ret; + + while (time_before(jiffies, timeout)) { + ret = dw_edma_device_terminate_all(dchan); + if (!ret) + break; + + if (time_after_eq(jiffies, timeout)) + return; + + cpu_relax(); + }; + + pm_runtime_put(chan->chip->dev); +} + +static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write, + u32 wr_alloc, u32 rd_alloc) +{ + struct dw_edma_region *dt_region; + struct device *dev = chip->dev; + struct dw_edma *dw = chip->dw; + struct dw_edma_chan *chan; + size_t ll_chunk, dt_chunk; + struct dw_edma_irq *irq; + struct dma_device *dma; + u32 i, j, cnt, ch_cnt; + u32 alloc, off_alloc; + int err = 0; + u32 pos; + + ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt; + ll_chunk = dw->ll_region.sz; + dt_chunk = dw->dt_region.sz; + + /* Calculate linked list chunk for each channel */ + ll_chunk /= roundup_pow_of_two(ch_cnt); + + /* Calculate linked list chunk for each channel */ + dt_chunk /= roundup_pow_of_two(ch_cnt); + + if (write) { + i = 0; + cnt = dw->wr_ch_cnt; + dma = &dw->wr_edma; + alloc = wr_alloc; + off_alloc = 0; + } else { + i = dw->wr_ch_cnt; + cnt = dw->rd_ch_cnt; + dma = &dw->rd_edma; + alloc = rd_alloc; + off_alloc = wr_alloc; + } + + INIT_LIST_HEAD(&dma->channels); + for (j = 0; (alloc || dw->nr_irqs == 1) && j < cnt; j++, i++) { + chan = &dw->chan[i]; + + dt_region = devm_kzalloc(dev, sizeof(*dt_region), GFP_KERNEL); + if (!dt_region) + return -ENOMEM; + + chan->vc.chan.private = dt_region; + + chan->chip = chip; + chan->id = j; + chan->dir = write ? EDMA_DIR_WRITE : EDMA_DIR_READ; + chan->configured = false; + chan->request = EDMA_REQ_NONE; + chan->status = EDMA_ST_IDLE; + + chan->ll_off = (ll_chunk * i); + chan->ll_max = (ll_chunk / EDMA_LL_SZ) - 1; + + chan->dt_off = (dt_chunk * i); + + dev_vdbg(dev, "L. List:\tChannel %s[%u] off=0x%.8lx, max_cnt=%u\n", + write ? 
"write" : "read", j, + chan->ll_off, chan->ll_max); + + if (dw->nr_irqs == 1) + pos = 0; + else + pos = off_alloc + (j % alloc); + + irq = &dw->irq[pos]; + + if (write) + irq->wr_mask |= BIT(j); + else + irq->rd_mask |= BIT(j); + + irq->dw = dw; + memcpy(&chan->msi, &irq->msi, sizeof(chan->msi)); + + dev_vdbg(dev, "MSI:\t\tChannel %s[%u] addr=0x%.8x%.8x, data=0x%.8x\n", + write ? "write" : "read", j, + chan->msi.address_hi, chan->msi.address_lo, + chan->msi.data); + + chan->vc.desc_free = vchan_free_desc; + vchan_init(&chan->vc, dma); + + dt_region->paddr = dw->dt_region.paddr + chan->dt_off; + dt_region->vaddr = dw->dt_region.vaddr + chan->dt_off; + dt_region->sz = dt_chunk; + + dev_vdbg(dev, "Data:\tChannel %s[%u] off=0x%.8lx\n", + write ? "write" : "read", j, chan->dt_off); + + dw_edma_v0_core_device_config(chan); + } + + /* Set DMA channel capabilities */ + dma_cap_zero(dma->cap_mask); + dma_cap_set(DMA_SLAVE, dma->cap_mask); + dma_cap_set(DMA_CYCLIC, dma->cap_mask); + dma_cap_set(DMA_PRIVATE, dma->cap_mask); + dma->directions = BIT(write ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV); + dma->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); + dma->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); + dma->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; + dma->chancnt = cnt; + + /* Set DMA channel callbacks */ + dma->dev = chip->dev; + dma->device_alloc_chan_resources = dw_edma_alloc_chan_resources; + dma->device_free_chan_resources = dw_edma_free_chan_resources; + dma->device_config = dw_edma_device_config; + dma->device_pause = dw_edma_device_pause; + dma->device_resume = dw_edma_device_resume; + dma->device_terminate_all = dw_edma_device_terminate_all; + dma->device_issue_pending = dw_edma_device_issue_pending; + dma->device_tx_status = dw_edma_device_tx_status; + dma->device_prep_slave_sg = dw_edma_device_prep_slave_sg; + dma->device_prep_dma_cyclic = dw_edma_device_prep_dma_cyclic; + + dma_set_max_seg_size(dma->dev, U32_MAX); + + /* Register DMA device */ + err = dma_async_device_register(dma); + + return err; +} + +static inline void dw_edma_dec_irq_alloc(int *nr_irqs, u32 *alloc, u16 cnt) +{ + if (*nr_irqs && *alloc < cnt) { + (*alloc)++; + (*nr_irqs)--; + } +} + +static inline void dw_edma_add_irq_mask(u32 *mask, u32 alloc, u16 cnt) +{ + while (*mask * alloc < cnt) + (*mask)++; +} + +static int dw_edma_irq_request(struct dw_edma_chip *chip, + u32 *wr_alloc, u32 *rd_alloc) +{ + struct device *dev = chip->dev; + struct dw_edma *dw = chip->dw; + u32 wr_mask = 1; + u32 rd_mask = 1; + int i, err = 0; + u32 ch_cnt; + + ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt; + + if (dw->nr_irqs < 1) + return -EINVAL; + + if (dw->nr_irqs == 1) { + /* Common IRQ shared among all channels */ + err = request_irq(pci_irq_vector(to_pci_dev(dev), 0), + dw_edma_interrupt_common, + IRQF_SHARED, dw->name, &dw->irq[0]); + if (err) { + dw->nr_irqs = 0; + return err; + } + + get_cached_msi_msg(pci_irq_vector(to_pci_dev(dev), 0), + &dw->irq[0].msi); + } else { + /* Distribute IRQs equally among all channels */ + int tmp = dw->nr_irqs; + + while (tmp && (*wr_alloc + *rd_alloc) < ch_cnt) { + dw_edma_dec_irq_alloc(&tmp, wr_alloc, dw->wr_ch_cnt); + dw_edma_dec_irq_alloc(&tmp, rd_alloc, dw->rd_ch_cnt); + } + + dw_edma_add_irq_mask(&wr_mask, *wr_alloc, dw->wr_ch_cnt); + dw_edma_add_irq_mask(&rd_mask, *rd_alloc, dw->rd_ch_cnt); + + for (i = 0; i < (*wr_alloc + *rd_alloc); i++) { + err = request_irq(pci_irq_vector(to_pci_dev(dev), i), + i < *wr_alloc ? 
+ dw_edma_interrupt_write : + dw_edma_interrupt_read, + IRQF_SHARED, dw->name, + &dw->irq[i]); + if (err) { + dw->nr_irqs = i; + return err; + } + + get_cached_msi_msg(pci_irq_vector(to_pci_dev(dev), i), + &dw->irq[i].msi); + } + + dw->nr_irqs = i; + } + + return err; +} + +int dw_edma_probe(struct dw_edma_chip *chip) +{ + struct device *dev = chip->dev; + struct dw_edma *dw = chip->dw; + u32 wr_alloc = 0; + u32 rd_alloc = 0; + int i, err; + + raw_spin_lock_init(&dw->lock); + + /* Find out how many write channels are supported by hardware */ + dw->wr_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE); + if (!dw->wr_ch_cnt) + return -EINVAL; + + /* Find out how many read channels are supported by hardware */ + dw->rd_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ); + if (!dw->rd_ch_cnt) + return -EINVAL; + + dev_vdbg(dev, "Channels:\twrite=%d, read=%d\n", + dw->wr_ch_cnt, dw->rd_ch_cnt); + + /* Allocate channels */ + dw->chan = devm_kcalloc(dev, dw->wr_ch_cnt + dw->rd_ch_cnt, + sizeof(*dw->chan), GFP_KERNEL); + if (!dw->chan) + return -ENOMEM; + + snprintf(dw->name, sizeof(dw->name), "dw-edma-core:%d", chip->id); + + /* Disable eDMA, only to establish the ideal initial conditions */ + dw_edma_v0_core_off(dw); + + /* Request IRQs */ + err = dw_edma_irq_request(chip, &wr_alloc, &rd_alloc); + if (err) + return err; + + /* Setup write channels */ + err = dw_edma_channel_setup(chip, true, wr_alloc, rd_alloc); + if (err) + goto err_irq_free; + + /* Setup read channels */ + err = dw_edma_channel_setup(chip, false, wr_alloc, rd_alloc); + if (err) + goto err_irq_free; + + /* Power management */ + pm_runtime_enable(dev); + + /* Turn debugfs on */ + dw_edma_v0_core_debugfs_on(chip); + + return 0; + +err_irq_free: + for (i = (dw->nr_irqs - 1); i >= 0; i--) + free_irq(pci_irq_vector(to_pci_dev(dev), i), &dw->irq[i]); + + dw->nr_irqs = 0; + + return err; +} +EXPORT_SYMBOL_GPL(dw_edma_probe); + +int dw_edma_remove(struct dw_edma_chip *chip) +{ + struct dw_edma_chan *chan, *_chan; + struct device *dev = chip->dev; + struct dw_edma *dw = chip->dw; + int i; + + /* Disable eDMA */ + dw_edma_v0_core_off(dw); + + /* Free irqs */ + for (i = (dw->nr_irqs - 1); i >= 0; i--) + free_irq(pci_irq_vector(to_pci_dev(dev), i), &dw->irq[i]); + + /* Power management */ + pm_runtime_disable(dev); + + list_for_each_entry_safe(chan, _chan, &dw->wr_edma.channels, + vc.chan.device_node) { + list_del(&chan->vc.chan.device_node); + tasklet_kill(&chan->vc.task); + } + + list_for_each_entry_safe(chan, _chan, &dw->rd_edma.channels, + vc.chan.device_node) { + list_del(&chan->vc.chan.device_node); + tasklet_kill(&chan->vc.task); + } + + /* Deregister eDMA device */ + dma_async_device_unregister(&dw->wr_edma); + dma_async_device_unregister(&dw->rd_edma); + + /* Turn debugfs off */ + dw_edma_v0_core_debugfs_off(); + + return 0; +} +EXPORT_SYMBOL_GPL(dw_edma_remove); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Synopsys DesignWare eDMA controller core driver"); +MODULE_AUTHOR("Gustavo Pimentel "); diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h new file mode 100644 index 000000000000..b6cc90cbc9dc --- /dev/null +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates. 
+ * Synopsys DesignWare eDMA core driver + * + * Author: Gustavo Pimentel + */ + +#ifndef _DW_EDMA_CORE_H +#define _DW_EDMA_CORE_H + +#include +#include + +#include "../virt-dma.h" + +#define EDMA_LL_SZ 24 + +enum dw_edma_dir { + EDMA_DIR_WRITE = 0, + EDMA_DIR_READ +}; + +enum dw_edma_mode { + EDMA_MODE_LEGACY = 0, + EDMA_MODE_UNROLL +}; + +enum dw_edma_request { + EDMA_REQ_NONE = 0, + EDMA_REQ_STOP, + EDMA_REQ_PAUSE +}; + +enum dw_edma_status { + EDMA_ST_IDLE = 0, + EDMA_ST_PAUSE, + EDMA_ST_BUSY +}; + +struct dw_edma_chan; +struct dw_edma_chunk; + +struct dw_edma_burst { + struct list_head list; + u64 sar; + u64 dar; + u32 sz; +}; + +struct dw_edma_region { + phys_addr_t paddr; + dma_addr_t vaddr; + size_t sz; +}; + +struct dw_edma_chunk { + struct list_head list; + struct dw_edma_chan *chan; + struct dw_edma_burst *burst; + + u32 bursts_alloc; + + u8 cb; + struct dw_edma_region ll_region; /* Linked list */ +}; + +struct dw_edma_desc { + struct virt_dma_desc vd; + struct dw_edma_chan *chan; + struct dw_edma_chunk *chunk; + + u32 chunks_alloc; + + u32 alloc_sz; + u32 xfer_sz; +}; + +struct dw_edma_chan { + struct virt_dma_chan vc; + struct dw_edma_chip *chip; + int id; + enum dw_edma_dir dir; + + off_t ll_off; + u32 ll_max; + + off_t dt_off; + + struct msi_msg msi; + + enum dw_edma_request request; + enum dw_edma_status status; + u8 configured; + + struct dma_slave_config config; +}; + +struct dw_edma_irq { + struct msi_msg msi; + u32 wr_mask; + u32 rd_mask; + struct dw_edma *dw; +}; + +struct dw_edma { + char name[20]; + + struct dma_device wr_edma; + u16 wr_ch_cnt; + + struct dma_device rd_edma; + u16 rd_ch_cnt; + + struct dw_edma_region rg_region; /* Registers */ + struct dw_edma_region ll_region; /* Linked list */ + struct dw_edma_region dt_region; /* Data */ + + struct dw_edma_irq *irq; + int nr_irqs; + + u32 version; + enum dw_edma_mode mode; + + struct dw_edma_chan *chan; + const struct dw_edma_core_ops *ops; + + raw_spinlock_t lock; /* Only for legacy */ +}; + +struct dw_edma_sg { + struct scatterlist *sgl; + unsigned int len; +}; + +struct dw_edma_cyclic { + dma_addr_t paddr; + size_t len; + size_t cnt; +}; + +struct dw_edma_transfer { + struct dma_chan *dchan; + union dw_edma_xfer { + struct dw_edma_sg sg; + struct dw_edma_cyclic cyclic; + } xfer; + enum dma_transfer_direction direction; + unsigned long flags; + bool cyclic; +}; + +static inline +struct dw_edma_chan *vc2dw_edma_chan(struct virt_dma_chan *vc) +{ + return container_of(vc, struct dw_edma_chan, vc); +} + +static inline +struct dw_edma_chan *dchan2dw_edma_chan(struct dma_chan *dchan) +{ + return vc2dw_edma_chan(to_virt_chan(dchan)); +} + +#endif /* _DW_EDMA_CORE_H */ diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h new file mode 100644 index 000000000000..cab6e18773da --- /dev/null +++ b/include/linux/dma/edma.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates. 
+ * Synopsys DesignWare eDMA core driver
+ *
+ * Author: Gustavo Pimentel
+ */
+
+#ifndef _DW_EDMA_H
+#define _DW_EDMA_H
+
+#include
+#include
+
+struct dw_edma;
+
+/**
+ * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
+ * @dev:	struct device of the eDMA controller
+ * @id:		instance ID
+ * @irq:	irq line
+ * @dw:		struct dw_edma that is filed by dw_edma_probe()
+ */
+struct dw_edma_chip {
+	struct device		*dev;
+	int			id;
+	int			irq;
+	struct dw_edma		*dw;
+};
+
+/* Export to the platform drivers */
+#if IS_ENABLED(CONFIG_DW_EDMA)
+int dw_edma_probe(struct dw_edma_chip *chip);
+int dw_edma_remove(struct dw_edma_chip *chip);
+#else
+static inline int dw_edma_probe(struct dw_edma_chip *chip)
+{
+	return -ENODEV;
+}
+
+static inline int dw_edma_remove(struct dw_edma_chip *chip)
+{
+	return 0;
+}
+#endif /* CONFIG_DW_EDMA */
+
+#endif /* _DW_EDMA_H */
-- cgit v1.2.3-59-g8ed1b


From 1f418f46503d72594bbe6407d97fd2ae1ce15ee6 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel
Date: Tue, 4 Jun 2019 15:29:25 +0200
Subject: PCI: Add Synopsys endpoint EDDA Device ID

Create and add the Synopsys Endpoint EDDA Device ID to the PCI ID list,
since this ID is now being used by two different drivers
(pci_endpoint_test.ko and dw-edma-pcie.ko).

Signed-off-by: Gustavo Pimentel
Acked-by: Bjorn Helgaas
Cc: Kishon Vijay Abraham I
Cc: Bjorn Helgaas
Cc: Lorenzo Pieralisi
Cc: Joao Pinto
Signed-off-by: Vinod Koul
---
 drivers/misc/pci_endpoint_test.c | 2 +-
 include/linux/pci_ids.h          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 7b015f2a1c6f..1f531c1b4f74 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -804,7 +804,7 @@ static const struct pci_device_id pci_endpoint_test_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA74x) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA72x) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x81c0) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_SYNOPSYS, 0xedda) },
+	{ PCI_DEVICE_DATA(SYNOPSYS, EDDA, NULL) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_AM654),
 	  .driver_data = (kernel_ulong_t)&am654_data
 	},
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 70e86148cb1e..4aad69fc4d6b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2366,6 +2366,7 @@
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB3		0xabcd
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB3_AXI	0xabce
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB31	0xabcf
+#define PCI_DEVICE_ID_SYNOPSYS_EDDA		0xedda
 
 #define PCI_VENDOR_ID_USR		0x16ec
-- cgit v1.2.3-59-g8ed1b


From 4e23be473e3063a9d3bc06bb0aee89885fffab0e Mon Sep 17 00:00:00 2001
From: Tony Lindgren
Date: Mon, 10 Jun 2019 04:48:05 -0700
Subject: bus: ti-sysc: Add support for module specific reset quirks

Some older interconnect target modules need module-internal clock
toggling quirks to reset properly. We've been doing this in the
platform code earlier, but we need to be able to do it directly in the
ti-sysc driver once we no longer rely on the platform code.

Let's add reset handling for 1-wire, i2c and watchdog. Later on we can
add more modules like msdi and dss as they get tested. For dra7 pcie,
we should be able to just use the rstctrl reset driver when it becomes
available.
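As background for the quirk table entries added below: a module-specific
quirk is keyed off the module's revision register, with both the
detected revision and the table value compared under the entry's mask.
A standalone illustration (not driver code; the revision values are
taken from the i2c entries in the patch):

#include <stdio.h>
#include <stdint.h>

/* Same idea as matching sysc_revision_quirks[] entries: compare the
 * detected revision against a table value under the entry's mask. */
static int quirk_matches(uint32_t detected, uint32_t value, uint32_t mask)
{
	return (detected & mask) == (value & mask);
}

int main(void)
{
	/* omap4-style I2C entry: value 0x5040000a, mask 0xfffff0f0 */
	printf("%d\n", quirk_matches(0x5040000a, 0x5040000a, 0xfffff0f0)); /* 1 */
	printf("%d\n", quirk_matches(0x50400f0f, 0x5040000a, 0xfffff0f0)); /* 1: differs only in masked-out bits */
	printf("%d\n", quirk_matches(0x50430000, 0x5040000a, 0xfffff0f0)); /* 0 */
	return 0;
}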
Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 129 ++++++++++++++++++++++++++++++++-- include/linux/platform_data/ti-sysc.h | 3 + 2 files changed, 127 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index a366ae548ec9..e6deabd8305d 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -71,6 +71,9 @@ static const char * const clock_names[SYSC_MAX_CLOCKS] = { * @name: name if available * @revision: interconnect target module revision * @needs_resume: runtime resume needed on resume from suspend + * @clk_enable_quirk: module specific clock enable quirk + * @clk_disable_quirk: module specific clock disable quirk + * @reset_done_quirk: module specific reset done quirk */ struct sysc { struct device *dev; @@ -94,6 +97,9 @@ struct sysc { unsigned int child_needs_resume:1; unsigned int disable_on_idle:1; struct delayed_work idle_work; + void (*clk_enable_quirk)(struct sysc *sysc); + void (*clk_disable_quirk)(struct sysc *sysc); + void (*reset_done_quirk)(struct sysc *sysc); }; static void sysc_parse_dts_quirks(struct sysc *ddata, struct device_node *np, @@ -760,8 +766,11 @@ static int sysc_ioremap(struct sysc *ddata) ddata->offsets[SYSC_SYSCONFIG], ddata->offsets[SYSC_SYSSTATUS]); + if (size < SZ_1K) + size = SZ_1K; + if ((size + sizeof(u32)) > ddata->module_size) - return -EINVAL; + size = ddata->module_size; } ddata->module_va = devm_ioremap(ddata->dev, @@ -1234,6 +1243,22 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK_EXT_OPT_CLOCK | SYSC_QUIRK_NO_RESET_ON_INIT | SYSC_QUIRK_SWSUP_SIDLE), + /* Quirks that need to be set based on detected module */ + SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, + SYSC_MODULE_QUIRK_HDQ1W), + SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff, + SYSC_MODULE_QUIRK_HDQ1W), + SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000036, 0x000000ff, + SYSC_MODULE_QUIRK_I2C), + SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x0000003c, 0x000000ff, + SYSC_MODULE_QUIRK_I2C), + SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000040, 0x000000ff, + SYSC_MODULE_QUIRK_I2C), + SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0, + SYSC_MODULE_QUIRK_I2C), + SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0, + SYSC_MODULE_QUIRK_WDT), + #ifdef DEBUG SYSC_QUIRK("adc", 0, 0, 0x10, -1, 0x47300001, 0xffffffff, 0), SYSC_QUIRK("atl", 0, 0, -1, -1, 0x0a070100, 0xffffffff, 0), @@ -1247,11 +1272,8 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK("dwc3", 0, 0, 0x10, -1, 0x500a0200, 0xffffffff, 0), SYSC_QUIRK("epwmss", 0, 0, 0x4, -1, 0x47400001, 0xffffffff, 0), SYSC_QUIRK("gpu", 0, 0x1fc00, 0x1fc10, -1, 0, 0, 0), - SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, 0), - SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff, 0), SYSC_QUIRK("hsi", 0, 0, 0x10, 0x14, 0x50043101, 0xffffffff, 0), SYSC_QUIRK("iss", 0, 0, 0x10, -1, 0x40000101, 0xffffffff, 0), - SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0, 0), SYSC_QUIRK("lcdc", 0, 0, 0x54, -1, 0x4f201000, 0xffffffff, 0), SYSC_QUIRK("mcasp", 0, 0, 0x4, -1, 0x44306302, 0xffffffff, 0), SYSC_QUIRK("mcasp", 0, 0, 0x4, -1, 0x44307b02, 0xffffffff, 0), @@ -1287,7 +1309,6 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK("usb_host_hs", 0, 0, 0x10, -1, 0x50700101, 0xffffffff, 0), SYSC_QUIRK("usb_otg_hs", 0, 0x400, 0x404, 0x408, 0x00000050, 0xffffffff, 0), - SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0, 0), 
SYSC_QUIRK("vfpe", 0, 0, 0x104, -1, 0x4d001200, 0xffffffff, 0), #endif }; @@ -1360,6 +1381,94 @@ static void sysc_init_revision_quirks(struct sysc *ddata) } } +/* 1-wire needs module's internal clocks enabled for reset */ +static void sysc_clk_enable_quirk_hdq1w(struct sysc *ddata) +{ + int offset = 0x0c; /* HDQ_CTRL_STATUS */ + u16 val; + + val = sysc_read(ddata, offset); + val |= BIT(5); + sysc_write(ddata, offset, val); +} + +/* I2C needs extra enable bit toggling for reset */ +static void sysc_clk_quirk_i2c(struct sysc *ddata, bool enable) +{ + int offset; + u16 val; + + /* I2C_CON, omap2/3 is different from omap4 and later */ + if ((ddata->revision & 0xffffff00) == 0x001f0000) + offset = 0x24; + else + offset = 0xa4; + + /* I2C_EN */ + val = sysc_read(ddata, offset); + if (enable) + val |= BIT(15); + else + val &= ~BIT(15); + sysc_write(ddata, offset, val); +} + +static void sysc_clk_enable_quirk_i2c(struct sysc *ddata) +{ + sysc_clk_quirk_i2c(ddata, true); +} + +static void sysc_clk_disable_quirk_i2c(struct sysc *ddata) +{ + sysc_clk_quirk_i2c(ddata, false); +} + +/* Watchdog timer needs a disable sequence after reset */ +static void sysc_reset_done_quirk_wdt(struct sysc *ddata) +{ + int wps, spr, error; + u32 val; + + wps = 0x34; + spr = 0x48; + + sysc_write(ddata, spr, 0xaaaa); + error = readl_poll_timeout(ddata->module_va + wps, val, + !(val & 0x10), 100, + MAX_MODULE_SOFTRESET_WAIT); + if (error) + dev_warn(ddata->dev, "wdt disable spr failed\n"); + + sysc_write(ddata, wps, 0x5555); + error = readl_poll_timeout(ddata->module_va + wps, val, + !(val & 0x10), 100, + MAX_MODULE_SOFTRESET_WAIT); + if (error) + dev_warn(ddata->dev, "wdt disable wps failed\n"); +} + +static void sysc_init_module_quirks(struct sysc *ddata) +{ + if (ddata->legacy_mode || !ddata->name) + return; + + if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_HDQ1W) { + ddata->clk_enable_quirk = sysc_clk_enable_quirk_hdq1w; + + return; + } + + if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_I2C) { + ddata->clk_enable_quirk = sysc_clk_enable_quirk_i2c; + ddata->clk_disable_quirk = sysc_clk_disable_quirk_i2c; + + return; + } + + if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_WDT) + ddata->reset_done_quirk = sysc_reset_done_quirk_wdt; +} + static int sysc_clockdomain_init(struct sysc *ddata) { struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev); @@ -1468,10 +1577,16 @@ static int sysc_reset(struct sysc *ddata) else syss_done = ddata->cfg.syss_mask; + if (ddata->clk_disable_quirk) + ddata->clk_disable_quirk(ddata); + sysc_val = sysc_read_sysconfig(ddata); sysc_val |= sysc_mask; sysc_write(ddata, sysc_offset, sysc_val); + if (ddata->clk_enable_quirk) + ddata->clk_enable_quirk(ddata); + /* Poll on reset status */ if (syss_offset >= 0) { error = readx_poll_timeout(sysc_read_sysstatus, ddata, rstval, @@ -1485,6 +1600,9 @@ static int sysc_reset(struct sysc *ddata) 100, MAX_MODULE_SOFTRESET_WAIT); } + if (ddata->reset_done_quirk) + ddata->reset_done_quirk(ddata); + return error; } @@ -1531,6 +1649,7 @@ static int sysc_init_module(struct sysc *ddata) ddata->revision = sysc_read_revision(ddata); sysc_init_revision_quirks(ddata); + sysc_init_module_quirks(ddata); if (ddata->legacy_mode) { error = sysc_legacy_init(ddata); diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 8822e99ff813..0c587d4fc718 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -47,6 +47,9 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_MODULE_QUIRK_HDQ1W 
BIT(17)
+#define SYSC_MODULE_QUIRK_I2C		BIT(16)
+#define SYSC_MODULE_QUIRK_WDT		BIT(15)
 #define SYSS_QUIRK_RESETDONE_INVERTED	BIT(14)
 #define SYSC_QUIRK_SWSUP_MSTANDBY	BIT(13)
 #define SYSC_QUIRK_SWSUP_SIDLE_ACT	BIT(12)
-- cgit v1.2.3-59-g8ed1b


From e36acfe6c86d13eec62321e1e86a1ce287e52e7d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Thu, 23 May 2019 09:41:19 -0300
Subject: mm/hmm: Use hmm_mirror not mm as an argument for hmm_range_register

Ralph observes that hmm_range_register() can only be called by a driver
while a mirror is registered. Make this clear in the API by passing in
the mirror structure as a parameter.

This also simplifies understanding the lifetime model for struct hmm,
as the hmm pointer must be valid as part of a registered mirror, so all
we need in hmm_range_register() is a simple kref_get.

Suggested-by: Ralph Campbell
Signed-off-by: Jason Gunthorpe
Reviewed-by: John Hubbard
Reviewed-by: Ralph Campbell
Reviewed-by: Ira Weiny
Reviewed-by: Christoph Hellwig
Tested-by: Philip Yang
---
 drivers/gpu/drm/nouveau/nouveau_svm.c |  2 +-
 include/linux/hmm.h                   |  7 ++++---
 mm/hmm.c                              | 13 ++++---------
 3 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 93ed43c413f0..8c92374afcf2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
 		range.values = nouveau_svm_pfn_values;
 		range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
 again:
-		ret = hmm_vma_fault(&range, true);
+		ret = hmm_vma_fault(&svmm->mirror, &range, true);
 		if (ret == 0) {
 			mutex_lock(&svmm->mutex);
 			if (!hmm_vma_range_done(&range)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index cb01cf1fa3c0..1fba6979adf4 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -496,7 +496,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
 * Please see Documentation/vm/hmm.rst for how to use the range API.
 */
 int hmm_range_register(struct hmm_range *range,
-		       struct mm_struct *mm,
+		       struct hmm_mirror *mirror,
 		       unsigned long start,
 		       unsigned long end,
 		       unsigned page_shift);
@@ -532,7 +532,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range)
 }
 
 /* This is a temporary helper to avoid merge conflict between trees.
*/ -static inline int hmm_vma_fault(struct hmm_range *range, bool block) +static inline int hmm_vma_fault(struct hmm_mirror *mirror, + struct hmm_range *range, bool block) { long ret; @@ -545,7 +546,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->default_flags = 0; range->pfn_flags_mask = -1UL; - ret = hmm_range_register(range, range->vma->vm_mm, + ret = hmm_range_register(range, mirror, range->start, range->end, PAGE_SHIFT); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index f6956d78e3cb..22a97ada108b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -914,13 +914,13 @@ static void hmm_pfns_clear(struct hmm_range *range, * Track updates to the CPU page table see include/linux/hmm.h */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); - struct hmm *hmm; + struct hmm *hmm = mirror->hmm; range->valid = false; range->hmm = NULL; @@ -934,20 +934,15 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - hmm = hmm_get_or_create(mm); - if (!hmm) - return -EFAULT; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) { - hmm_put(hmm); + if (hmm->mm == NULL || hmm->dead) return -EFAULT; - } /* Initialize range to track CPU page table updates. */ mutex_lock(&hmm->lock); range->hmm = hmm; + kref_get(&hmm->kref); list_add_rcu(&range->list, &hmm->ranges); /* -- cgit v1.2.3-59-g8ed1b From c8a53b2db0aec40d8b217936e1b7f3d840c50390 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 May 2019 10:36:46 -0300 Subject: mm/hmm: Hold a mmgrab from hmm to mm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So long as a struct hmm pointer exists, so should the struct mm it is linked too. Hold the mmgrab() as soon as a hmm is created, and mmdrop() it once the hmm refcount goes to zero. Since mmdrop() (ie a 0 kref on struct mm) is now impossible with a !NULL mm->hmm delete the hmm_hmm_destroy(). Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse Reviewed-by: John Hubbard Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 3 --- kernel/fork.c | 1 - mm/hmm.c | 22 ++++------------------ 3 files changed, 4 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 1fba6979adf4..1d97b6d62c5b 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -577,14 +577,11 @@ static inline int hmm_vma_fault(struct hmm_mirror *mirror, } /* Below are for HMM internal use only! Not to be used by device driver! 
*/ -void hmm_mm_destroy(struct mm_struct *mm); - static inline void hmm_mm_init(struct mm_struct *mm) { mm->hmm = NULL; } #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..c704c3cedee7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -673,7 +673,6 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); diff --git a/mm/hmm.c b/mm/hmm.c index 22a97ada108b..080b17a2e87e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) hmm->notifiers = 0; hmm->dead = false; hmm->mm = mm; + mmgrab(hmm->mm); spin_lock(&mm->page_table_lock); if (!mm->hmm) @@ -100,6 +102,7 @@ error_mm: mm->hmm = NULL; spin_unlock(&mm->page_table_lock); error: + mmdrop(hmm->mm); kfree(hmm); return NULL; } @@ -121,6 +124,7 @@ static void hmm_free(struct kref *kref) mm->hmm = NULL; spin_unlock(&mm->page_table_lock); + mmdrop(hmm->mm); mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); } @@ -129,24 +133,6 @@ static inline void hmm_put(struct hmm *hmm) kref_put(&hmm->kref, hmm_free); } -void hmm_mm_destroy(struct mm_struct *mm) -{ - struct hmm *hmm; - - spin_lock(&mm->page_table_lock); - hmm = mm_get_hmm(mm); - mm->hmm = NULL; - if (hmm) { - hmm->mm = NULL; - hmm->dead = true; - spin_unlock(&mm->page_table_lock); - hmm_put(hmm); - return; - } - - spin_unlock(&mm->page_table_lock); -} - static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); -- cgit v1.2.3-59-g8ed1b From 78b99577b3934e3e787fe0c52aa1b59442c8bbb5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Jun 2019 00:09:53 +0900 Subject: pinctrl: remove unused pin_is_valid() This function was used by pin_request() to pointlessly double-check the pin validity, and it was the only user ever. Since commit d2f6a1c6fb0e ("pinctrl: remove double pin validity check."), no one has ever used it. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- drivers/pinctrl/core.c | 23 ----------------------- include/linux/pinctrl/pinctrl.h | 10 ---------- 2 files changed, 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 04787eefe2a2..e745788fa36f 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -178,29 +178,6 @@ const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned pin) return desc->name; } -/** - * pin_is_valid() - check if pin exists on controller - * @pctldev: the pin control device to check the pin on - * @pin: pin to check, use the local pin controller index number - * - * This tells us whether a certain pin exist on a certain pin controller or - * not. Pin lists may be sparse, so some pins may not exist. 
- */
-bool pin_is_valid(struct pinctrl_dev *pctldev, int pin)
-{
-	struct pin_desc *pindesc;
-
-	if (pin < 0)
-		return false;
-
-	mutex_lock(&pctldev->mutex);
-	pindesc = pin_desc_get(pctldev, pin);
-	mutex_unlock(&pctldev->mutex);
-
-	return pindesc != NULL;
-}
-EXPORT_SYMBOL_GPL(pin_is_valid);
-
 /* Deletes a range of pin descriptors */
 static void pinctrl_free_pindescs(struct pinctrl_dev *pctldev,
				   const struct pinctrl_pin_desc *pins,
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 2744113f1024..36a79fe7b84f 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -172,7 +172,6 @@ extern struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
 extern void devm_pinctrl_unregister(struct device *dev,
				    struct pinctrl_dev *pctldev);
 
-extern bool pin_is_valid(struct pinctrl_dev *pctldev, int pin);
 extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev,
				   struct pinctrl_gpio_range *range);
 extern void pinctrl_add_gpio_ranges(struct pinctrl_dev *pctldev,
@@ -203,15 +202,6 @@ struct pinctrl_dev *of_pinctrl_get(struct device_node *np)
 extern const char *pinctrl_dev_get_name(struct pinctrl_dev *pctldev);
 extern const char *pinctrl_dev_get_devname(struct pinctrl_dev *pctldev);
 extern void *pinctrl_dev_get_drvdata(struct pinctrl_dev *pctldev);
-#else
-
-struct pinctrl_dev;
-
-/* Sufficiently stupid default functions when pinctrl is not in use */
-static inline bool pin_is_valid(struct pinctrl_dev *pctldev, int pin)
-{
-	return pin >= 0;
-}
 #endif /* !CONFIG_PINCTRL */
-- cgit v1.2.3-59-g8ed1b


From 0b673b6486998061b0489b09447ebe8452da0146 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Wed, 8 May 2019 11:46:34 -0700
Subject: firmware: arm_scmi: fetch and store sensor scale

In preparation for dealing with scales within the SCMI HWMON driver,
fetch and store the sensor unit scale into the scmi_sensor_info
structure. In order to simplify computations for the upper layer, take
care of sign-extending the scale to a full 8-bit signed value.
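The scale field is only 5 bits wide in the message, so negative scales
need the upper bits filled in before they can be used as an s8. A
standalone sketch of the sign extension performed below (illustration
only; the example input values are assumptions, not from the SCMI
specification):

#include <stdio.h>
#include <stdint.h>

#define SENSOR_SCALE_SIGN	(1u << 4)	/* BIT(4) */
#define SENSOR_SCALE_EXTEND	(0x7u << 5)	/* GENMASK(7, 5) */

/* Extend a 5-bit two's-complement field to a full signed 8-bit value,
 * mirroring the logic added to scmi_sensor_description_get() below. */
static int8_t scale_to_s8(uint8_t scale5)
{
	if (scale5 & SENSOR_SCALE_SIGN)
		scale5 |= SENSOR_SCALE_EXTEND;
	return (int8_t)scale5;
}

int main(void)
{
	printf("%d\n", scale_to_s8(0x1d));	/* 0b11101 -> -3 */
	printf("%d\n", scale_to_s8(0x02));	/* -> 2 */
	return 0;
}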
Reviewed-by: Guenter Roeck
Signed-off-by: Florian Fainelli
[sudeep.holla: update bitfield values as per specification]
Signed-off-by: Sudeep Holla
---
 drivers/firmware/arm_scmi/sensors.c | 6 ++++++
 include/linux/scmi_protocol.h       | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index c00287b5f2c2..0e94ab56f679 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -34,6 +34,8 @@ struct scmi_msg_resp_sensor_description {
 		__le32 attributes_high;
 #define SENSOR_TYPE(x)		((x) & 0xff)
 #define SENSOR_SCALE(x)		(((x) >> 11) & 0x1f)
+#define SENSOR_SCALE_SIGN	BIT(4)
+#define SENSOR_SCALE_EXTEND	GENMASK(7, 5)
 #define SENSOR_UPDATE_SCALE(x)	(((x) >> 22) & 0x1f)
 #define SENSOR_UPDATE_BASE(x)	(((x) >> 27) & 0x1f)
 		u8 name[SCMI_MAX_STR_SIZE];
@@ -140,6 +142,10 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 			s = &si->sensors[desc_index + cnt];
 			s->id = le32_to_cpu(buf->desc[cnt].id);
 			s->type = SENSOR_TYPE(attrh);
+			s->scale = SENSOR_SCALE(attrh);
+			/* Sign extend to a full s8 */
+			if (s->scale & SENSOR_SCALE_SIGN)
+				s->scale |= SENSOR_SCALE_EXTEND;
 			strlcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE);
 		}
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 3105055c00a7..9ff2e9357e9a 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -144,6 +144,7 @@ struct scmi_power_ops {
 struct scmi_sensor_info {
 	u32 id;
 	u8 type;
+	s8 scale;
 	char name[SCMI_MAX_STR_SIZE];
 };
-- cgit v1.2.3-59-g8ed1b


From 81f4458c9c6998fcd37c427d16d76d4dba65d015 Mon Sep 17 00:00:00 2001
From: Tero Kristo
Date: Tue, 28 May 2019 16:10:24 +0300
Subject: firmware: ti_sci: extend clock identifiers from u8 to u32

Future SoCs are going to have more than 255 device clocks in certain
cases, so the API must be extended to support this. The support is done
as a backwards-compatible extension: the new u32 clock identifier
fields are only used if the existing u8 clock identifier is set to 255;
in all other cases the existing u8 clock identifier is used. As the
size of the messages sent / received is not verified for existing
devices / old firmware, growing the messages at the end is also fine,
and for the same reason there is no need to depend on the ABI version.

Acked-by: Santosh Shilimkar
Signed-off-by: Tero Kristo
---
 drivers/firmware/ti_sci.c              | 115 ++++++++++++++++++++---------
 drivers/firmware/ti_sci.h              |  63 ++++++++++----
 include/linux/soc/ti/ti_sci_protocol.h |  28 ++++----
 3 files changed, 150 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index ef93406ace1b..b417cef35769 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -916,7 +916,7 @@ static int ti_sci_cmd_get_device_resets(const struct ti_sci_handle *handle,
 * Return: 0 if all went well, else returns appropriate error value.
*/ static int ti_sci_set_clock_state(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, + u32 dev_id, u32 clk_id, u32 flags, u8 state) { struct ti_sci_info *info; @@ -944,7 +944,12 @@ static int ti_sci_set_clock_state(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_set_clock_state *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } req->request_state = state; ret = ti_sci_do_xfer(info, xfer); @@ -976,7 +981,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_get_clock_state(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, + u32 dev_id, u32 clk_id, u8 *programmed_state, u8 *current_state) { struct ti_sci_info *info; @@ -1007,7 +1012,12 @@ static int ti_sci_cmd_get_clock_state(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_get_clock_state *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { @@ -1047,8 +1057,8 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_get_clock(const struct ti_sci_handle *handle, u32 dev_id, - u8 clk_id, bool needs_ssc, bool can_change_freq, - bool enable_input_term) + u32 clk_id, bool needs_ssc, + bool can_change_freq, bool enable_input_term) { u32 flags = 0; @@ -1073,7 +1083,7 @@ static int ti_sci_cmd_get_clock(const struct ti_sci_handle *handle, u32 dev_id, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_idle_clock(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id) + u32 dev_id, u32 clk_id) { return ti_sci_set_clock_state(handle, dev_id, clk_id, 0, MSG_CLOCK_SW_STATE_UNREQ); @@ -1092,7 +1102,7 @@ static int ti_sci_cmd_idle_clock(const struct ti_sci_handle *handle, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_put_clock(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id) + u32 dev_id, u32 clk_id) { return ti_sci_set_clock_state(handle, dev_id, clk_id, 0, MSG_CLOCK_SW_STATE_AUTO); @@ -1110,7 +1120,7 @@ static int ti_sci_cmd_put_clock(const struct ti_sci_handle *handle, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_is_auto(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, bool *req_state) + u32 dev_id, u32 clk_id, bool *req_state) { u8 state = 0; int ret; @@ -1139,7 +1149,7 @@ static int ti_sci_cmd_clk_is_auto(const struct ti_sci_handle *handle, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_is_on(const struct ti_sci_handle *handle, u32 dev_id, - u8 clk_id, bool *req_state, bool *curr_state) + u32 clk_id, bool *req_state, bool *curr_state) { u8 c_state = 0, r_state = 0; int ret; @@ -1172,7 +1182,7 @@ static int ti_sci_cmd_clk_is_on(const struct ti_sci_handle *handle, u32 dev_id, * Return: 0 if all went well, else returns appropriate error value. 
*/ static int ti_sci_cmd_clk_is_off(const struct ti_sci_handle *handle, u32 dev_id, - u8 clk_id, bool *req_state, bool *curr_state) + u32 clk_id, bool *req_state, bool *curr_state) { u8 c_state = 0, r_state = 0; int ret; @@ -1204,7 +1214,7 @@ static int ti_sci_cmd_clk_is_off(const struct ti_sci_handle *handle, u32 dev_id, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_set_parent(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u8 parent_id) + u32 dev_id, u32 clk_id, u32 parent_id) { struct ti_sci_info *info; struct ti_sci_msg_req_set_clock_parent *req; @@ -1231,8 +1241,18 @@ static int ti_sci_cmd_clk_set_parent(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_set_clock_parent *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; - req->parent_id = parent_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } + if (parent_id < 255) { + req->parent_id = parent_id; + } else { + req->parent_id = 255; + req->parent_id_32 = parent_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { @@ -1262,7 +1282,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u8 *parent_id) + u32 dev_id, u32 clk_id, u32 *parent_id) { struct ti_sci_info *info; struct ti_sci_msg_req_get_clock_parent *req; @@ -1289,7 +1309,12 @@ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_get_clock_parent *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { @@ -1299,10 +1324,14 @@ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle, resp = (struct ti_sci_msg_resp_get_clock_parent *)xfer->xfer_buf; - if (!ti_sci_is_response_ack(resp)) + if (!ti_sci_is_response_ack(resp)) { ret = -ENODEV; - else - *parent_id = resp->parent_id; + } else { + if (resp->parent_id < 255) + *parent_id = resp->parent_id; + else + *parent_id = resp->parent_id_32; + } fail: ti_sci_put_one_xfer(&info->minfo, xfer); @@ -1322,8 +1351,8 @@ fail: * Return: 0 if all went well, else returns appropriate error value. 
*/ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, - u8 *num_parents) + u32 dev_id, u32 clk_id, + u32 *num_parents) { struct ti_sci_info *info; struct ti_sci_msg_req_get_clock_num_parents *req; @@ -1350,7 +1379,12 @@ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_get_clock_num_parents *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { @@ -1360,10 +1394,14 @@ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle, resp = (struct ti_sci_msg_resp_get_clock_num_parents *)xfer->xfer_buf; - if (!ti_sci_is_response_ack(resp)) + if (!ti_sci_is_response_ack(resp)) { ret = -ENODEV; - else - *num_parents = resp->num_parents; + } else { + if (resp->num_parents < 255) + *num_parents = resp->num_parents; + else + *num_parents = resp->num_parents_32; + } fail: ti_sci_put_one_xfer(&info->minfo, xfer); @@ -1391,7 +1429,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_get_match_freq(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u64 min_freq, + u32 dev_id, u32 clk_id, u64 min_freq, u64 target_freq, u64 max_freq, u64 *match_freq) { @@ -1420,7 +1458,12 @@ static int ti_sci_cmd_clk_get_match_freq(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_query_clock_freq *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } req->min_freq_hz = min_freq; req->target_freq_hz = target_freq; req->max_freq_hz = max_freq; @@ -1463,7 +1506,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_set_freq(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u64 min_freq, + u32 dev_id, u32 clk_id, u64 min_freq, u64 target_freq, u64 max_freq) { struct ti_sci_info *info; @@ -1491,7 +1534,12 @@ static int ti_sci_cmd_clk_set_freq(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_set_clock_freq *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } req->min_freq_hz = min_freq; req->target_freq_hz = target_freq; req->max_freq_hz = max_freq; @@ -1524,7 +1572,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_get_freq(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u64 *freq) + u32 dev_id, u32 clk_id, u64 *freq) { struct ti_sci_info *info; struct ti_sci_msg_req_get_clock_freq *req; @@ -1551,7 +1599,12 @@ static int ti_sci_cmd_clk_get_freq(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_get_clock_freq *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 4983827151bf..ad0b47981b87 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -202,7 +202,8 @@ struct ti_sci_msg_req_set_device_resets { * @dev_id: Device identifier this request is for * @clk_id: Clock identifier for the device for this request. 
* Each device has its own set of clock inputs. This indexes - * which clock input to modify. + * which clock input to modify. Set to 255 if clock ID is + * greater than or equal to 255. * @request_state: Request the state for the clock to be set to. * MSG_CLOCK_SW_STATE_UNREQ: The IP does not require this clock, * it can be disabled, regardless of the state of the device @@ -213,6 +214,9 @@ struct ti_sci_msg_req_set_device_resets { * being required by the device.(default) * MSG_CLOCK_SW_STATE_REQ: Configure the clock to be enabled, * regardless of the state of the device. + * @clk_id_32: Clock identifier for the device for this request. + * Only to be used if the clock ID is greater than or equal to + * 255. * * Normally, all required clocks are managed by TISCI entity, this is used * only for specific control *IF* required. Auto managed state is @@ -234,6 +238,7 @@ struct ti_sci_msg_req_set_clock_state { #define MSG_CLOCK_SW_STATE_AUTO 1 #define MSG_CLOCK_SW_STATE_REQ 2 u8 request_state; + u32 clk_id_32; } __packed; /** @@ -242,7 +247,11 @@ struct ti_sci_msg_req_set_clock_state { * @dev_id: Device identifier this request is for * @clk_id: Clock identifier for the device for this request. * Each device has its own set of clock inputs. This indexes - * which clock input to get state of. + * which clock input to get state of. Set to 255 if the clock + * ID is greater than or equal to 255. + * @clk_id_32: Clock identifier for the device for this request. + * Only to be used if the clock ID is greater than or equal to + * 255. * * Request type is TI_SCI_MSG_GET_CLOCK_STATE, response is state * of the clock */ struct ti_sci_msg_req_get_clock_state { struct ti_sci_msg_hdr hdr; u32 dev_id; u8 clk_id; + u32 clk_id_32; } __packed; /** @@ -278,9 +288,13 @@ struct ti_sci_msg_resp_get_clock_state { * @dev_id: Device identifier this request is for * @clk_id: Clock identifier for the device for this request. * Each device has its own set of clock inputs. This indexes - * which clock input to modify. + * which clock input to modify. Set to 255 if clock ID is + * greater than or equal to 255. * @parent_id: The new clock parent is selectable by an index via this - * parameter. + * parameter. Set to 255 if parent ID is greater than or + * equal to 255. + * @clk_id_32: Clock identifier if @clk_id field is 255. + * @parent_id_32: Parent identifier if @parent_id is 255. * * Request type is TI_SCI_MSG_SET_CLOCK_PARENT, response is generic * ACK / NACK message. @@ -290,6 +304,8 @@ struct ti_sci_msg_req_set_clock_parent { u32 dev_id; u8 clk_id; u8 parent_id; + u32 clk_id_32; + u32 parent_id_32; } __packed; /** @@ -298,7 +314,10 @@ struct ti_sci_msg_req_set_clock_parent { * @dev_id: Device identifier this request is for * @clk_id: Clock identifier for the device for this request. * Each device has its own set of clock inputs. This indexes - * which clock input to get the parent for. + * which clock input to get the parent for. If this field + * contains 255, the actual clock identifier is stored in + * @clk_id_32. + * @clk_id_32: Clock identifier if the @clk_id field contains 255. * * Request type is TI_SCI_MSG_GET_CLOCK_PARENT, response is parent information */ @@ -306,25 +325,32 @@ struct ti_sci_msg_req_get_clock_parent { struct ti_sci_msg_hdr hdr; u32 dev_id; u8 clk_id; + u32 clk_id_32; } __packed; /** * struct ti_sci_msg_resp_get_clock_parent - Response with clock parent * @hdr: Generic Header - * @parent_id: The current clock parent. 
If set to 255, the current parent + * ID can be found in the @parent_id_32 field. + * @parent_id_32: Current clock parent if @parent_id field is set to + * 255. * * Response to TI_SCI_MSG_GET_CLOCK_PARENT. */ struct ti_sci_msg_resp_get_clock_parent { struct ti_sci_msg_hdr hdr; u8 parent_id; + u32 parent_id_32; } __packed; /** * struct ti_sci_msg_req_get_clock_num_parents - Request to get clock parents * @hdr: Generic header * @dev_id: Device identifier this request is for - * @clk_id: Clock identifier for the device for this request. + * @clk_id: Clock identifier for the device for this request. Set to + * 255 if clock ID is greater than or equal to 255. + * @clk_id_32: Clock identifier if the @clk_id field contains 255. * * This request provides information about how many clock parent options * are available for a given clock to a device. This is typically used @@ -337,18 +363,24 @@ struct ti_sci_msg_req_get_clock_num_parents { struct ti_sci_msg_hdr hdr; u32 dev_id; u8 clk_id; + u32 clk_id_32; } __packed; /** * struct ti_sci_msg_resp_get_clock_num_parents - Response for get clk parents * @hdr: Generic header - * @num_parents: Number of clock parents + * @num_parents: Number of clock parents. If set to 255, the actual + * number of parents is stored in the @num_parents_32 + * field instead. + * @num_parents_32: Number of clock parents if @num_parents field is + * set to 255. * * Response to TI_SCI_MSG_GET_NUM_CLOCK_PARENTS */ struct ti_sci_msg_resp_get_clock_num_parents { struct ti_sci_msg_hdr hdr; u8 num_parents; + u32 num_parents_32; } __packed; /** @@ -363,7 +395,9 @@ struct ti_sci_msg_resp_get_clock_num_parents { * @max_freq_hz: The maximum allowable frequency in Hz. This is the maximum * allowable programmed frequency and does not account for clock * tolerances and jitter. - * @clk_id: Clock identifier for the device for this request. + * @clk_id: Clock identifier for the device for this request. Set to + * 255 if clock identifier is greater than or equal to 255. + * @clk_id_32: Clock identifier if @clk_id is set to 255. * * NOTE: Normally clock frequency management is automatically done by TISCI * entity. In case of specific requests, TISCI evaluates capability to achieve @@ -380,6 +414,7 @@ struct ti_sci_msg_req_query_clock_freq { u64 target_freq_hz; u64 max_freq_hz; u8 clk_id; + u32 clk_id_32; } __packed; /** @@ -407,7 +442,9 @@ struct ti_sci_msg_resp_query_clock_freq { * @max_freq_hz: The maximum allowable frequency in Hz. This is the maximum * allowable programmed frequency and does not account for clock * tolerances and jitter. - * @clk_id: Clock identifier for the device for this request. + * @clk_id: Clock identifier for the device for this request. Set to + * 255 if clock ID is greater than or equal to 255. + * @clk_id_32: Clock identifier if @clk_id field is set to 255. * * NOTE: Normally clock frequency management is automatically done by TISCI * entity. In case of specific requests, TISCI evaluates capability to achieve @@ -436,13 +473,16 @@ struct ti_sci_msg_req_set_clock_freq { u64 target_freq_hz; u64 max_freq_hz; u8 clk_id; + u32 clk_id_32; } __packed; /** * struct ti_sci_msg_req_get_clock_freq - Request to get the clock frequency * @hdr: Generic Header * @dev_id: Device identifier this request is for - * @clk_id: Clock identifier for the device for this request. + * @clk_id: Clock identifier for the device for this request. Set to + * 255 if clock ID is greater than or equal to 255. + * @clk_id_32: Clock identifier if @clk_id field is set to 255. 
* * NOTE: Normally clock frequency management is automatically done by TISCI * entity. In some cases, clock frequencies are configured by host. @@ -454,6 +494,7 @@ struct ti_sci_msg_req_get_clock_freq { struct ti_sci_msg_hdr hdr; u32 dev_id; u8 clk_id; + u32 clk_id_32; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 568722a041bf..406e6717d252 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -166,29 +166,29 @@ struct ti_sci_dev_ops { * managed by driver for that purpose. */ struct ti_sci_clk_ops { - int (*get_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*get_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid, bool needs_ssc, bool can_change_freq, bool enable_input_term); - int (*idle_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid); - int (*put_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid); - int (*is_auto)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*idle_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid); + int (*put_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid); + int (*is_auto)(const struct ti_sci_handle *handle, u32 did, u32 cid, bool *req_state); - int (*is_on)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*is_on)(const struct ti_sci_handle *handle, u32 did, u32 cid, bool *req_state, bool *current_state); - int (*is_off)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*is_off)(const struct ti_sci_handle *handle, u32 did, u32 cid, bool *req_state, bool *current_state); - int (*set_parent)(const struct ti_sci_handle *handle, u32 did, u8 cid, - u8 parent_id); - int (*get_parent)(const struct ti_sci_handle *handle, u32 did, u8 cid, - u8 *parent_id); + int (*set_parent)(const struct ti_sci_handle *handle, u32 did, u32 cid, + u32 parent_id); + int (*get_parent)(const struct ti_sci_handle *handle, u32 did, u32 cid, + u32 *parent_id); int (*get_num_parents)(const struct ti_sci_handle *handle, u32 did, - u8 cid, u8 *num_parents); + u32 cid, u32 *num_parents); int (*get_best_match_freq)(const struct ti_sci_handle *handle, u32 did, - u8 cid, u64 min_freq, u64 target_freq, + u32 cid, u64 min_freq, u64 target_freq, u64 max_freq, u64 *match_freq); - int (*set_freq)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*set_freq)(const struct ti_sci_handle *handle, u32 did, u32 cid, u64 min_freq, u64 target_freq, u64 max_freq); - int (*get_freq)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*get_freq)(const struct ti_sci_handle *handle, u32 did, u32 cid, u64 *current_freq); }; -- cgit v1.2.3-59-g8ed1b From 5740671e596bdc3986a5391997de194300970201 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 12 Jun 2019 14:28:30 +0100 Subject: dma-fence/reservation: Markup rcu protected access for DEBUG_MUTEXES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark the access to reservation_object.fence as being protected to silence sparse. 
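rcu_access_pointer() fits here because the fence pointer is only tested for NULL, never dereferenced under an RCU read-side critical section: the actual dereference goes through reservation_object_get_list(), which is legitimate while the ww_mutex is still held. The shape of the pattern, sketched outside the kernel with stand-in types and a no-op macro (not the kernel's real definitions):

/* Stand-in for illustration only; the kernel macro also defeats sparse's
 * address-space check without asserting any lock, which is exactly what a
 * pure NULL test wants. */
#define rcu_access_pointer(p) (p)

struct fence_list { unsigned int shared_count, shared_max; };

static void unlock_debug_check(struct fence_list *fence_ptr)
{
	/* Test the RCU-managed pointer without a formal dereference... */
	if (rcu_access_pointer(fence_ptr)) {
		/* ...then dereference via the lock-protected accessor. */
		struct fence_list *fence = fence_ptr;

		fence->shared_max = fence->shared_count;
	}
}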
Signed-off-by: Chris Wilson Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20190612132830.31221-1-chris@chris-wilson.co.uk --- include/linux/reservation.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reservation.h b/include/linux/reservation.h index ee750765cc94..644a22dbe53b 100644 --- a/include/linux/reservation.h +++ b/include/linux/reservation.h @@ -216,8 +216,12 @@ reservation_object_unlock(struct reservation_object *obj) { #ifdef CONFIG_DEBUG_MUTEXES /* Test shared fence slot reservation */ - if (obj->fence) - obj->fence->shared_max = obj->fence->shared_count; + if (rcu_access_pointer(obj->fence)) { + struct reservation_object_list *fence = + reservation_object_get_list(obj); + + fence->shared_max = fence->shared_count; + } #endif ww_mutex_unlock(&obj->lock); } -- cgit v1.2.3-59-g8ed1b From d664c43958e0d9e0b34e23b6f8a8f4cf8ec61a2e Mon Sep 17 00:00:00 2001 From: Enrico Weigelt Date: Wed, 12 Jun 2019 23:59:36 +0200 Subject: gpio: Fix build warnings on undefined struct pinctrl_dev This fixes the warnings: * include/linux/gpio.h:254:11: warning: 'struct pinctrl_dev' declared inside parameter list will not be visible outside of this definition or declaration * include/linux/gpio/driver.h:602:11: warning: 'struct pinctrl_dev' declared inside parameter list will not be visible outside of this definition or declaration Fixes: 78b99577b393 ("pinctrl: remove unused pin_is_valid()") Reported-by: kbuild test robot Signed-off-by: Enrico Weigelt Signed-off-by: Linus Walleij --- include/linux/gpio.h | 1 + include/linux/gpio/driver.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 39745b8bdd65..40915b461f18 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -106,6 +106,7 @@ void devm_gpio_free(struct device *dev, unsigned int gpio); struct device; struct gpio_chip; +struct pinctrl_dev; static inline bool gpio_is_valid(int number) { diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index a1d273c96016..b58b27c11355 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -590,6 +590,8 @@ void gpiochip_remove_pin_ranges(struct gpio_chip *chip); #else +struct pinctrl_dev; + static inline int gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, -- cgit v1.2.3-59-g8ed1b From 68608b5e5063dd12942f1118286c6f595d0c4a05 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 10 Jun 2019 12:18:56 +0300 Subject: firmware: ti_sci: Add resource management APIs for ringacc, psi-l and udma Configuration of NAVSS resources, such as rings, UDMAP channels, flows and PSI-L thread management, needs to be done via TISCI. Add the needed structures and functions for NAVSS resource configuration of the following:
Rings from Ring Accelerator
PSI-L thread management
UDMAP tchan, rchan and rflow configuration
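Once ti_sci_setup_ops() below has wired up the three new ops tables, a NAVSS client reaches them through the TI SCI handle. A hypothetical caller pairing a PSI-L source thread to a destination thread could look like this (example_psil_pair is not part of the patch; handle acquisition and error handling are elided):

/* Consumer-side sketch using the new rm_psil_ops table. */
static int example_psil_pair(const struct ti_sci_handle *handle,
			     u32 nav_id, u32 src_thread, u32 dst_thread)
{
	const struct ti_sci_rm_psil_ops *psil = &handle->ops.rm_psil_ops;

	/* Destination thread IDs start at 0x8000 in the thread map. */
	return psil->pair(handle, nav_id, src_thread, dst_thread | 0x8000);
}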
Signed-off-by: Peter Ujfalusi Reviewed-by: Lokesh Vutla Signed-off-by: Tero Kristo Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 488 ++++++++++++++++++++++++ drivers/firmware/ti_sci.h | 675 +++++++++++++++++++++++++++++++++ include/linux/soc/ti/ti_sci_protocol.h | 215 +++++++++++ 3 files changed, 1378 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 36ce11a67235..02fa196428d8 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2004,6 +2004,481 @@ static int ti_sci_cmd_free_event_map(const struct ti_sci_handle *handle, ia_id, vint, global_event, vint_status_bit, 0); } +/** + * ti_sci_cmd_ring_config() - configure RA ring + * @handle: Pointer to TI SCI handle. + * @valid_params: Bitfield defining validity of ring configuration + * parameters + * @nav_id: Device ID of Navigator Subsystem from which the ring is + * allocated + * @index: Ring index + * @addr_lo: The ring base address lo 32 bits + * @addr_hi: The ring base address hi 32 bits + * @count: Number of ring elements + * @mode: The mode of the ring + * @size: The ring element size. + * @order_id: Specifies the ring's bus order ID + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_ring_cfg_req for more info. + */ +static int ti_sci_cmd_ring_config(const struct ti_sci_handle *handle, + u32 valid_params, u16 nav_id, u16 index, + u32 addr_lo, u32 addr_hi, u32 count, + u8 mode, u8 size, u8 order_id) +{ + struct ti_sci_msg_rm_ring_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(info->dev, "RM_RA:Message config failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_ring_cfg_req *)xfer->xfer_buf; + req->valid_params = valid_params; + req->nav_id = nav_id; + req->index = index; + req->addr_lo = addr_lo; + req->addr_hi = addr_hi; + req->count = count; + req->mode = mode; + req->size = size; + req->order_id = order_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(info->dev, "RM_RA:Mbox config send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "RM_RA:config ring %u ret:%d\n", index, ret); + return ret; +} + +/** + * ti_sci_cmd_ring_get_config() - get RA ring configuration + * @handle: Pointer to TI SCI handle. + * @nav_id: Device ID of Navigator Subsystem from which the ring is + * allocated + * @index: Ring index + * @addr_lo: Returns ring's base address lo 32 bits + * @addr_hi: Returns ring's base address hi 32 bits + * @count: Returns number of ring elements + * @mode: Returns mode of the ring + * @size: Returns ring element size + * @order_id: Returns ring's bus order ID + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_ring_get_cfg_req for more info. 
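A caller-side sketch for the ring configuration above, built on the TI_SCI_MSG_VALUE_RM_* valid bits this patch adds to ti_sci_protocol.h (example_ring_cfg is hypothetical):

static int example_ring_cfg(const struct ti_sci_handle *handle, u16 nav_id,
			    u16 index, dma_addr_t base, u32 count,
			    u8 mode, u8 size)
{
	/* Program address, count, mode and element size; skip order_id. */
	return handle->ops.rm_ring_ops.config(handle,
					      TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER,
					      nav_id, index,
					      lower_32_bits(base),
					      upper_32_bits(base),
					      count, mode, size, 0);
}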
+ */ +static int ti_sci_cmd_ring_get_config(const struct ti_sci_handle *handle, + u32 nav_id, u32 index, u8 *mode, + u32 *addr_lo, u32 *addr_hi, + u32 *count, u8 *size, u8 *order_id) +{ + struct ti_sci_msg_rm_ring_get_cfg_resp *resp; + struct ti_sci_msg_rm_ring_get_cfg_req *req; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_GET_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(info->dev, + "RM_RA:Message get config failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_ring_get_cfg_req *)xfer->xfer_buf; + req->nav_id = nav_id; + req->index = index; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(info->dev, "RM_RA:Mbox get config send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_rm_ring_get_cfg_resp *)xfer->xfer_buf; + + if (!ti_sci_is_response_ack(resp)) { + ret = -ENODEV; + } else { + if (mode) + *mode = resp->mode; + if (addr_lo) + *addr_lo = resp->addr_lo; + if (addr_hi) + *addr_hi = resp->addr_hi; + if (count) + *count = resp->count; + if (size) + *size = resp->size; + if (order_id) + *order_id = resp->order_id; + } + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "RM_RA:get config ring %u ret:%d\n", index, ret); + return ret; +} + +/** + * ti_sci_cmd_rm_psil_pair() - Pair PSI-L source to destination thread + * @handle: Pointer to TI SCI handle. + * @nav_id: Device ID of Navigator Subsystem which should be used for + * pairing + * @src_thread: Source PSI-L thread ID + * @dst_thread: Destination PSI-L thread ID + * + * Return: 0 if all went well, else returns appropriate error value. + */ +static int ti_sci_cmd_rm_psil_pair(const struct ti_sci_handle *handle, + u32 nav_id, u32 src_thread, u32 dst_thread) +{ + struct ti_sci_msg_psil_pair *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (!handle) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_PSIL_PAIR, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "RM_PSIL:Message reconfig failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_psil_pair *)xfer->xfer_buf; + req->nav_id = nav_id; + req->src_thread = src_thread; + req->dst_thread = dst_thread; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "RM_PSIL:Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_rm_psil_unpair() - Unpair PSI-L source from destination thread + * @handle: Pointer to TI SCI handle. + * @nav_id: Device ID of Navigator Subsystem which should be used for + * unpairing + * @src_thread: Source PSI-L thread ID + * @dst_thread: Destination PSI-L thread ID + * + * Return: 0 if all went well, else returns appropriate error value. 
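Every TISCI command in this patch repeats the same transfer skeleton around its request struct. Factored out purely for illustration (example_simple_cmd is a hypothetical helper; the driver deliberately open-codes the sequence per command so each request can be filled in place):

static int example_simple_cmd(struct ti_sci_info *info, u16 msg_id,
			      size_t req_len, size_t resp_len,
			      void (*fill_req)(void *req, void *arg), void *arg)
{
	struct ti_sci_msg_hdr *resp;
	struct ti_sci_xfer *xfer;
	int ret;

	/* Reserve a transfer slot; the core prepares the message header. */
	xfer = ti_sci_get_one_xfer(info, msg_id,
				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
				   req_len, resp_len);
	if (IS_ERR(xfer))
		return PTR_ERR(xfer);

	fill_req(xfer->xfer_buf, arg);	/* caller fills its request fields */

	ret = ti_sci_do_xfer(info, xfer);	/* blocking mailbox round trip */
	if (!ret) {
		/* The response reuses the same transfer buffer. */
		resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
		ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
	}

	ti_sci_put_one_xfer(&info->minfo, xfer);	/* always free the slot */
	return ret;
}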
+ */ +static int ti_sci_cmd_rm_psil_unpair(const struct ti_sci_handle *handle, + u32 nav_id, u32 src_thread, u32 dst_thread) +{ + struct ti_sci_msg_psil_unpair *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (!handle) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_PSIL_UNPAIR, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "RM_PSIL:Message reconfig failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_psil_unpair *)xfer->xfer_buf; + req->nav_id = nav_id; + req->src_thread = src_thread; + req->dst_thread = dst_thread; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "RM_PSIL:Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_rm_udmap_tx_ch_cfg() - Configure a UDMAP TX channel + * @handle: Pointer to TI SCI handle. + * @params: Pointer to ti_sci_msg_rm_udmap_tx_ch_cfg TX channel config + * structure + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_udmap_tx_ch_cfg and @ti_sci_msg_rm_udmap_tx_ch_cfg_req for + * more info. + */ +static int ti_sci_cmd_rm_udmap_tx_ch_cfg(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_tx_ch_cfg *params) +{ + struct ti_sci_msg_rm_udmap_tx_ch_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_TX_CH_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(info->dev, "Message TX_CH_CFG alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_udmap_tx_ch_cfg_req *)xfer->xfer_buf; + req->valid_params = params->valid_params; + req->nav_id = params->nav_id; + req->index = params->index; + req->tx_pause_on_err = params->tx_pause_on_err; + req->tx_filt_einfo = params->tx_filt_einfo; + req->tx_filt_pswords = params->tx_filt_pswords; + req->tx_atype = params->tx_atype; + req->tx_chan_type = params->tx_chan_type; + req->tx_supr_tdpkt = params->tx_supr_tdpkt; + req->tx_fetch_size = params->tx_fetch_size; + req->tx_credit_count = params->tx_credit_count; + req->txcq_qnum = params->txcq_qnum; + req->tx_priority = params->tx_priority; + req->tx_qos = params->tx_qos; + req->tx_orderid = params->tx_orderid; + req->fdepth = params->fdepth; + req->tx_sched_priority = params->tx_sched_priority; + req->tx_burst_size = params->tx_burst_size; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(info->dev, "Mbox send TX_CH_CFG fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "TX_CH_CFG: chn %u ret:%d\n", params->index, ret); + return ret; +} + +/** + * ti_sci_cmd_rm_udmap_rx_ch_cfg() - Configure a UDMAP RX channel + * @handle: Pointer to TI SCI handle. 
+ * @params: Pointer to ti_sci_msg_rm_udmap_rx_ch_cfg RX channel config + * structure + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_udmap_rx_ch_cfg and @ti_sci_msg_rm_udmap_rx_ch_cfg_req for + * more info. + */ +static int ti_sci_cmd_rm_udmap_rx_ch_cfg(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_rx_ch_cfg *params) +{ + struct ti_sci_msg_rm_udmap_rx_ch_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_RX_CH_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(info->dev, "Message RX_CH_CFG alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_udmap_rx_ch_cfg_req *)xfer->xfer_buf; + req->valid_params = params->valid_params; + req->nav_id = params->nav_id; + req->index = params->index; + req->rx_fetch_size = params->rx_fetch_size; + req->rxcq_qnum = params->rxcq_qnum; + req->rx_priority = params->rx_priority; + req->rx_qos = params->rx_qos; + req->rx_orderid = params->rx_orderid; + req->rx_sched_priority = params->rx_sched_priority; + req->flowid_start = params->flowid_start; + req->flowid_cnt = params->flowid_cnt; + req->rx_pause_on_err = params->rx_pause_on_err; + req->rx_atype = params->rx_atype; + req->rx_chan_type = params->rx_chan_type; + req->rx_ignore_short = params->rx_ignore_short; + req->rx_ignore_long = params->rx_ignore_long; + req->rx_burst_size = params->rx_burst_size; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(info->dev, "Mbox send RX_CH_CFG fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "RX_CH_CFG: chn %u ret:%d\n", params->index, ret); + return ret; +} + +/** + * ti_sci_cmd_rm_udmap_rx_flow_cfg() - Configure UDMAP RX FLOW + * @handle: Pointer to TI SCI handle. + * @params: Pointer to ti_sci_msg_rm_udmap_flow_cfg RX FLOW config + * structure + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_udmap_flow_cfg and @ti_sci_msg_rm_udmap_flow_cfg_req for + * more info. 
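On the caller side the parameter struct keeps this manageable: only fields whose valid bit is set have to be filled in. A hypothetical sketch steering a flow's packets to a single destination queue (bit positions follow the valid_params documentation added to ti_sci.h below; the patch defines no symbolic names for the individual flow bits in this excerpt, so raw BIT() values are used):

static int example_flow_cfg(const struct ti_sci_handle *handle, u16 nav_id,
			    u16 flow_index, u16 dest_qnum)
{
	struct ti_sci_msg_rm_udmap_flow_cfg cfg = {
		/* bit 5: rx_dest_qnum, bit 14: rx_fdq0_sz0_qnum */
		.valid_params = BIT(5) | BIT(14),
		.nav_id = nav_id,
		.flow_index = flow_index,
		.rx_dest_qnum = dest_qnum,
		.rx_fdq0_sz0_qnum = dest_qnum,	/* illustrative choice only */
	};

	return handle->ops.rm_udmap_ops.rx_flow_cfg(handle, &cfg);
}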
+ */ +static int ti_sci_cmd_rm_udmap_rx_flow_cfg(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_flow_cfg *params) +{ + struct ti_sci_msg_rm_udmap_flow_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_FLOW_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "RX_FL_CFG: Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_udmap_flow_cfg_req *)xfer->xfer_buf; + req->valid_params = params->valid_params; + req->nav_id = params->nav_id; + req->flow_index = params->flow_index; + req->rx_einfo_present = params->rx_einfo_present; + req->rx_psinfo_present = params->rx_psinfo_present; + req->rx_error_handling = params->rx_error_handling; + req->rx_desc_type = params->rx_desc_type; + req->rx_sop_offset = params->rx_sop_offset; + req->rx_dest_qnum = params->rx_dest_qnum; + req->rx_src_tag_hi = params->rx_src_tag_hi; + req->rx_src_tag_lo = params->rx_src_tag_lo; + req->rx_dest_tag_hi = params->rx_dest_tag_hi; + req->rx_dest_tag_lo = params->rx_dest_tag_lo; + req->rx_src_tag_hi_sel = params->rx_src_tag_hi_sel; + req->rx_src_tag_lo_sel = params->rx_src_tag_lo_sel; + req->rx_dest_tag_hi_sel = params->rx_dest_tag_hi_sel; + req->rx_dest_tag_lo_sel = params->rx_dest_tag_lo_sel; + req->rx_fdq0_sz0_qnum = params->rx_fdq0_sz0_qnum; + req->rx_fdq1_qnum = params->rx_fdq1_qnum; + req->rx_fdq2_qnum = params->rx_fdq2_qnum; + req->rx_fdq3_qnum = params->rx_fdq3_qnum; + req->rx_ps_location = params->rx_ps_location; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "RX_FL_CFG: Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 
0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "RX_FL_CFG: %u ret:%d\n", params->flow_index, ret); + return ret; +} + /* * ti_sci_setup_ops() - Setup the operations structures * @info: pointer to TISCI pointer @@ -2016,6 +2491,9 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) struct ti_sci_clk_ops *cops = &ops->clk_ops; struct ti_sci_rm_core_ops *rm_core_ops = &ops->rm_core_ops; struct ti_sci_rm_irq_ops *iops = &ops->rm_irq_ops; + struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops; + struct ti_sci_rm_psil_ops *psilops = &ops->rm_psil_ops; + struct ti_sci_rm_udmap_ops *udmap_ops = &ops->rm_udmap_ops; core_ops->reboot_device = ti_sci_cmd_core_reboot; @@ -2055,6 +2533,16 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) iops->set_event_map = ti_sci_cmd_set_event_map; iops->free_irq = ti_sci_cmd_free_irq; iops->free_event_map = ti_sci_cmd_free_event_map; + + rops->config = ti_sci_cmd_ring_config; + rops->get_config = ti_sci_cmd_ring_get_config; + + psilops->pair = ti_sci_cmd_rm_psil_pair; + psilops->unpair = ti_sci_cmd_rm_psil_unpair; + + udmap_ops->tx_ch_cfg = ti_sci_cmd_rm_udmap_tx_ch_cfg; + udmap_ops->rx_ch_cfg = ti_sci_cmd_rm_udmap_rx_ch_cfg; + udmap_ops->rx_flow_cfg = ti_sci_cmd_rm_udmap_rx_flow_cfg; } /** diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 4983827151bf..2bb81ec7793c 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -42,6 +42,35 @@ #define TI_SCI_MSG_SET_IRQ 0x1000 #define TI_SCI_MSG_FREE_IRQ 0x1001 +/* NAVSS resource management */ +/* Ringacc requests */ +#define TI_SCI_MSG_RM_RING_ALLOCATE 0x1100 +#define TI_SCI_MSG_RM_RING_FREE 0x1101 +#define TI_SCI_MSG_RM_RING_RECONFIG 0x1102 +#define TI_SCI_MSG_RM_RING_RESET 0x1103 +#define TI_SCI_MSG_RM_RING_CFG 0x1110 +#define TI_SCI_MSG_RM_RING_GET_CFG 0x1111 + +/* PSI-L requests */ +#define TI_SCI_MSG_RM_PSIL_PAIR 0x1280 +#define TI_SCI_MSG_RM_PSIL_UNPAIR 0x1281 + +#define TI_SCI_MSG_RM_UDMAP_TX_ALLOC 0x1200 +#define TI_SCI_MSG_RM_UDMAP_TX_FREE 0x1201 +#define TI_SCI_MSG_RM_UDMAP_RX_ALLOC 0x1210 +#define TI_SCI_MSG_RM_UDMAP_RX_FREE 0x1211 +#define TI_SCI_MSG_RM_UDMAP_FLOW_CFG 0x1220 +#define TI_SCI_MSG_RM_UDMAP_OPT_FLOW_CFG 0x1221 + +#define TISCI_MSG_RM_UDMAP_TX_CH_CFG 0x1205 +#define TISCI_MSG_RM_UDMAP_TX_CH_GET_CFG 0x1206 +#define TISCI_MSG_RM_UDMAP_RX_CH_CFG 0x1215 +#define TISCI_MSG_RM_UDMAP_RX_CH_GET_CFG 0x1216 +#define TISCI_MSG_RM_UDMAP_FLOW_CFG 0x1230 +#define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_CFG 0x1231 +#define TISCI_MSG_RM_UDMAP_FLOW_GET_CFG 0x1232 +#define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_GET_CFG 0x1233 + /** * struct ti_sci_msg_hdr - Generic Message Header for All messages and responses * @type: Type of messages: One of TI_SCI_MSG* values @@ -563,4 +592,650 @@ struct ti_sci_msg_req_manage_irq { u8 secondary_host; } __packed; +/** + * struct ti_sci_msg_rm_ring_cfg_req - Configure a Navigator Subsystem ring + * + * Configures the non-real-time registers of a Navigator Subsystem ring. + * @hdr: Generic Header + * @valid_params: Bitfield defining validity of ring configuration parameters. + * The ring configuration fields are not valid, and will not be used for + * ring configuration, if their corresponding valid bit is zero. 
+ * Valid bit usage: + * 0 - Valid bit for @tisci_msg_rm_ring_cfg_req addr_lo + * 1 - Valid bit for @tisci_msg_rm_ring_cfg_req addr_hi + * 2 - Valid bit for @tisci_msg_rm_ring_cfg_req count + * 3 - Valid bit for @tisci_msg_rm_ring_cfg_req mode + * 4 - Valid bit for @tisci_msg_rm_ring_cfg_req size + * 5 - Valid bit for @tisci_msg_rm_ring_cfg_req order_id + * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated + * @index: ring index to be configured. + * @addr_lo: 32 LSBs of ring base address to be programmed into the ring's + * RING_BA_LO register + * @addr_hi: 16 MSBs of ring base address to be programmed into the ring's + * RING_BA_HI register. + * @count: Number of ring elements. Must be even in CREDENTIALS or QM + * modes. + * @mode: Specifies the mode in which the ring is to be configured. + * @size: Specifies encoded ring element size. To calculate the encoded size use + * the formula (log2(size_bytes) - 2), where size_bytes cannot be + * greater than 256. + * @order_id: Specifies the ring's bus order ID. + */ +struct ti_sci_msg_rm_ring_cfg_req { + struct ti_sci_msg_hdr hdr; + u32 valid_params; + u16 nav_id; + u16 index; + u32 addr_lo; + u32 addr_hi; + u32 count; + u8 mode; + u8 size; + u8 order_id; +} __packed; + +/** + * struct ti_sci_msg_rm_ring_get_cfg_req - Get RA ring's configuration + * + * Gets the configuration of the non-real-time register fields of a ring. The + * host, or a supervisor of the host, who owns the ring must be the requesting + * host. The values of the non-real-time registers are returned in + * @ti_sci_msg_rm_ring_get_cfg_resp. + * + * @hdr: Generic Header + * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated + * @index: ring index. + */ +struct ti_sci_msg_rm_ring_get_cfg_req { + struct ti_sci_msg_hdr hdr; + u16 nav_id; + u16 index; +} __packed; + +/** + * struct ti_sci_msg_rm_ring_get_cfg_resp - Ring get configuration response + * + * Response received by host processor after RM has handled + * @ti_sci_msg_rm_ring_get_cfg_req. The response contains the ring's + * non-real-time register values. + * + * @hdr: Generic Header + * @addr_lo: Ring 32 LSBs of base address + * @addr_hi: Ring 16 MSBs of base address. + * @count: Ring number of elements. + * @mode: Ring mode. + * @size: Encoded ring element size + * @order_id: Ring order ID. + */ +struct ti_sci_msg_rm_ring_get_cfg_resp { + struct ti_sci_msg_hdr hdr; + u32 addr_lo; + u32 addr_hi; + u32 count; + u8 mode; + u8 size; + u8 order_id; +} __packed; + +/** + * struct ti_sci_msg_psil_pair - Pairs a PSI-L source thread to a destination + * thread + * @hdr: Generic Header + * @nav_id: SoC Navigator Subsystem device ID whose PSI-L config proxy is + * used to pair the source and destination threads. + * @src_thread: PSI-L source thread ID within the PSI-L System thread map. + * + * UDMAP transmit channels mapped to source threads will have their + * TCHAN_THRD_ID register programmed with the destination thread if the pairing + * is successful. + * + * @dst_thread: PSI-L destination thread ID within the PSI-L System thread map. + * PSI-L destination threads start at index 0x8000. The request is NACK'd if + * the destination thread is not greater than or equal to 0x8000. + * + * UDMAP receive channels mapped to destination threads will have their + * RCHAN_THRD_ID register programmed with the source thread if the pairing + * is successful. + * + * Request type is TI_SCI_MSG_RM_PSIL_PAIR, response is a generic ACK or NACK + * message. 
+ */ +struct ti_sci_msg_psil_pair { + struct ti_sci_msg_hdr hdr; + u32 nav_id; + u32 src_thread; + u32 dst_thread; +} __packed; + +/** + * struct ti_sci_msg_psil_unpair - Unpairs a PSI-L source thread from a + * destination thread + * @hdr: Generic Header + * @nav_id: SoC Navigator Subsystem device ID whose PSI-L config proxy is + * used to unpair the source and destination threads. + * @src_thread: PSI-L source thread ID within the PSI-L System thread map. + * + * UDMAP transmit channels mapped to source threads will have their + * TCHAN_THRD_ID register cleared if the unpairing is successful. + * + * @dst_thread: PSI-L destination thread ID within the PSI-L System thread map. + * PSI-L destination threads start at index 0x8000. The request is NACK'd if + * the destination thread is not greater than or equal to 0x8000. + * + * UDMAP receive channels mapped to destination threads will have their + * RCHAN_THRD_ID register cleared if the unpairing is successful. + * + * Request type is TI_SCI_MSG_RM_PSIL_UNPAIR, response is a generic ACK or NACK + * message. + */ +struct ti_sci_msg_psil_unpair { + struct ti_sci_msg_hdr hdr; + u32 nav_id; + u32 src_thread; + u32 dst_thread; +} __packed; + +/** + * struct ti_sci_msg_udmap_rx_flow_cfg - UDMAP receive flow configuration + * message + * @hdr: Generic Header + * @nav_id: SoC Navigator Subsystem device ID from which the receive flow is + * allocated + * @flow_index: UDMAP receive flow index for non-optional configuration. + * @rx_ch_index: Specifies the index of the receive channel using the flow_index + * @rx_einfo_present: UDMAP receive flow extended packet info present. + * @rx_psinfo_present: UDMAP receive flow PS words present. + * @rx_error_handling: UDMAP receive flow error handling configuration. Valid + * values are TI_SCI_RM_UDMAP_RX_FLOW_ERR_DROP/RETRY. + * @rx_desc_type: UDMAP receive flow descriptor type. It can be one of + * TI_SCI_RM_UDMAP_RX_FLOW_DESC_HOST/MONO. + * @rx_sop_offset: UDMAP receive flow start of packet offset. + * @rx_dest_qnum: UDMAP receive flow destination queue number. + * @rx_ps_location: UDMAP receive flow PS words location. + * 0 - end of packet descriptor + * 1 - Beginning of the data buffer + * @rx_src_tag_hi: UDMAP receive flow source tag high byte constant + * @rx_src_tag_lo: UDMAP receive flow source tag low byte constant + * @rx_dest_tag_hi: UDMAP receive flow destination tag high byte constant + * @rx_dest_tag_lo: UDMAP receive flow destination tag low byte constant + * @rx_src_tag_hi_sel: UDMAP receive flow source tag high byte selector + * @rx_src_tag_lo_sel: UDMAP receive flow source tag low byte selector + * @rx_dest_tag_hi_sel: UDMAP receive flow destination tag high byte selector + * @rx_dest_tag_lo_sel: UDMAP receive flow destination tag low byte selector + * @rx_size_thresh_en: UDMAP receive flow packet size based free buffer queue + * enable. If enabled, the ti_sci_rm_udmap_rx_flow_opt_cfg also needs to be + * configured and sent. + * @rx_fdq0_sz0_qnum: UDMAP receive flow free descriptor queue 0. + * @rx_fdq1_qnum: UDMAP receive flow free descriptor queue 1. + * @rx_fdq2_qnum: UDMAP receive flow free descriptor queue 2. + * @rx_fdq3_qnum: UDMAP receive flow free descriptor queue 3. + * + * For detailed information on the settings, see the UDMAP section of the TRM. 
+ */ +struct ti_sci_msg_udmap_rx_flow_cfg { + struct ti_sci_msg_hdr hdr; + u32 nav_id; + u32 flow_index; + u32 rx_ch_index; + u8 rx_einfo_present; + u8 rx_psinfo_present; + u8 rx_error_handling; + u8 rx_desc_type; + u16 rx_sop_offset; + u16 rx_dest_qnum; + u8 rx_ps_location; + u8 rx_src_tag_hi; + u8 rx_src_tag_lo; + u8 rx_dest_tag_hi; + u8 rx_dest_tag_lo; + u8 rx_src_tag_hi_sel; + u8 rx_src_tag_lo_sel; + u8 rx_dest_tag_hi_sel; + u8 rx_dest_tag_lo_sel; + u8 rx_size_thresh_en; + u16 rx_fdq0_sz0_qnum; + u16 rx_fdq1_qnum; + u16 rx_fdq2_qnum; + u16 rx_fdq3_qnum; +} __packed; + +/** + * struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg - parameters for UDMAP receive + * flow optional configuration + * @hdr: Generic Header + * @nav_id: SoC Navigator Subsystem device ID from which the receive flow is + * allocated + * @flow_index: UDMAP receive flow index for optional configuration. + * @rx_ch_index: Specifies the index of the receive channel using the flow_index + * @rx_size_thresh0: UDMAP receive flow packet size threshold 0. + * @rx_size_thresh1: UDMAP receive flow packet size threshold 1. + * @rx_size_thresh2: UDMAP receive flow packet size threshold 2. + * @rx_fdq0_sz1_qnum: UDMAP receive flow free descriptor queue for size + * threshold 1. + * @rx_fdq0_sz2_qnum: UDMAP receive flow free descriptor queue for size + * threshold 2. + * @rx_fdq0_sz3_qnum: UDMAP receive flow free descriptor queue for size + * threshold 3. + * + * For detailed information on the settings, see the UDMAP section of the TRM. + */ +struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { + struct ti_sci_msg_hdr hdr; + u32 nav_id; + u32 flow_index; + u32 rx_ch_index; + u16 rx_size_thresh0; + u16 rx_size_thresh1; + u16 rx_size_thresh2; + u16 rx_fdq0_sz1_qnum; + u16 rx_fdq0_sz2_qnum; + u16 rx_fdq0_sz3_qnum; +} __packed; + +/** + * struct ti_sci_msg_rm_udmap_tx_ch_cfg_req - Configure a Navigator Subsystem + * UDMAP transmit channel + * + * Configures the non-real-time registers of a Navigator Subsystem UDMAP + * transmit channel. The channel index must be assigned to the host defined + * in the TISCI header via the RM board configuration resource assignment + * range list. + * + * @hdr: Generic Header + * + * @valid_params: Bitfield defining validity of tx channel configuration + * parameters. The tx channel configuration fields are not valid, and will not + * be used for ch configuration, if their corresponding valid bit is zero. 
+ * Valid bit usage: + * 0 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_pause_on_err + * 1 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_atype + * 2 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_chan_type + * 3 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_fetch_size + * 4 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::txcq_qnum + * 5 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_priority + * 6 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_qos + * 7 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_orderid + * 8 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_sched_priority + * 9 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_filt_einfo + * 10 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_filt_pswords + * 11 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_supr_tdpkt + * 12 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_credit_count + * 13 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::fdepth + * 14 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_burst_size + * + * @nav_id: SoC device ID of Navigator Subsystem where tx channel is located + * + * @index: UDMAP transmit channel index. + * + * @tx_pause_on_err: UDMAP transmit channel pause on error configuration to + * be programmed into the tx_pause_on_err field of the channel's TCHAN_TCFG + * register. + * + * @tx_filt_einfo: UDMAP transmit channel extended packet information passing + * configuration to be programmed into the tx_filt_einfo field of the + * channel's TCHAN_TCFG register. + * + * @tx_filt_pswords: UDMAP transmit channel protocol specific word passing + * configuration to be programmed into the tx_filt_pswords field of the + * channel's TCHAN_TCFG register. + * + * @tx_atype: UDMAP transmit channel non Ring Accelerator access pointer + * interpretation configuration to be programmed into the tx_atype field of + * the channel's TCHAN_TCFG register. + * + * @tx_chan_type: UDMAP transmit channel functional channel type and work + * passing mechanism configuration to be programmed into the tx_chan_type + * field of the channel's TCHAN_TCFG register. + * + * @tx_supr_tdpkt: UDMAP transmit channel teardown packet generation suppression + * configuration to be programmed into the tx_supr_tdpkt field of the channel's + * TCHAN_TCFG register. + * + * @tx_fetch_size: UDMAP transmit channel number of 32-bit descriptor words to + * fetch configuration to be programmed into the tx_fetch_size field of the + * channel's TCHAN_TCFG register. The user must make sure to set the maximum + * word count that can pass through the channel for any allowed descriptor type. + * + * @tx_credit_count: UDMAP transmit channel transfer request credit count + * configuration to be programmed into the count field of the TCHAN_TCREDIT + * register. Specifies how many credits for complete TRs are available. + * + * @txcq_qnum: UDMAP transmit channel completion queue configuration to be + * programmed into the txcq_qnum field of the TCHAN_TCQ register. The specified + * completion queue must be assigned to the host, or a subordinate of the host, + * requesting configuration of the transmit channel. + * + * @tx_priority: UDMAP transmit channel transmit priority value to be programmed + * into the priority field of the channel's TCHAN_TPRI_CTRL register. + * + * @tx_qos: UDMAP transmit channel transmit qos value to be programmed into the + * qos field of the channel's TCHAN_TPRI_CTRL register. 
+ * + * @tx_orderid: UDMAP transmit channel bus order id value to be programmed into + * the orderid field of the channel's TCHAN_TPRI_CTRL register. + * + * @fdepth: UDMAP transmit channel FIFO depth configuration to be programmed + * into the fdepth field of the TCHAN_TFIFO_DEPTH register. Sets the number of + * Tx FIFO bytes which are allowed to be stored for the channel. Check the UDMAP + * section of the TRM for restrictions regarding this parameter. + * + * @tx_sched_priority: UDMAP transmit channel tx scheduling priority + * configuration to be programmed into the priority field of the channel's + * TCHAN_TST_SCHED register. + * + * @tx_burst_size: UDMAP transmit channel burst size configuration to be + * programmed into the tx_burst_size field of the TCHAN_TCFG register. + */ +struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { + struct ti_sci_msg_hdr hdr; + u32 valid_params; + u16 nav_id; + u16 index; + u8 tx_pause_on_err; + u8 tx_filt_einfo; + u8 tx_filt_pswords; + u8 tx_atype; + u8 tx_chan_type; + u8 tx_supr_tdpkt; + u16 tx_fetch_size; + u8 tx_credit_count; + u16 txcq_qnum; + u8 tx_priority; + u8 tx_qos; + u8 tx_orderid; + u16 fdepth; + u8 tx_sched_priority; + u8 tx_burst_size; +} __packed; + +/** + * struct ti_sci_msg_rm_udmap_rx_ch_cfg_req - Configure a Navigator Subsystem + * UDMAP receive channel + * + * Configures the non-real-time registers of a Navigator Subsystem UDMAP + * receive channel. The channel index must be assigned to the host defined + * in the TISCI header via the RM board configuration resource assignment + * range list. + * + * @hdr: Generic Header + * + * @valid_params: Bitfield defining validity of rx channel configuration + * parameters. + * The rx channel configuration fields are not valid, and will not be used for + * ch configuration, if their corresponding valid bit is zero. + * Valid bit usage: + * 0 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_pause_on_err + * 1 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_atype + * 2 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_chan_type + * 3 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_fetch_size + * 4 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rxcq_qnum + * 5 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_priority + * 6 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_qos + * 7 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_orderid + * 8 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_sched_priority + * 9 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::flowid_start + * 10 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::flowid_cnt + * 11 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_ignore_short + * 12 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_ignore_long + * 14 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_burst_size + * + * @nav_id: SoC device ID of Navigator Subsystem where rx channel is located + * + * @index: UDMAP receive channel index. + * + * @rx_fetch_size: UDMAP receive channel number of 32-bit descriptor words to + * fetch configuration to be programmed into the rx_fetch_size field of the + * channel's RCHAN_RCFG register. + * + * @rxcq_qnum: UDMAP receive channel completion queue configuration to be + * programmed into the rxcq_qnum field of the RCHAN_RCQ register. + * The specified completion queue must be assigned to the host, or a subordinate + * of the host, requesting configuration of the receive channel. 
+ * + * @rx_priority: UDMAP receive channel receive priority value to be programmed + * into the priority field of the channel's RCHAN_RPRI_CTRL register. + * + * @rx_qos: UDMAP receive channel receive qos value to be programmed into the + * qos field of the channel's RCHAN_RPRI_CTRL register. + * + * @rx_orderid: UDMAP receive channel bus order id value to be programmed into + * the orderid field of the channel's RCHAN_RPRI_CTRL register. + * + * @rx_sched_priority: UDMAP receive channel rx scheduling priority + * configuration to be programmed into the priority field of the channel's + * RCHAN_RST_SCHED register. + * + * @flowid_start: UDMAP receive channel additional flows starting index + * configuration to program into the flow_start field of the RCHAN_RFLOW_RNG + * register. Specifies the starting index for flow IDs the receive channel is to + * make use of beyond the default flow. flowid_start and @ref flowid_cnt must be + * set as valid and configured together. The starting flow ID set by + * @ref flowid_start must be a flow index within the Navigator Subsystem's subset + * of flows beyond the default flows statically mapped to receive channels. + * The additional flows must be assigned to the host, or a subordinate of the + * host, requesting configuration of the receive channel. + * + * @flowid_cnt: UDMAP receive channel additional flows count configuration to + * program into the flowid_cnt field of the RCHAN_RFLOW_RNG register. + * This field specifies how many flow IDs are in the additional contiguous range + * of legal flow IDs for the channel. @ref flowid_start and flowid_cnt must be + * set as valid and configured together. Disabling the valid_params field bit + * for flowid_cnt indicates no flow IDs other than the default are to be + * allocated and used by the receive channel. @ref flowid_start plus flowid_cnt + * cannot be greater than the number of receive flows in the receive channel's + * Navigator Subsystem. The additional flows must be assigned to the host, or a + * subordinate of the host, requesting configuration of the receive channel. + * + * @rx_pause_on_err: UDMAP receive channel pause on error configuration to be + * programmed into the rx_pause_on_err field of the channel's RCHAN_RCFG + * register. + * + * @rx_atype: UDMAP receive channel non Ring Accelerator access pointer + * interpretation configuration to be programmed into the rx_atype field of the + * channel's RCHAN_RCFG register. + * + * @rx_chan_type: UDMAP receive channel functional channel type and work passing + * mechanism configuration to be programmed into the rx_chan_type field of the + * channel's RCHAN_RCFG register. + * + * @rx_ignore_short: UDMAP receive channel short packet treatment configuration + * to be programmed into the rx_ignore_short field of the RCHAN_RCFG register. + * + * @rx_ignore_long: UDMAP receive channel long packet treatment configuration to + * be programmed into the rx_ignore_long field of the RCHAN_RCFG register. + * + * @rx_burst_size: UDMAP receive channel burst size configuration to be + * programmed into the rx_burst_size field of the RCHAN_RCFG register. 
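As a concrete illustration of the flowid_start/flowid_cnt pairing described above, a hypothetical caller attaching four extra flows to a receive channel (bits 9 and 10 per the valid_params list; raw BIT() values again, since the excerpt defines no symbolic names for them):

static int example_rx_extra_flows(const struct ti_sci_handle *handle,
				  u16 nav_id, u16 index, u16 first_flow)
{
	struct ti_sci_msg_rm_udmap_rx_ch_cfg cfg = {
		/* bit 9: flowid_start, bit 10: flowid_cnt */
		.valid_params = BIT(9) | BIT(10),
		.nav_id = nav_id,
		.index = index,
		.flowid_start = first_flow,
		.flowid_cnt = 4,
	};

	return handle->ops.rm_udmap_ops.rx_ch_cfg(handle, &cfg);
}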
+ */ +struct ti_sci_msg_rm_udmap_rx_ch_cfg_req { + struct ti_sci_msg_hdr hdr; + u32 valid_params; + u16 nav_id; + u16 index; + u16 rx_fetch_size; + u16 rxcq_qnum; + u8 rx_priority; + u8 rx_qos; + u8 rx_orderid; + u8 rx_sched_priority; + u16 flowid_start; + u16 flowid_cnt; + u8 rx_pause_on_err; + u8 rx_atype; + u8 rx_chan_type; + u8 rx_ignore_short; + u8 rx_ignore_long; + u8 rx_burst_size; +} __packed; + +/** + * struct ti_sci_msg_rm_udmap_flow_cfg_req - Configure a Navigator Subsystem + * UDMAP receive flow + * + * Configures a Navigator Subsystem UDMAP receive flow's registers. + * Configuration does not include the flow registers which handle size-based + * free descriptor queue routing. + * + * The flow index must be assigned to the host defined in the TISCI header via + * the RM board configuration resource assignment range list. + * + * @hdr: Standard TISCI header + * + * @valid_params: + * Bitfield defining validity of rx flow configuration parameters. The + * rx flow configuration fields are not valid, and will not be used for flow + * configuration, if their corresponding valid bit is zero. Valid bit usage: + * 0 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_einfo_present + * 1 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_psinfo_present + * 2 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_error_handling + * 3 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_desc_type + * 4 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_sop_offset + * 5 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_qnum + * 6 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_hi + * 7 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_lo + * 8 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_hi + * 9 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_lo + * 10 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_hi_sel + * 11 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_lo_sel + * 12 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_hi_sel + * 13 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_lo_sel + * 14 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq0_sz0_qnum + * 15 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq1_qnum + * 16 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq2_qnum + * 17 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq3_qnum + * 18 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_ps_location + * + * @nav_id: SoC device ID of Navigator Subsystem from which the receive flow is + * allocated + * + * @flow_index: UDMAP receive flow index for non-optional configuration. + * + * @rx_einfo_present: + * UDMAP receive flow extended packet info present configuration to be + * programmed into the rx_einfo_present field of the flow's RFLOW_RFA register. + * + * @rx_psinfo_present: + * UDMAP receive flow PS words present configuration to be programmed into the + * rx_psinfo_present field of the flow's RFLOW_RFA register. + * + * @rx_error_handling: + * UDMAP receive flow error handling configuration to be programmed into the + * rx_error_handling field of the flow's RFLOW_RFA register. + * + * @rx_desc_type: + * UDMAP receive flow descriptor type configuration to be programmed into the + * rx_desc_type field of the flow's RFLOW_RFA register. + * + * @rx_sop_offset: + * UDMAP receive flow start of packet offset configuration to be programmed + * into the rx_sop_offset field of the RFLOW_RFA register. 
See the UDMAP + * section of the TRM for more information on this setting. Valid values for + * this field are 0-255 bytes. + * + * @rx_dest_qnum: + * UDMAP receive flow destination queue configuration to be programmed into the + * rx_dest_qnum field of the flow's RFLOW_RFA register. The specified + * destination queue must be valid within the Navigator Subsystem and must be + * owned by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. + * + * @rx_src_tag_hi: + * UDMAP receive flow source tag high byte constant configuration to be + * programmed into the rx_src_tag_hi field of the flow's RFLOW_RFB register. + * See the UDMAP section of the TRM for more information on this setting. + * + * @rx_src_tag_lo: + * UDMAP receive flow source tag low byte constant configuration to be + * programmed into the rx_src_tag_lo field of the flow's RFLOW_RFB register. + * See the UDMAP section of the TRM for more information on this setting. + * + * @rx_dest_tag_hi: + * UDMAP receive flow destination tag high byte constant configuration to be + * programmed into the rx_dest_tag_hi field of the flow's RFLOW_RFB register. + * See the UDMAP section of the TRM for more information on this setting. + * + * @rx_dest_tag_lo: + * UDMAP receive flow destination tag low byte constant configuration to be + * programmed into the rx_dest_tag_lo field of the flow's RFLOW_RFB register. + * See the UDMAP section of the TRM for more information on this setting. + * + * @rx_src_tag_hi_sel: + * UDMAP receive flow source tag high byte selector configuration to be + * programmed into the rx_src_tag_hi_sel field of the RFLOW_RFC register. See + * the UDMAP section of the TRM for more information on this setting. + * + * @rx_src_tag_lo_sel: + * UDMAP receive flow source tag low byte selector configuration to be + * programmed into the rx_src_tag_lo_sel field of the RFLOW_RFC register. See + * the UDMAP section of the TRM for more information on this setting. + * + * @rx_dest_tag_hi_sel: + * UDMAP receive flow destination tag high byte selector configuration to be + * programmed into the rx_dest_tag_hi_sel field of the RFLOW_RFC register. See + * the UDMAP section of the TRM for more information on this setting. + * + * @rx_dest_tag_lo_sel: + * UDMAP receive flow destination tag low byte selector configuration to be + * programmed into the rx_dest_tag_lo_sel field of the RFLOW_RFC register. See + * the UDMAP section of the TRM for more information on this setting. + * + * @rx_fdq0_sz0_qnum: + * UDMAP receive flow free descriptor queue 0 configuration to be programmed + * into the rx_fdq0_sz0_qnum field of the flow's RFLOW_RFD register. See the + * UDMAP section of the TRM for more information on this setting. The specified + * free queue must be valid within the Navigator Subsystem and must be owned + * by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. + * + * @rx_fdq1_qnum: + * UDMAP receive flow free descriptor queue 1 configuration to be programmed + * into the rx_fdq1_qnum field of the flow's RFLOW_RFD register. See the + * UDMAP section of the TRM for more information on this setting. The specified + * free queue must be valid within the Navigator Subsystem and must be owned + * by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. 
+ * + * @rx_fdq2_qnum: + * UDMAP receive flow free descriptor queue 2 configuration to be programmed + * into the rx_fdq2_qnum field of the flow's RFLOW_RFE register. See the + * UDMAP section of the TRM for more information on this setting. The specified + * free queue must be valid within the Navigator Subsystem and must be owned + * by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. + * + * @rx_fdq3_qnum: + * UDMAP receive flow free descriptor queue 3 configuration to be programmed + * into the rx_fdq3_qnum field of the flow's RFLOW_RFE register. See the + * UDMAP section of the TRM for more information on this setting. The specified + * free queue must be valid within the Navigator Subsystem and must be owned + * by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. + * + * @rx_ps_location: + * UDMAP receive flow PS words location configuration to be programmed into the + * rx_ps_location field of the flow's RFLOW_RFA register. + */ +struct ti_sci_msg_rm_udmap_flow_cfg_req { + struct ti_sci_msg_hdr hdr; + u32 valid_params; + u16 nav_id; + u16 flow_index; + u8 rx_einfo_present; + u8 rx_psinfo_present; + u8 rx_error_handling; + u8 rx_desc_type; + u16 rx_sop_offset; + u16 rx_dest_qnum; + u8 rx_src_tag_hi; + u8 rx_src_tag_lo; + u8 rx_dest_tag_hi; + u8 rx_dest_tag_lo; + u8 rx_src_tag_hi_sel; + u8 rx_src_tag_lo_sel; + u8 rx_dest_tag_hi_sel; + u8 rx_dest_tag_lo_sel; + u16 rx_fdq0_sz0_qnum; + u16 rx_fdq1_qnum; + u16 rx_fdq2_qnum; + u16 rx_fdq3_qnum; + u8 rx_ps_location; +} __packed; + #endif /* __TI_SCI_H */ diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 568722a041bf..4fd9bff5806b 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -241,6 +241,218 @@ struct ti_sci_rm_irq_ops { u16 global_event, u8 vint_status_bit); }; +/* RA config.addr_lo parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID BIT(0) +/* RA config.addr_hi parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID BIT(1) + /* RA config.count parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID BIT(2) +/* RA config.mode parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_MODE_VALID BIT(3) +/* RA config.size parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID BIT(4) +/* RA config.order_id parameter is valid for RM ring configure TISCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_ORDER_ID_VALID BIT(5) + +#define TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER \ + (TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_MODE_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID) + +/** + * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations + * @config: configure the SoC Navigator Subsystem Ring Accelerator ring + * @get_config: get the SoC Navigator Subsystem Ring Accelerator ring + * configuration + */ +struct ti_sci_rm_ringacc_ops { + int (*config)(const struct ti_sci_handle *handle, + u32 valid_params, u16 nav_id, u16 index, + u32 addr_lo, u32 addr_hi, u32 count, u8 mode, + u8 size, u8 order_id + ); + int (*get_config)(const struct ti_sci_handle *handle, + u32 nav_id, u32 index, u8 
*mode, + u32 *addr_lo, u32 *addr_hi, u32 *count, + u8 *size, u8 *order_id); +}; + +/** + * struct ti_sci_rm_psil_ops - PSI-L thread operations + * @pair: pair PSI-L source thread to a destination thread. + * If the src_thread is mapped to UDMA tchan, the corresponding channel's + * TCHAN_THRD_ID register is updated. + * If the dst_thread is mapped to UDMA rchan, the corresponding channel's + * RCHAN_THRD_ID register is updated. + * @unpair: unpair PSI-L source thread from a destination thread. + * If the src_thread is mapped to UDMA tchan, the corresponding channel's + * TCHAN_THRD_ID register is cleared. + * If the dst_thread is mapped to UDMA rchan, the corresponding channel's + * RCHAN_THRD_ID register is cleared. + */ +struct ti_sci_rm_psil_ops { + int (*pair)(const struct ti_sci_handle *handle, u32 nav_id, + u32 src_thread, u32 dst_thread); + int (*unpair)(const struct ti_sci_handle *handle, u32 nav_id, + u32 src_thread, u32 dst_thread); +}; + +/* UDMAP channel types */ +#define TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR 2 +#define TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR_SB 3 /* RX only */ +#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_PBRR 10 +#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_PBVR 11 +#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBRR 12 +#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBVR 13 + +#define TI_SCI_RM_UDMAP_RX_FLOW_DESC_HOST 0 +#define TI_SCI_RM_UDMAP_RX_FLOW_DESC_MONO 2 + +#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES 1 +#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES 2 +#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES 3 + +/* UDMAP TX/RX channel valid_params common declarations */ +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID BIT(0) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID BIT(1) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_CHAN_TYPE_VALID BIT(2) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID BIT(3) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID BIT(4) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PRIORITY_VALID BIT(5) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_QOS_VALID BIT(6) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ORDER_ID_VALID BIT(7) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_SCHED_PRIORITY_VALID BIT(8) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID BIT(14) + +/** + * Configures a Navigator Subsystem UDMAP transmit channel + * + * Configures a Navigator Subsystem UDMAP transmit channel registers. + * See @ti_sci_msg_rm_udmap_tx_ch_cfg_req + */ +struct ti_sci_msg_rm_udmap_tx_ch_cfg { + u32 valid_params; +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_EINFO_VALID BIT(9) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_PSWORDS_VALID BIT(10) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_SUPR_TDPKT_VALID BIT(11) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_CREDIT_COUNT_VALID BIT(12) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FDEPTH_VALID BIT(13) + u16 nav_id; + u16 index; + u8 tx_pause_on_err; + u8 tx_filt_einfo; + u8 tx_filt_pswords; + u8 tx_atype; + u8 tx_chan_type; + u8 tx_supr_tdpkt; + u16 tx_fetch_size; + u8 tx_credit_count; + u16 txcq_qnum; + u8 tx_priority; + u8 tx_qos; + u8 tx_orderid; + u16 fdepth; + u8 tx_sched_priority; + u8 tx_burst_size; +}; + +/** + * Configures a Navigator Subsystem UDMAP receive channel + * + * Configures a Navigator Subsystem UDMAP receive channel registers. 
+ * See @ti_sci_msg_rm_udmap_rx_ch_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_rx_ch_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID	BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID	BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_IGNORE_SHORT_VALID	BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_IGNORE_LONG_VALID	BIT(12)
+	u16 nav_id;
+	u16 index;
+	u16 rx_fetch_size;
+	u16 rxcq_qnum;
+	u8 rx_priority;
+	u8 rx_qos;
+	u8 rx_orderid;
+	u8 rx_sched_priority;
+	u16 flowid_start;
+	u16 flowid_cnt;
+	u8 rx_pause_on_err;
+	u8 rx_atype;
+	u8 rx_chan_type;
+	u8 rx_ignore_short;
+	u8 rx_ignore_long;
+	u8 rx_burst_size;
+};
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive flow
+ *
+ * Configures a Navigator Subsystem UDMAP receive flow's registers.
+ * See @ti_sci_msg_rm_udmap_flow_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_flow_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_EINFO_PRESENT_VALID	BIT(0)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PSINFO_PRESENT_VALID	BIT(1)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_ERROR_HANDLING_VALID	BIT(2)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DESC_TYPE_VALID		BIT(3)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SOP_OFFSET_VALID		BIT(4)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_QNUM_VALID		BIT(5)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_HI_VALID		BIT(6)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_LO_VALID		BIT(7)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_HI_VALID	BIT(8)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_LO_VALID	BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_HI_SEL_VALID	BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_LO_SEL_VALID	BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_HI_SEL_VALID	BIT(12)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_LO_SEL_VALID	BIT(13)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ0_SZ0_QNUM_VALID	BIT(14)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ1_QNUM_VALID		BIT(15)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ2_QNUM_VALID		BIT(16)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ3_QNUM_VALID		BIT(17)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PS_LOCATION_VALID	BIT(18)
+	u16 nav_id;
+	u16 flow_index;
+	u8 rx_einfo_present;
+	u8 rx_psinfo_present;
+	u8 rx_error_handling;
+	u8 rx_desc_type;
+	u16 rx_sop_offset;
+	u16 rx_dest_qnum;
+	u8 rx_src_tag_hi;
+	u8 rx_src_tag_lo;
+	u8 rx_dest_tag_hi;
+	u8 rx_dest_tag_lo;
+	u8 rx_src_tag_hi_sel;
+	u8 rx_src_tag_lo_sel;
+	u8 rx_dest_tag_hi_sel;
+	u8 rx_dest_tag_lo_sel;
+	u16 rx_fdq0_sz0_qnum;
+	u16 rx_fdq1_qnum;
+	u16 rx_fdq2_qnum;
+	u16 rx_fdq3_qnum;
+	u8 rx_ps_location;
+};
+
+/**
+ * struct ti_sci_rm_udmap_ops - UDMA Management operations
+ * @tx_ch_cfg: configure SoC Navigator Subsystem UDMA transmit channel.
+ * @rx_ch_cfg: configure SoC Navigator Subsystem UDMA receive channel.
+ * @rx_flow_cfg: configure SoC Navigator Subsystem UDMA receive flow.
+ */ +struct ti_sci_rm_udmap_ops { + int (*tx_ch_cfg)(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_tx_ch_cfg *params); + int (*rx_ch_cfg)(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_rx_ch_cfg *params); + int (*rx_flow_cfg)(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_flow_cfg *params); +}; + /** * struct ti_sci_ops - Function support for TI SCI * @dev_ops: Device specific operations @@ -254,6 +466,9 @@ struct ti_sci_ops { struct ti_sci_clk_ops clk_ops; struct ti_sci_rm_core_ops rm_core_ops; struct ti_sci_rm_irq_ops rm_irq_ops; + struct ti_sci_rm_ringacc_ops rm_ring_ops; + struct ti_sci_rm_psil_ops rm_psil_ops; + struct ti_sci_rm_udmap_ops rm_udmap_ops; }; /** -- cgit v1.2.3-59-g8ed1b From 1e407f337f4015c8ffc56e7cfd70e06b2e9fc9da Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 5 Jun 2019 17:33:34 -0500 Subject: firmware: ti_sci: Add support for processor control Texas Instrument's System Control Interface (TI-SCI) Message Protocol is used in Texas Instrument's System on Chip (SoC) such as those in K3 family AM654 SoC to communicate between various compute processors with a central system controller entity. The system controller provides various services including the control of other compute processors within the SoC. Extend the TI-SCI protocol support to add various TI-SCI commands to invoke services associated with power and reset control, and boot vector management of the various compute processors from the Linux kernel. Signed-off-by: Suman Anna Signed-off-by: Tero Kristo Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 350 +++++++++++++++++++++++++++++++++ drivers/firmware/ti_sci.h | 135 +++++++++++++ include/linux/soc/ti/ti_sci_protocol.h | 31 +++ 3 files changed, 516 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 02fa196428d8..b47e33e7411f 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2479,6 +2479,348 @@ fail: return ret; } +/** + * ti_sci_cmd_proc_request() - Command to request a physical processor control + * @handle: Pointer to TI SCI handle + * @proc_id: Processor ID this request is for + * + * Return: 0 if all went well, else returns appropriate error value. + */ +static int ti_sci_cmd_proc_request(const struct ti_sci_handle *handle, + u8 proc_id) +{ + struct ti_sci_msg_req_proc_request *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_info *info; + struct ti_sci_xfer *xfer; + struct device *dev; + int ret = 0; + + if (!handle) + return -EINVAL; + if (IS_ERR(handle)) + return PTR_ERR(handle); + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_REQUEST, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_req_proc_request *)xfer->xfer_buf; + req->processor_id = proc_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf; + + ret = ti_sci_is_response_ack(resp) ? 
0 : -ENODEV; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_proc_release() - Command to release a physical processor control + * @handle: Pointer to TI SCI handle + * @proc_id: Processor ID this request is for + * + * Return: 0 if all went well, else returns appropriate error value. + */ +static int ti_sci_cmd_proc_release(const struct ti_sci_handle *handle, + u8 proc_id) +{ + struct ti_sci_msg_req_proc_release *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_info *info; + struct ti_sci_xfer *xfer; + struct device *dev; + int ret = 0; + + if (!handle) + return -EINVAL; + if (IS_ERR(handle)) + return PTR_ERR(handle); + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_RELEASE, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_req_proc_release *)xfer->xfer_buf; + req->processor_id = proc_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf; + + ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_proc_handover() - Command to handover a physical processor + * control to a host in the processor's access + * control list. + * @handle: Pointer to TI SCI handle + * @proc_id: Processor ID this request is for + * @host_id: Host ID to get the control of the processor + * + * Return: 0 if all went well, else returns appropriate error value. + */ +static int ti_sci_cmd_proc_handover(const struct ti_sci_handle *handle, + u8 proc_id, u8 host_id) +{ + struct ti_sci_msg_req_proc_handover *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_info *info; + struct ti_sci_xfer *xfer; + struct device *dev; + int ret = 0; + + if (!handle) + return -EINVAL; + if (IS_ERR(handle)) + return PTR_ERR(handle); + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_HANDOVER, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_req_proc_handover *)xfer->xfer_buf; + req->processor_id = proc_id; + req->host_id = host_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf; + + ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_proc_set_config() - Command to set the processor boot + * configuration flags + * @handle: Pointer to TI SCI handle + * @proc_id: Processor ID this request is for + * @config_flags_set: Configuration flags to be set + * @config_flags_clear: Configuration flags to be cleared. + * + * Return: 0 if all went well, else returns appropriate error value. 
+ */
+static int ti_sci_cmd_proc_set_config(const struct ti_sci_handle *handle,
+				      u8 proc_id, u64 bootvector,
+				      u32 config_flags_set,
+				      u32 config_flags_clear)
+{
+	struct ti_sci_msg_req_set_config *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_CONFIG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_config *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->bootvector_low = bootvector & TI_SCI_ADDR_LOW_MASK;
+	req->bootvector_high = (bootvector & TI_SCI_ADDR_HIGH_MASK) >>
+				TI_SCI_ADDR_HIGH_SHIFT;
+	req->config_flags_set = config_flags_set;
+	req->config_flags_clear = config_flags_clear;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_set_control() - Command to set the processor boot
+ *				   control flags
+ * @handle:		Pointer to TI SCI handle
+ * @proc_id:		Processor ID this request is for
+ * @control_flags_set:	Control flags to be set
+ * @control_flags_clear: Control flags to be cleared
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_set_control(const struct ti_sci_handle *handle,
+				       u8 proc_id, u32 control_flags_set,
+				       u32 control_flags_clear)
+{
+	struct ti_sci_msg_req_set_ctrl *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_CTRL,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_ctrl *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->control_flags_set = control_flags_set;
+	req->control_flags_clear = control_flags_clear;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_get_status() - Command to get the processor boot status
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ * @bv:		Pointer in which the processor boot vector is returned
+ * @cfg_flags:	Pointer in which the current config flags are returned
+ * @ctrl_flags:	Pointer in which the current control flags are returned
+ * @sts_flags:	Pointer in which the current status flags are returned
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */ +static int ti_sci_cmd_proc_get_status(const struct ti_sci_handle *handle, + u8 proc_id, u64 *bv, u32 *cfg_flags, + u32 *ctrl_flags, u32 *sts_flags) +{ + struct ti_sci_msg_resp_get_status *resp; + struct ti_sci_msg_req_get_status *req; + struct ti_sci_info *info; + struct ti_sci_xfer *xfer; + struct device *dev; + int ret = 0; + + if (!handle) + return -EINVAL; + if (IS_ERR(handle)) + return PTR_ERR(handle); + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_GET_STATUS, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_req_get_status *)xfer->xfer_buf; + req->processor_id = proc_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_resp_get_status *)xfer->tx_message.buf; + + if (!ti_sci_is_response_ack(resp)) { + ret = -ENODEV; + } else { + *bv = (resp->bootvector_low & TI_SCI_ADDR_LOW_MASK) | + (((u64)resp->bootvector_high << TI_SCI_ADDR_HIGH_SHIFT) & + TI_SCI_ADDR_HIGH_MASK); + *cfg_flags = resp->config_flags; + *ctrl_flags = resp->control_flags; + *sts_flags = resp->status_flags; + } + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + /* * ti_sci_setup_ops() - Setup the operations structures * @info: pointer to TISCI pointer @@ -2494,6 +2836,7 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops; struct ti_sci_rm_psil_ops *psilops = &ops->rm_psil_ops; struct ti_sci_rm_udmap_ops *udmap_ops = &ops->rm_udmap_ops; + struct ti_sci_proc_ops *pops = &ops->proc_ops; core_ops->reboot_device = ti_sci_cmd_core_reboot; @@ -2543,6 +2886,13 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) udmap_ops->tx_ch_cfg = ti_sci_cmd_rm_udmap_tx_ch_cfg; udmap_ops->rx_ch_cfg = ti_sci_cmd_rm_udmap_rx_ch_cfg; udmap_ops->rx_flow_cfg = ti_sci_cmd_rm_udmap_rx_flow_cfg; + + pops->request = ti_sci_cmd_proc_request; + pops->release = ti_sci_cmd_proc_release; + pops->handover = ti_sci_cmd_proc_handover; + pops->set_config = ti_sci_cmd_proc_set_config; + pops->set_control = ti_sci_cmd_proc_set_control; + pops->get_status = ti_sci_cmd_proc_get_status; } /** diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 2bb81ec7793c..662dcffef311 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -71,6 +71,14 @@ #define TISCI_MSG_RM_UDMAP_FLOW_GET_CFG 0x1232 #define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_GET_CFG 0x1233 +/* Processor Control requests */ +#define TI_SCI_MSG_PROC_REQUEST 0xc000 +#define TI_SCI_MSG_PROC_RELEASE 0xc001 +#define TI_SCI_MSG_PROC_HANDOVER 0xc005 +#define TI_SCI_MSG_SET_CONFIG 0xc100 +#define TI_SCI_MSG_SET_CTRL 0xc101 +#define TI_SCI_MSG_GET_STATUS 0xc400 + /** * struct ti_sci_msg_hdr - Generic Message Header for All messages and responses * @type: Type of messages: One of TI_SCI_MSG* values @@ -1238,4 +1246,131 @@ struct ti_sci_msg_rm_udmap_flow_cfg_req { u8 rx_ps_location; } __packed; +/** + * struct ti_sci_msg_req_proc_request - Request a processor + * @hdr: Generic Header + * @processor_id: ID of processor being requested + * + * Request type is TI_SCI_MSG_PROC_REQUEST, response is a generic ACK/NACK + * message. 
+ */
+struct ti_sci_msg_req_proc_request {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_proc_release - Release a processor
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being released
+ *
+ * Request type is TI_SCI_MSG_PROC_RELEASE, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_release {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_proc_handover - Handover a processor to a host
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being handed over
+ * @host_id:		Host ID the control needs to be transferred to
+ *
+ * Request type is TI_SCI_MSG_PROC_HANDOVER, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_handover {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u8 host_id;
+} __packed;
+
+/* Boot Vector masks */
+#define TI_SCI_ADDR_LOW_MASK		GENMASK_ULL(31, 0)
+#define TI_SCI_ADDR_HIGH_MASK		GENMASK_ULL(63, 32)
+#define TI_SCI_ADDR_HIGH_SHIFT		32
+
+/**
+ * struct ti_sci_msg_req_set_config - Set Processor boot configuration
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being configured
+ * @bootvector_low:	Lower 32 bit address (Little Endian) of boot vector
+ * @bootvector_high:	Higher 32 bit address (Little Endian) of boot vector
+ * @config_flags_set:	Optional Processor specific Config Flags to set.
+ *			Setting a bit here implies the corresponding mode
+ *			will be set
+ * @config_flags_clear:	Optional Processor specific Config Flags to clear.
+ *			Setting a bit here implies the corresponding mode
+ *			will be cleared
+ *
+ * Request type is TI_SCI_MSG_SET_CONFIG, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_set_config {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 bootvector_low;
+	u32 bootvector_high;
+	u32 config_flags_set;
+	u32 config_flags_clear;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_set_ctrl - Set Processor boot control flags
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being configured
+ * @control_flags_set:	Optional Processor specific Control Flags to set.
+ *			Setting a bit here implies the corresponding mode
+ *			will be set
+ * @control_flags_clear: Optional Processor specific Control Flags to clear.
+ *			Setting a bit here implies the corresponding mode
+ *			will be cleared
+ *
+ * Request type is TI_SCI_MSG_SET_CTRL, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_set_ctrl {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 control_flags_set;
+	u32 control_flags_clear;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_get_status - Processor boot status request
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor whose status is being requested
+ *
+ * Request type is TI_SCI_MSG_GET_STATUS, response is an appropriate
+ * message, or NACK in case of inability to satisfy request.
+ */
+struct ti_sci_msg_req_get_status {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_resp_get_status - Processor boot status response
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor whose status is returned
+ * @bootvector_low:	Lower 32 bit address (Little Endian) of boot vector
+ * @bootvector_high:	Higher 32 bit address (Little Endian) of boot vector
+ * @config_flags:	Optional Processor specific Config Flags set currently
+ * @control_flags:	Optional Processor specific Control Flags set currently
+ * @status_flags:	Optional Processor specific Status Flags set currently
+ *
+ * Response structure to a TI_SCI_MSG_GET_STATUS request.
+ */
+struct ti_sci_msg_resp_get_status {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 bootvector_low;
+	u32 bootvector_high;
+	u32 config_flags;
+	u32 control_flags;
+	u32 status_flags;
+} __packed;
+
 #endif /* __TI_SCI_H */
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 4fd9bff5806b..7b3762f68df9 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -453,12 +453,42 @@ struct ti_sci_rm_udmap_ops {
 		      const struct ti_sci_msg_rm_udmap_flow_cfg *params);
 };
 
+/**
+ * struct ti_sci_proc_ops - Processor Control operations
+ * @request:	Request control of a physical processor. The requesting host
+ *		should be in the processor access list
+ * @release:	Relinquish control of a physical processor
+ * @handover:	Hand over control of a physical processor to another host
+ *		in the permitted list
+ * @set_config:	Set base configuration of a processor
+ * @set_control: Setup limited control flags in specific cases
+ * @get_status:	Get the state of a physical processor
+ *
+ * NOTE: The following parameters are generic in nature for all these ops:
+ * -handle: Pointer to TI SCI handle as retrieved by ti_sci_get_handle()
+ * -pid: Processor ID
+ * -hid: Host ID
+ */
+struct ti_sci_proc_ops {
+	int (*request)(const struct ti_sci_handle *handle, u8 pid);
+	int (*release)(const struct ti_sci_handle *handle, u8 pid);
+	int (*handover)(const struct ti_sci_handle *handle, u8 pid, u8 hid);
+	int (*set_config)(const struct ti_sci_handle *handle, u8 pid,
+			  u64 boot_vector, u32 cfg_set, u32 cfg_clr);
+	int (*set_control)(const struct ti_sci_handle *handle, u8 pid,
+			   u32 ctrl_set, u32 ctrl_clr);
+	int (*get_status)(const struct ti_sci_handle *handle, u8 pid,
+			  u64 *boot_vector, u32 *cfg_flags, u32 *ctrl_flags,
+			  u32 *status_flags);
+};
+
 /**
  * struct ti_sci_ops - Function support for TI SCI
  * @dev_ops: Device specific operations
  * @clk_ops: Clock specific operations
  * @rm_core_ops: Resource management core operations.
  * @rm_irq_ops: IRQ management specific operations
+ * @proc_ops: Processor Control specific operations
  */
 struct ti_sci_ops {
 	struct ti_sci_core_ops core_ops;
@@ -469,6 +499,7 @@ struct ti_sci_ops {
 	struct ti_sci_rm_ringacc_ops rm_ring_ops;
 	struct ti_sci_rm_psil_ops rm_psil_ops;
 	struct ti_sci_rm_udmap_ops rm_udmap_ops;
+	struct ti_sci_proc_ops proc_ops;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 18c8c0954d15105b02f7d2f556b9eafae426871f Mon Sep 17 00:00:00 2001
From: Wesley Sheng 
Date: Tue, 30 Apr 2019 18:04:29 +0800
Subject: NTB: correct ntb_dev_ops and ntb_dev comment typos

The comments for ntb_dev_ops and ntb_dev incorrectly referred to
ntb_ctx_ops and ntb_device.
Signed-off-by: Wesley Sheng Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- include/linux/ntb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 56a92e3ae3ae..604abc883741 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -205,7 +205,7 @@ static inline int ntb_ctx_ops_is_valid(const struct ntb_ctx_ops *ops) } /** - * struct ntb_ctx_ops - ntb device operations + * struct ntb_dev_ops - ntb device operations * @port_number: See ntb_port_number(). * @peer_port_count: See ntb_peer_port_count(). * @peer_port_number: See ntb_peer_port_number(). @@ -404,7 +404,7 @@ struct ntb_client { #define drv_ntb_client(__drv) container_of((__drv), struct ntb_client, drv) /** - * struct ntb_device - ntb device + * struct ntb_dev - ntb device * @dev: Linux device object. * @pdev: PCI device entry of the ntb. * @topo: Detected topology of the ntb. -- cgit v1.2.3-59-g8ed1b From d7cc609fb679e11dc2b00cbe6c50cbd37ec4bfa2 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:30:51 -0600 Subject: PCI/MSI: Support allocating virtual MSI interrupts For NTB devices, we want to be able to trigger MSI interrupts through a memory window. In these cases we may want to use more interrupts than the NTB PCI device has available in its MSI-X table. We allow for this by creating a new 'virtual' interrupt. These interrupts are allocated as usual but are not programmed into the MSI-X table (as there may not be space for them). The MSI address and data will then handled through an NTB MSI library introduced later in this series. Signed-off-by: Logan Gunthorpe Acked-by: Bjorn Helgaas Signed-off-by: Jon Mason --- drivers/pci/msi.c | 54 ++++++++++++++++++++++++++++++++++++++++++++--------- include/linux/msi.h | 8 ++++++++ include/linux/pci.h | 9 +++++++++ 3 files changed, 62 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index e039b740fe74..ace978deaf93 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -192,6 +192,9 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) { + if (desc->msi_attrib.is_virtual) + return NULL; + return desc->mask_base + desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; } @@ -206,14 +209,19 @@ static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag) { u32 mask_bits = desc->masked; + void __iomem *desc_addr; if (pci_msi_ignore_mask) return 0; + desc_addr = pci_msix_desc_addr(desc); + if (!desc_addr) + return 0; mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; if (flag) mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; - writel(mask_bits, pci_msix_desc_addr(desc) + PCI_MSIX_ENTRY_VECTOR_CTRL); + + writel(mask_bits, desc_addr + PCI_MSIX_ENTRY_VECTOR_CTRL); return mask_bits; } @@ -273,6 +281,11 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); + if (!base) { + WARN_ON(1); + return; + } + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR); msg->data = readl(base + PCI_MSIX_ENTRY_DATA); @@ -303,6 +316,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) } else if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); + if (!base) + goto skip; + writel(msg->address_lo, 
base + PCI_MSIX_ENTRY_LOWER_ADDR); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); writel(msg->data, base + PCI_MSIX_ENTRY_DATA); @@ -327,7 +343,13 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) msg->data); } } + +skip: entry->msg = *msg; + + if (entry->write_msi_msg) + entry->write_msi_msg(entry, entry->write_msi_msg_data); + } void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) @@ -550,6 +572,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd) entry->msi_attrib.is_msix = 0; entry->msi_attrib.is_64 = !!(control & PCI_MSI_FLAGS_64BIT); + entry->msi_attrib.is_virtual = 0; entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = !!(control & PCI_MSI_FLAGS_MASKBIT); entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ @@ -674,6 +697,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct irq_affinity_desc *curmsk, *masks = NULL; struct msi_desc *entry; int ret, i; + int vec_count = pci_msix_vec_count(dev); if (affd) masks = irq_create_affinity_masks(nvec, affd); @@ -696,6 +720,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.entry_nr = entries[i].entry; else entry->msi_attrib.entry_nr = i; + + entry->msi_attrib.is_virtual = + entry->msi_attrib.entry_nr >= vec_count; + entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; @@ -714,12 +742,19 @@ static void msix_program_entries(struct pci_dev *dev, { struct msi_desc *entry; int i = 0; + void __iomem *desc_addr; for_each_pci_msi_entry(entry, dev) { if (entries) entries[i++].vector = entry->irq; - entry->masked = readl(pci_msix_desc_addr(entry) + - PCI_MSIX_ENTRY_VECTOR_CTRL); + + desc_addr = pci_msix_desc_addr(entry); + if (desc_addr) + entry->masked = readl(desc_addr + + PCI_MSIX_ENTRY_VECTOR_CTRL); + else + entry->masked = 0; + msix_mask_irq(entry, 1); } } @@ -932,7 +967,7 @@ int pci_msix_vec_count(struct pci_dev *dev) EXPORT_SYMBOL(pci_msix_vec_count); static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, - int nvec, struct irq_affinity *affd) + int nvec, struct irq_affinity *affd, int flags) { int nr_entries; int i, j; @@ -943,7 +978,7 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, nr_entries = pci_msix_vec_count(dev); if (nr_entries < 0) return nr_entries; - if (nvec > nr_entries) + if (nvec > nr_entries && !(flags & PCI_IRQ_VIRTUAL)) return nr_entries; if (entries) { @@ -1079,7 +1114,8 @@ EXPORT_SYMBOL(pci_enable_msi); static int __pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, - int maxvec, struct irq_affinity *affd) + int maxvec, struct irq_affinity *affd, + int flags) { int rc, nvec = maxvec; @@ -1096,7 +1132,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, return -ENOSPC; } - rc = __pci_enable_msix(dev, entries, nvec, affd); + rc = __pci_enable_msix(dev, entries, nvec, affd, flags); if (rc == 0) return nvec; @@ -1127,7 +1163,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, int maxvec) { - return __pci_enable_msix_range(dev, entries, minvec, maxvec, NULL); + return __pci_enable_msix_range(dev, entries, minvec, maxvec, NULL, 0); } EXPORT_SYMBOL(pci_enable_msix_range); @@ -1167,7 +1203,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, if (flags & PCI_IRQ_MSIX) { msix_vecs = __pci_enable_msix_range(dev, NULL, min_vecs, - max_vecs, 
affd); + max_vecs, affd, flags); if (msix_vecs > 0) return msix_vecs; } diff --git a/include/linux/msi.h b/include/linux/msi.h index d48e919d55ae..8ad679e9d9c0 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -64,6 +64,10 @@ struct ti_sci_inta_msi_desc { * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor * + * @write_msi_msg: Callback that may be called when the MSI message + * address or data changes + * @write_msi_msg_data: Data parameter for the callback. + * * @masked: [PCI MSI/X] Mask bits * @is_msix: [PCI MSI/X] True if MSI-X * @multiple: [PCI MSI/X] log2 num of messages allocated @@ -90,6 +94,9 @@ struct msi_desc { const void *iommu_cookie; #endif + void (*write_msi_msg)(struct msi_desc *entry, void *data); + void *write_msi_msg_data; + union { /* PCI MSI/X specific data */ struct { @@ -100,6 +107,7 @@ struct msi_desc { u8 multi_cap : 3; u8 maskbit : 1; u8 is_64 : 1; + u8 is_virtual : 1; u16 entry_nr; unsigned default_irq; } msi_attrib; diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..19b5c27c6f63 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1362,6 +1362,15 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ #define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ #define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ + +/* + * Virtual interrupts allow for more interrupts to be allocated + * than the device has interrupts for. These are not programmed + * into the device's MSI-X table and must be handled by some + * other driver means. + */ +#define PCI_IRQ_VIRTUAL (1 << 4) + #define PCI_IRQ_ALL_TYPES \ (PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX) -- cgit v1.2.3-59-g8ed1b From 246a42c51bc5dd247629f86c87d5e1b7628343c4 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:30:53 -0600 Subject: NTB: Introduce helper functions to calculate logical port number This patch introduces the "Logical Port Number" which is similar to the "Port Number" in that it enumerates the ports in the system. The original (or Physical) "Port Number" can be any number used by the hardware to uniquely identify a port in the system. The "Logical Port Number" enumerates all ports in the system from 0 to the number of ports minus one. For example a system with 5 ports might have the following port numbers which would be enumerated thusly: Port Number: 1 2 5 7 116 Logical Port Number: 0 1 2 3 4 The logical port number is useful when calculating which resources to use for which peers. So we thus define two helper functions: ntb_logical_port_number() and ntb_peer_logical_port_number() which provide the "Logical Port Number" for the local port and any peer respectively. Signed-off-by: Logan Gunthorpe Cc: Dave Jiang Cc: Allen Hubbe Cc: Serge Semin Signed-off-by: Jon Mason --- include/linux/ntb.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 604abc883741..2fadd0352683 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -616,7 +616,6 @@ static inline int ntb_port_number(struct ntb_dev *ntb) return ntb->ops->port_number(ntb); } - /** * ntb_peer_port_count() - get the number of peer device ports * @ntb: NTB device context. 
@@ -653,6 +652,58 @@ static inline int ntb_peer_port_number(struct ntb_dev *ntb, int pidx) return ntb->ops->peer_port_number(ntb, pidx); } +/** + * ntb_logical_port_number() - get the logical port number of the local port + * @ntb: NTB device context. + * + * The Logical Port Number is defined to be a unique number for each + * port starting from zero through to the number of ports minus one. + * This is in contrast to the Port Number where each port can be assigned + * any unique physical number by the hardware. + * + * The logical port number is useful for calculating the resource indexes + * used by peers. + * + * Return: the logical port number or negative value indicating an error + */ +static inline int ntb_logical_port_number(struct ntb_dev *ntb) +{ + int lport = ntb_port_number(ntb); + int pidx; + + if (lport < 0) + return lport; + + for (pidx = 0; pidx < ntb_peer_port_count(ntb); pidx++) + if (lport <= ntb_peer_port_number(ntb, pidx)) + return pidx; + + return pidx; +} + +/** + * ntb_peer_logical_port_number() - get the logical peer port by given index + * @ntb: NTB device context. + * @pidx: Peer port index. + * + * The Logical Port Number is defined to be a unique number for each + * port starting from zero through to the number of ports minus one. + * This is in contrast to the Port Number where each port can be assigned + * any unique physical number by the hardware. + * + * The logical port number is useful for calculating the resource indexes + * used by peers. + * + * Return: the peer's logical port number or negative value indicating an error + */ +static inline int ntb_peer_logical_port_number(struct ntb_dev *ntb, int pidx) +{ + if (ntb_peer_port_number(ntb, pidx) < ntb_port_number(ntb)) + return pidx; + else + return pidx + 1; +} + /** * ntb_peer_port_idx() - get the peer device port index by given port number * @ntb: NTB device context. -- cgit v1.2.3-59-g8ed1b From 5f1b1f065c791de71017502ed3ba46779e231d9b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:30:54 -0600 Subject: NTB: Introduce functions to calculate multi-port resource index When using multi-ports each port uses resources (dbs, msgs, mws, etc) on every other port. Creating a mapping for these resources such that each port has a corresponding resource on every other port is a bit tricky. Introduce the ntb_peer_resource_idx() function for this purpose. It returns the peer resource number that will correspond with the local peer index on the remote peer. Also, introduce ntb_peer_highest_mw_idx() which will use ntb_peer_resource_idx() but return the MW index starting with the highest index and working down. Signed-off-by: Logan Gunthorpe Cc: Dave Jiang Cc: Allen Hubbe Signed-off-by: Jon Mason --- include/linux/ntb.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 2fadd0352683..bed421b9579b 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -1557,4 +1557,74 @@ static inline int ntb_peer_msg_write(struct ntb_dev *ntb, int pidx, int midx, return ntb->ops->peer_msg_write(ntb, pidx, midx, msg); } +/** + * ntb_peer_resource_idx() - get a resource index for a given peer idx + * @ntb: NTB device context. + * @pidx: Peer port index. + * + * When constructing a graph of peers, each remote peer must use a different + * resource index (mw, doorbell, etc) to communicate with each other + * peer. 
+ *
+ * In a two peer system, this function should always return 0 such that
+ * resource 0 points to the remote peer on both ports.
+ *
+ * In a 5 peer system, this function will return the following matrix:
+ *
+ *	pidx \ port    0      1      2      3      4
+ *	0              0      0      1      2      3
+ *	1              0      1      1      2      3
+ *	2              0      1      2      2      3
+ *	3              0      1      2      3      3
+ *
+ * For example, if this function is used to program the peers' memory
+ * windows, port 0 will program MW 0 on all its peers to point to itself.
+ * Port 1 will program MW 0 in port 0 to point to itself and MW 1 on all
+ * other ports, and so on.
+ *
+ * For the legacy two host case, ntb_port_number() and ntb_peer_port_number()
+ * both return zero and therefore this function will always return zero.
+ * So MW 0 on each host would be programmed to point to the other host.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_resource_idx(struct ntb_dev *ntb, int pidx)
+{
+	int local_port, peer_port;
+
+	if (pidx >= ntb_peer_port_count(ntb))
+		return -EINVAL;
+
+	local_port = ntb_logical_port_number(ntb);
+	peer_port = ntb_peer_logical_port_number(ntb, pidx);
+
+	if (peer_port < local_port)
+		return local_port - 1;
+	else
+		return local_port;
+}
+
+/**
+ * ntb_peer_highest_mw_idx() - get a memory window index for a given peer idx
+ *	using the highest index memory windows first
+ *
+ * @ntb:	NTB device context.
+ * @pidx:	Peer port index.
+ *
+ * Like ntb_peer_resource_idx(), except it returns indexes starting with
+ * last memory window index.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx)
+{
+	int ret;
+
+	ret = ntb_peer_resource_idx(ntb, pidx);
+	if (ret < 0)
+		return ret;
+
+	return ntb_mw_count(ntb, pidx) - ret - 1;
+}
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 26b3a37b928457ba2cd98eaf6d7b0feca5a30fa6 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe 
Date: Thu, 23 May 2019 16:30:56 -0600
Subject: NTB: Introduce MSI library

The NTB MSI library allows passing MSI interrupts across a memory
window. This offers similar functionality to doorbells or messages,
except it will often have much better latency, and the client can
potentially use significantly more remote interrupts than typical
hardware provides for doorbells. (Which can be important in
high-multiport setups.)

The library utilizes one memory window per peer and uses the highest
index memory windows. Before any ntb_msi function may be used, the user
must call ntb_msi_init(). It may then set up and tear down the memory
windows when the link state changes using ntb_msi_setup_mws() and
ntb_msi_clear_mws().

The peer which receives the interrupt must call ntbm_msi_request_irq()
to assign the interrupt handler (this function is functionally
similar to devm_request_irq()) and the returned descriptor must be
transferred to the peer which can use it to trigger the interrupt.
The triggering peer, once having received the descriptor, can
trigger the interrupt by calling ntb_msi_peer_trigger().
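To make the flow concrete, here is a rough sketch of a client using the
library (hypothetical client code written against the API added below;
error handling, link-event wiring, and the out-of-band exchange of the
descriptor between peers are elided):

	/* Receiving side: allocate an IRQ and obtain the descriptor. */
	static irqreturn_t my_client_isr(int irq, void *dev_id)
	{
		/* The peer wrote desc->data into our inbound MW. */
		return IRQ_HANDLED;
	}

	static int my_client_start(struct ntb_dev *ntb,
				   struct ntb_msi_desc *my_desc)
	{
		int rc, irq;

		rc = ntb_msi_init(ntb, NULL);	/* once, at probe time */
		if (rc)
			return rc;

		rc = ntb_msi_setup_mws(ntb);	/* after link up */
		if (rc)
			return rc;

		irq = ntbm_msi_request_irq(ntb, my_client_isr, "my_client",
					   ntb, my_desc);
		if (irq < 0)
			return irq;

		/* *my_desc must now be sent to the peer somehow */
		return 0;
	}

	/* Sending side, once the descriptor has been received: */
	static int my_client_notify(struct ntb_dev *ntb, int peer,
				    struct ntb_msi_desc *peer_desc)
	{
		return ntb_msi_peer_trigger(ntb, peer, peer_desc);
	}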
Signed-off-by: Logan Gunthorpe Cc: Dave Jiang Cc: Allen Hubbe Signed-off-by: Jon Mason --- drivers/ntb/Kconfig | 11 ++ drivers/ntb/Makefile | 3 +- drivers/ntb/msi.c | 415 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/ntb.h | 73 +++++++++ 4 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 drivers/ntb/msi.c (limited to 'include/linux') diff --git a/drivers/ntb/Kconfig b/drivers/ntb/Kconfig index 95944e52fa36..5760764052be 100644 --- a/drivers/ntb/Kconfig +++ b/drivers/ntb/Kconfig @@ -12,6 +12,17 @@ menuconfig NTB if NTB +config NTB_MSI + bool "MSI Interrupt Support" + depends on PCI_MSI + help + Support using MSI interrupt forwarding instead of (or in addition to) + hardware doorbells. MSI interrupts typically offer lower latency + than doorbells and more MSI interrupts can be made available to + clients. However this requires an extra memory window and support + in the hardware driver for creating the MSI interrupts. + + If unsure, say N. source "drivers/ntb/hw/Kconfig" source "drivers/ntb/test/Kconfig" diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile index 537226f8e78d..cc27ad2ef150 100644 --- a/drivers/ntb/Makefile +++ b/drivers/ntb/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_NTB) += ntb.o hw/ test/ obj-$(CONFIG_NTB_TRANSPORT) += ntb_transport.o -ntb-y := core.o +ntb-y := core.o +ntb-$(CONFIG_NTB_MSI) += msi.o diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c new file mode 100644 index 000000000000..9dddf133658f --- /dev/null +++ b/drivers/ntb/msi.c @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) + +#include +#include +#include +#include +#include + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("0.1"); +MODULE_AUTHOR("Logan Gunthorpe "); +MODULE_DESCRIPTION("NTB MSI Interrupt Library"); + +struct ntb_msi { + u64 base_addr; + u64 end_addr; + + void (*desc_changed)(void *ctx); + + u32 __iomem *peer_mws[]; +}; + +/** + * ntb_msi_init() - Initialize the MSI context + * @ntb: NTB device context + * + * This function must be called before any other ntb_msi function. + * It initializes the context for MSI operations and maps + * the peer memory windows. + * + * This function reserves the last N outbound memory windows (where N + * is the number of peers). + * + * Return: Zero on success, otherwise a negative error number. 
+ */
+int ntb_msi_init(struct ntb_dev *ntb,
+		 void (*desc_changed)(void *ctx))
+{
+	phys_addr_t mw_phys_addr;
+	resource_size_t mw_size;
+	size_t struct_size;
+	int peer_widx;
+	int peers;
+	int ret;
+	int i;
+
+	peers = ntb_peer_port_count(ntb);
+	if (peers <= 0)
+		return -EINVAL;
+
+	struct_size = sizeof(*ntb->msi) + sizeof(*ntb->msi->peer_mws) * peers;
+
+	ntb->msi = devm_kzalloc(&ntb->dev, struct_size, GFP_KERNEL);
+	if (!ntb->msi)
+		return -ENOMEM;
+
+	ntb->msi->desc_changed = desc_changed;
+
+	for (i = 0; i < peers; i++) {
+		peer_widx = ntb_peer_mw_count(ntb) - 1 - i;
+
+		ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr,
+					   &mw_size);
+		if (ret)
+			goto unroll;
+
+		ntb->msi->peer_mws[i] = devm_ioremap(&ntb->dev, mw_phys_addr,
+						     mw_size);
+		if (!ntb->msi->peer_mws[i]) {
+			ret = -EFAULT;
+			goto unroll;
+		}
+	}
+
+	return 0;
+
+unroll:
+	for (i = 0; i < peers; i++)
+		if (ntb->msi->peer_mws[i])
+			devm_iounmap(&ntb->dev, ntb->msi->peer_mws[i]);
+
+	devm_kfree(&ntb->dev, ntb->msi);
+	ntb->msi = NULL;
+	return ret;
+}
+EXPORT_SYMBOL(ntb_msi_init);
+
+/**
+ * ntb_msi_setup_mws() - Initialize the MSI inbound memory windows
+ * @ntb:	NTB device context
+ *
+ * This function sets up the required inbound memory windows. It should be
+ * called from a work function after a link up event.
+ *
+ * Over the entire network, this function will reserve the last N
+ * inbound memory windows for each peer (where N is the number of peers).
+ *
+ * ntb_msi_init() must be called before this function.
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_setup_mws(struct ntb_dev *ntb)
+{
+	struct msi_desc *desc;
+	u64 addr;
+	int peer, peer_widx;
+	resource_size_t addr_align, size_align, size_max;
+	resource_size_t mw_size = SZ_32K;
+	resource_size_t mw_min_size = mw_size;
+	int i;
+	int ret;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	desc = first_msi_entry(&ntb->pdev->dev);
+	addr = desc->msg.address_lo + ((uint64_t)desc->msg.address_hi << 32);
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0)
+			return peer_widx;
+
+		ret = ntb_mw_get_align(ntb, peer, peer_widx, &addr_align,
+				       NULL, NULL);
+		if (ret)
+			return ret;
+
+		addr &= ~(addr_align - 1);
+	}
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0) {
+			ret = peer_widx;
+			goto error_out;
+		}
+
+		ret = ntb_mw_get_align(ntb, peer, peer_widx, NULL,
+				       &size_align, &size_max);
+		if (ret)
+			goto error_out;
+
+		mw_size = round_up(mw_size, size_align);
+		mw_size = max(mw_size, size_max);
+		if (mw_size < mw_min_size)
+			mw_min_size = mw_size;
+
+		ret = ntb_mw_set_trans(ntb, peer, peer_widx,
+				       addr, mw_size);
+		if (ret)
+			goto error_out;
+	}
+
+	ntb->msi->base_addr = addr;
+	ntb->msi->end_addr = addr + mw_min_size;
+
+	return 0;
+
+error_out:
+	for (i = 0; i < peer; i++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, i);
+		if (peer_widx < 0)
+			continue;
+
+		ntb_mw_clear_trans(ntb, i, peer_widx);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ntb_msi_setup_mws);
+
+/**
+ * ntb_msi_clear_mws() - Clear all inbound memory windows
+ * @ntb:	NTB device context
+ *
+ * This function tears down the resources used by ntb_msi_setup_mws().
+ */ +void ntb_msi_clear_mws(struct ntb_dev *ntb) +{ + int peer; + int peer_widx; + + for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) { + peer_widx = ntb_peer_highest_mw_idx(ntb, peer); + if (peer_widx < 0) + continue; + + ntb_mw_clear_trans(ntb, peer, peer_widx); + } +} +EXPORT_SYMBOL(ntb_msi_clear_mws); + +struct ntb_msi_devres { + struct ntb_dev *ntb; + struct msi_desc *entry; + struct ntb_msi_desc *msi_desc; +}; + +static int ntb_msi_set_desc(struct ntb_dev *ntb, struct msi_desc *entry, + struct ntb_msi_desc *msi_desc) +{ + u64 addr; + + addr = entry->msg.address_lo + + ((uint64_t)entry->msg.address_hi << 32); + + if (addr < ntb->msi->base_addr || addr >= ntb->msi->end_addr) { + dev_warn_once(&ntb->dev, + "IRQ %d: MSI Address not within the memory window (%llx, [%llx %llx])\n", + entry->irq, addr, ntb->msi->base_addr, + ntb->msi->end_addr); + return -EFAULT; + } + + msi_desc->addr_offset = addr - ntb->msi->base_addr; + msi_desc->data = entry->msg.data; + + return 0; +} + +static void ntb_msi_write_msg(struct msi_desc *entry, void *data) +{ + struct ntb_msi_devres *dr = data; + + WARN_ON(ntb_msi_set_desc(dr->ntb, entry, dr->msi_desc)); + + if (dr->ntb->msi->desc_changed) + dr->ntb->msi->desc_changed(dr->ntb->ctx); +} + +static void ntbm_msi_callback_release(struct device *dev, void *res) +{ + struct ntb_msi_devres *dr = res; + + dr->entry->write_msi_msg = NULL; + dr->entry->write_msi_msg_data = NULL; +} + +static int ntbm_msi_setup_callback(struct ntb_dev *ntb, struct msi_desc *entry, + struct ntb_msi_desc *msi_desc) +{ + struct ntb_msi_devres *dr; + + dr = devres_alloc(ntbm_msi_callback_release, + sizeof(struct ntb_msi_devres), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + dr->ntb = ntb; + dr->entry = entry; + dr->msi_desc = msi_desc; + + devres_add(&ntb->dev, dr); + + dr->entry->write_msi_msg = ntb_msi_write_msg; + dr->entry->write_msi_msg_data = dr; + + return 0; +} + +/** + * ntbm_msi_request_threaded_irq() - allocate an MSI interrupt + * @ntb: NTB device context + * @handler: Function to be called when the IRQ occurs + * @thread_fn: Function to be called in a threaded interrupt context. NULL + * for clients which handle everything in @handler + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function + * + * This function assigns an interrupt handler to an unused + * MSI interrupt and returns the descriptor used to trigger + * it. The descriptor can then be sent to a peer to trigger + * the interrupt. + * + * The interrupt resource is managed with devres so it will + * be automatically freed when the NTB device is torn down. + * + * If an IRQ allocated with this function needs to be freed + * separately, ntbm_free_irq() must be used. + * + * Return: IRQ number assigned on success, otherwise a negative error number. 
+
+/**
+ * ntbm_msi_request_threaded_irq() - allocate an MSI interrupt
+ * @ntb: NTB device context
+ * @handler: Function to be called when the IRQ occurs
+ * @thread_fn: Function to be called in a threaded interrupt context. NULL
+ *	for clients which handle everything in @handler
+ * @name: An ASCII name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
+ * @msi_desc: MSI descriptor data which triggers the interrupt
+ *
+ * This function assigns an interrupt handler to an unused
+ * MSI interrupt and returns the descriptor used to trigger
+ * it. The descriptor can then be sent to a peer to trigger
+ * the interrupt.
+ *
+ * The interrupt resource is managed with devres so it will
+ * be automatically freed when the NTB device is torn down.
+ *
+ * If an IRQ allocated with this function needs to be freed
+ * separately, ntbm_msi_free_irq() must be used.
+ *
+ * Return: IRQ number assigned on success, otherwise a negative error number.
+ */
+int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler,
+				  irq_handler_t thread_fn,
+				  const char *name, void *dev_id,
+				  struct ntb_msi_desc *msi_desc)
+{
+	struct msi_desc *entry;
+	struct irq_desc *desc;
+	int ret;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	for_each_pci_msi_entry(entry, ntb->pdev) {
+		desc = irq_to_desc(entry->irq);
+		if (desc->action)
+			continue;
+
+		ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler,
+						thread_fn, 0, name, dev_id);
+		if (ret)
+			continue;
+
+		if (ntb_msi_set_desc(ntb, entry, msi_desc)) {
+			devm_free_irq(&ntb->dev, entry->irq, dev_id);
+			continue;
+		}
+
+		ret = ntbm_msi_setup_callback(ntb, entry, msi_desc);
+		if (ret) {
+			devm_free_irq(&ntb->dev, entry->irq, dev_id);
+			return ret;
+		}
+
+		return entry->irq;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL(ntbm_msi_request_threaded_irq);
+
+static int ntbm_msi_callback_match(struct device *dev, void *res, void *data)
+{
+	struct ntb_dev *ntb = dev_ntb(dev);
+	struct ntb_msi_devres *dr = res;
+
+	return dr->ntb == ntb && dr->entry == data;
+}
+
+/**
+ * ntbm_msi_free_irq() - free an interrupt
+ * @ntb: NTB device context
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * This function should be used to manually free IRQs allocated with
+ * ntbm_msi_request_[threaded_]irq().
+ */
+void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, void *dev_id)
+{
+	struct msi_desc *entry = irq_get_msi_desc(irq);
+
+	entry->write_msi_msg = NULL;
+	entry->write_msi_msg_data = NULL;
+
+	WARN_ON(devres_destroy(&ntb->dev, ntbm_msi_callback_release,
+			       ntbm_msi_callback_match, entry));
+
+	devm_free_irq(&ntb->dev, irq, dev_id);
+}
+EXPORT_SYMBOL(ntbm_msi_free_irq);
+
+/**
+ * ntb_msi_peer_trigger() - Trigger an interrupt handler on a peer
+ * @ntb: NTB device context
+ * @peer: Peer index
+ * @desc: MSI descriptor data which triggers the interrupt
+ *
+ * This function triggers an interrupt on a peer. It requires
+ * the descriptor structure to have been passed from that peer
+ * by some other means.
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer,
+			 struct ntb_msi_desc *desc)
+{
+	int idx;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	idx = desc->addr_offset / sizeof(*ntb->msi->peer_mws[peer]);
+
+	iowrite32(desc->data, &ntb->msi->peer_mws[peer][idx]);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_msi_peer_trigger);
+
+/**
+ * ntb_msi_peer_addr() - Get the DMA address to trigger a peer's MSI interrupt
+ * @ntb: NTB device context
+ * @peer: Peer index
+ * @desc: MSI descriptor data which triggers the interrupt
+ * @msi_addr: Physical address to trigger the interrupt
+ *
+ * This function allows using DMA engines to trigger an interrupt
+ * (for example, trigger an interrupt to process the data after
+ * sending it). To trigger the interrupt, write @desc.data to the address
+ * returned in @msi_addr.
+ *
+ * Return: Zero on success, otherwise a negative error number.
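+ *
+ * For example, a client can have its DMA engine append a 32-bit write of
+ * @desc.data to the address returned in @msi_addr at the tail of a
+ * transfer, so the peer only takes the interrupt once the payload has
+ * landed.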
+ */ +int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc, + phys_addr_t *msi_addr) +{ + int peer_widx = ntb_peer_mw_count(ntb) - 1 - peer; + phys_addr_t mw_phys_addr; + int ret; + + ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr, NULL); + if (ret) + return ret; + + if (msi_addr) + *msi_addr = mw_phys_addr + desc->addr_offset; + + return 0; +} +EXPORT_SYMBOL(ntb_msi_peer_addr); diff --git a/include/linux/ntb.h b/include/linux/ntb.h index bed421b9579b..8c13538aeffe 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -58,9 +58,11 @@ #include #include +#include struct ntb_client; struct ntb_dev; +struct ntb_msi; struct pci_dev; /** @@ -426,6 +428,10 @@ struct ntb_dev { spinlock_t ctx_lock; /* block unregister until device is fully released */ struct completion released; + +#ifdef CONFIG_NTB_MSI + struct ntb_msi *msi; +#endif }; #define dev_ntb(__dev) container_of((__dev), struct ntb_dev, dev) @@ -1627,4 +1633,71 @@ static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx) return ntb_mw_count(ntb, pidx) - ret - 1; } +struct ntb_msi_desc { + u32 addr_offset; + u32 data; +}; + +#ifdef CONFIG_NTB_MSI + +int ntb_msi_init(struct ntb_dev *ntb, void (*desc_changed)(void *ctx)); +int ntb_msi_setup_mws(struct ntb_dev *ntb); +void ntb_msi_clear_mws(struct ntb_dev *ntb); +int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler, + irq_handler_t thread_fn, + const char *name, void *dev_id, + struct ntb_msi_desc *msi_desc); +void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, void *dev_id); +int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc); +int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc, + phys_addr_t *msi_addr); + +#else /* not CONFIG_NTB_MSI */ + +static inline int ntb_msi_init(struct ntb_dev *ntb, + void (*desc_changed)(void *ctx)) +{ + return -EOPNOTSUPP; +} +static inline int ntb_msi_setup_mws(struct ntb_dev *ntb) +{ + return -EOPNOTSUPP; +} +static inline void ntb_msi_clear_mws(struct ntb_dev *ntb) {} +static inline int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, + irq_handler_t handler, + irq_handler_t thread_fn, + const char *name, void *dev_id, + struct ntb_msi_desc *msi_desc) +{ + return -EOPNOTSUPP; +} +static inline void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, + void *dev_id) {} +static inline int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc) +{ + return -EOPNOTSUPP; +} +static inline int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc, + phys_addr_t *msi_addr) +{ + return -EOPNOTSUPP; + +} + +#endif /* CONFIG_NTB_MSI */ + +static inline int ntbm_msi_request_irq(struct ntb_dev *ntb, + irq_handler_t handler, + const char *name, void *dev_id, + struct ntb_msi_desc *msi_desc) +{ + return ntbm_msi_request_threaded_irq(ntb, handler, NULL, name, + dev_id, msi_desc); +} + #endif -- cgit v1.2.3-59-g8ed1b From de76cda215d56256ffcda7ffa538b70f9fb301a7 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Tue, 4 Jun 2019 18:24:43 +0200 Subject: PCI: Decode PCIe 32 GT/s link speed PCIe r5.0, sec 7.5.3.18, defines a new 32.0 GT/s bit in the Supported Link Speeds Vector of Link Capabilities 2. Decode this new speed. This does not affect the speed of the link, which should be negotiated automatically by the hardware; it only adds decoding when showing the speed to the user. 
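For reference, decoding the new Link Status value follows the same pattern as the existing speeds. The helper below is a minimal illustrative sketch, not part of the patch; it uses only register names added here or already present in include/uapi/linux/pci_regs.h.

	#include <linux/pci_regs.h>

	/* Map the Current Link Speed field of the Link Status
	 * register to a printable string. */
	static const char *cls_to_string(unsigned short lnksta)
	{
		switch (lnksta & PCI_EXP_LNKSTA_CLS) {
		case PCI_EXP_LNKSTA_CLS_32_0GB: return "32.0 GT/s"; /* new */
		case PCI_EXP_LNKSTA_CLS_16_0GB: return "16.0 GT/s";
		case PCI_EXP_LNKSTA_CLS_8_0GB:  return "8.0 GT/s";
		case PCI_EXP_LNKSTA_CLS_5_0GB:  return "5.0 GT/s";
		case PCI_EXP_LNKSTA_CLS_2_5GB:  return "2.5 GT/s";
		default:                        return "Unknown speed";
		}
	}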
Previously, reading the speed of a link operating at this speed showed "Unknown speed" instead of "32.0 GT/s". Link: https://lore.kernel.org/lkml/92365e3caf0fc559f9ab14bcd053bfc92d4f661c.1559664969.git.gustavo.pimentel@synopsys.com Signed-off-by: Gustavo Pimentel [bhelgaas: changelog] Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 3 +++ drivers/pci/pci.c | 4 +++- drivers/pci/probe.c | 2 +- drivers/pci/slot.c | 1 + include/linux/pci.h | 1 + include/uapi/linux/pci_regs.h | 4 ++++ 6 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 6d27475e39b2..d52d30448e41 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -182,6 +182,9 @@ static ssize_t current_link_speed_show(struct device *dev, return -EINVAL; switch (linkstat & PCI_EXP_LNKSTA_CLS) { + case PCI_EXP_LNKSTA_CLS_32_0GB: + speed = "32 GT/s"; + break; case PCI_EXP_LNKSTA_CLS_16_0GB: speed = "16 GT/s"; break; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8abc843b1615..4729a7c7a9d9 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5621,7 +5621,9 @@ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev) */ pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); if (lnkcap2) { /* PCIe r3.0-compliant */ - if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB) + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_32_0GB) + return PCIE_SPEED_32_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB) return PCIE_SPEED_16_0GT; else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) return PCIE_SPEED_8_0GT; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0e8e2c186f50..c5f27c8cd140 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -668,7 +668,7 @@ const unsigned char pcie_link_speed[] = { PCIE_SPEED_5_0GT, /* 2 */ PCIE_SPEED_8_0GT, /* 3 */ PCIE_SPEED_16_0GT, /* 4 */ - PCI_SPEED_UNKNOWN, /* 5 */ + PCIE_SPEED_32_0GT, /* 5 */ PCI_SPEED_UNKNOWN, /* 6 */ PCI_SPEED_UNKNOWN, /* 7 */ PCI_SPEED_UNKNOWN, /* 8 */ diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index f4d92b1afe7b..ae4aa0e1f2f4 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -75,6 +75,7 @@ static const char *pci_bus_speed_strings[] = { "5.0 GT/s PCIe", /* 0x15 */ "8.0 GT/s PCIe", /* 0x16 */ "16.0 GT/s PCIe", /* 0x17 */ + "32.0 GT/s PCIe", /* 0x18 */ }; static ssize_t bus_speed_read(enum pci_bus_speed speed, char *buf) diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..2173e6b75579 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -258,6 +258,7 @@ enum pci_bus_speed { PCIE_SPEED_5_0GT = 0x15, PCIE_SPEED_8_0GT = 0x16, PCIE_SPEED_16_0GT = 0x17, + PCIE_SPEED_32_0GT = 0x18, PCI_SPEED_UNKNOWN = 0xff, }; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 27164769d184..f28e562d7ca8 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -528,6 +528,7 @@ #define PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */ #define PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */ #define PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */ +#define PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ @@ -556,6 +557,7 @@ #define PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */ #define PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 /* 
Current Link Speed 8.0GT/s */
 #define PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */
+#define PCI_EXP_LNKSTA_CLS_32_0GB 0x0005 /* Current Link Speed 32.0GT/s */
 #define PCI_EXP_LNKSTA_NLW	0x03f0	/* Negotiated Link Width */
 #define PCI_EXP_LNKSTA_NLW_X1	0x0010	/* Current Link Width x1 */
 #define PCI_EXP_LNKSTA_NLW_X2	0x0020	/* Current Link Width x2 */
@@ -661,6 +663,7 @@
 #define PCI_EXP_LNKCAP2_SLS_5_0GB	0x00000004 /* Supported Speed 5GT/s */
 #define PCI_EXP_LNKCAP2_SLS_8_0GB	0x00000008 /* Supported Speed 8GT/s */
 #define PCI_EXP_LNKCAP2_SLS_16_0GB	0x00000010 /* Supported Speed 16GT/s */
+#define PCI_EXP_LNKCAP2_SLS_32_0GB	0x00000020 /* Supported Speed 32GT/s */
 #define PCI_EXP_LNKCAP2_CROSSLINK	0x00000100 /* Crosslink supported */
 #define PCI_EXP_LNKCTL2		48	/* Link Control 2 */
 #define PCI_EXP_LNKCTL2_TLS		0x000f
@@ -668,6 +671,7 @@
 #define PCI_EXP_LNKCTL2_TLS_5_0GT	0x0002 /* Supported Speed 5GT/s */
 #define PCI_EXP_LNKCTL2_TLS_8_0GT	0x0003 /* Supported Speed 8GT/s */
 #define PCI_EXP_LNKCTL2_TLS_16_0GT	0x0004 /* Supported Speed 16GT/s */
+#define PCI_EXP_LNKCTL2_TLS_32_0GT	0x0005 /* Supported Speed 32GT/s */
 #define PCI_EXP_LNKSTA2		50	/* Link Status 2 */
 #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2	52	/* v2 endpoints with link end here */
 #define PCI_EXP_SLTCAP2		52	/* Slot Capabilities 2 */
-- 
cgit v1.2.3-59-g8ed1b


From bb2bb903042517b8fb17b2bc21e00512f2dcac01 Mon Sep 17 00:00:00 2001
From: Greg Hackmann
Date: Thu, 13 Jun 2019 15:34:07 -0700
Subject: dma-buf: add DMA_BUF_SET_NAME ioctls

This patch adds complementary DMA_BUF_SET_NAME ioctls, which let
userspace processes attach a free-form name to each buffer.

This information can be extremely helpful for tracking and accounting
shared buffers. For example, on Android, we know what each buffer will
be used for at allocation time: GL, multimedia, camera, etc. The
userspace allocator can use DMA_BUF_SET_NAME to associate that
information with the buffer, so we can later give developers a
breakdown of how much memory they're allocating for graphics, camera,
etc.

Signed-off-by: Greg Hackmann
Signed-off-by: Chenbo Feng
Signed-off-by: Sumit Semwal
Link: https://patchwork.freedesktop.org/patch/msgid/20190613223408.139221-3-fengc@google.com
---
 drivers/dma-buf/dma-buf.c    | 65 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/dma-buf.h      |  5 +++-
 include/uapi/linux/dma-buf.h |  3 ++
 3 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 3612ccededd6..ab96410d1dcd 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -48,8 +48,24 @@ struct dma_buf_list {
 
 static struct dma_buf_list db_list;
 
+static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct dma_buf *dmabuf;
+	char name[DMA_BUF_NAME_LEN];
+	size_t ret = 0;
+
+	dmabuf = dentry->d_fsdata;
+	mutex_lock(&dmabuf->lock);
+	if (dmabuf->name)
+		ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN);
+	mutex_unlock(&dmabuf->lock);
+
+	return dynamic_dname(dentry, buffer, buflen, "/%s:%s",
+			     dentry->d_name.name, ret > 0 ? name : "");
+}
+
 static const struct dentry_operations dma_buf_dentry_ops = {
-	.d_dname = simple_dname,
+	.d_dname = dmabuffs_dname,
 };
 
 static struct vfsmount *dma_buf_mnt;
@@ -301,6 +317,43 @@ out:
 	return events;
 }
 
+/**
+ * dma_buf_set_name - Set a name to a specific dma_buf to track the usage.
+ * The name of the dma-buf buffer can only be set when the dma-buf is not
+ * attached to any devices. It could theoretically support changing the
+ * name of the dma-buf if the same piece of memory is used for multiple
+ * purposes between different devices.
+ *
+ * @dmabuf: [in]	dmabuf buffer that will be renamed.
+ * @buf:    [in]	A piece of userspace memory that contains the name of
+ *			the dma-buf.
+ *
+ * Returns 0 on success. If the dma-buf buffer is already attached to
+ * devices, returns -EBUSY.
+ *
+ */
+static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
+{
+	char *name = strndup_user(buf, DMA_BUF_NAME_LEN);
+	long ret = 0;
+
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	mutex_lock(&dmabuf->lock);
+	if (!list_empty(&dmabuf->attachments)) {
+		ret = -EBUSY;
+		kfree(name);
+		goto out_unlock;
+	}
+	kfree(dmabuf->name);
+	dmabuf->name = name;
+
+out_unlock:
+	mutex_unlock(&dmabuf->lock);
+	return ret;
+}
+
 static long dma_buf_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long arg)
 {
@@ -339,6 +392,10 @@
 		ret = dma_buf_begin_cpu_access(dmabuf, direction);
 
 		return ret;
+
+	case DMA_BUF_SET_NAME:
+		return dma_buf_set_name(dmabuf, (const char __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -380,6 +437,7 @@
 		goto err_alloc_file;
 	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
 	file->private_data = dmabuf;
+	file->f_path.dentry->d_fsdata = dmabuf;
 
 	return file;
 
@@ -1112,12 +1170,13 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 			continue;
 		}
 
-		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n",
+		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n",
 				buf_obj->size,
 				buf_obj->file->f_flags, buf_obj->file->f_mode,
 				file_count(buf_obj->file),
 				buf_obj->exp_name,
-				file_inode(buf_obj->file)->i_ino);
+				file_inode(buf_obj->file)->i_ino,
+				buf_obj->name ?: "");
 
 		robj = buf_obj->resv;
 		while (true) {
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 8a327566d7f4..01ad5b942a6f 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -280,10 +280,12 @@
  * @file: file pointer used for sharing buffers across, and for refcounting.
  * @attachments: list of dma_buf_attachment that denotes all devices attached.
  * @ops: dma_buf_ops associated with this buffer object.
- * @lock: used internally to serialize list manipulation, attach/detach and vmap/unmap
+ * @lock: used internally to serialize list manipulation, attach/detach and
+ *        vmap/unmap, and accesses to name
  * @vmapping_counter: used internally to refcnt the vmaps
  * @vmap_ptr: the current vmap ptr if vmapping_counter > 0
  * @exp_name: name of the exporter; useful for debugging.
+ * @name: userspace-provided name; useful for accounting and debugging.
  * @owner: pointer to exporter module; used for refcounting when exporter is a
  *         kernel module.
  * @list_node: node for dma_buf accounting and debugging.
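For completeness, the new ioctl is used from userspace as below. This is a hedged sketch: the dmabuf_fd is assumed to come from some exporter (a GEM handle, an ion-style heap, etc.), and "camera0" is an arbitrary example label.

	#include <linux/dma-buf.h>	/* DMA_BUF_SET_NAME */
	#include <sys/ioctl.h>
	#include <stdio.h>

	/* Label a freshly exported buffer before any device attaches;
	 * once attachments exist the ioctl fails with EBUSY. */
	static int label_dmabuf(int dmabuf_fd)
	{
		if (ioctl(dmabuf_fd, DMA_BUF_SET_NAME, "camera0") < 0) {
			perror("DMA_BUF_SET_NAME");
			return -1;
		}

		return 0;
	}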
@@ -311,6 +313,7 @@ struct dma_buf { unsigned vmapping_counter; void *vmap_ptr; const char *exp_name; + const char *name; struct module *owner; struct list_head list_node; void *priv; diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h index d75df5210a4a..dbc7092e04b5 100644 --- a/include/uapi/linux/dma-buf.h +++ b/include/uapi/linux/dma-buf.h @@ -35,7 +35,10 @@ struct dma_buf_sync { #define DMA_BUF_SYNC_VALID_FLAGS_MASK \ (DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END) +#define DMA_BUF_NAME_LEN 32 + #define DMA_BUF_BASE 'b' #define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) +#define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) #endif -- cgit v1.2.3-59-g8ed1b From 151f4e2bdc7a04020ae5c533896fb91a16e1f501 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 07:10:36 -0300 Subject: docs: power: convert docs to ReST and rename to *.rst Convert the PM documents to ReST, in order to allow them to build with Sphinx. The conversion is actually: - add blank lines and indentation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Bjorn Helgaas Acked-by: Mark Brown Acked-by: Srivatsa S. Bhat (VMware) --- Documentation/ABI/testing/sysfs-class-powercap | 2 +- Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/cpu-freq/core.txt | 2 +- Documentation/driver-api/pm/devices.rst | 6 +- Documentation/driver-api/usb/power-management.rst | 2 +- Documentation/power/apm-acpi.rst | 36 + Documentation/power/apm-acpi.txt | 32 - Documentation/power/basic-pm-debugging.rst | 269 +++++ Documentation/power/basic-pm-debugging.txt | 254 ----- Documentation/power/charger-manager.rst | 205 ++++ Documentation/power/charger-manager.txt | 200 ---- Documentation/power/drivers-testing.rst | 51 + Documentation/power/drivers-testing.txt | 46 - Documentation/power/energy-model.rst | 147 +++ Documentation/power/energy-model.txt | 144 --- Documentation/power/freezing-of-tasks.rst | 244 +++++ Documentation/power/freezing-of-tasks.txt | 231 ---- Documentation/power/index.rst | 46 + Documentation/power/interface.rst | 79 ++ Documentation/power/interface.txt | 77 -- Documentation/power/opp.rst | 379 +++++++ Documentation/power/opp.txt | 342 ------ Documentation/power/pci.rst | 1135 ++++++++++++++++++++ Documentation/power/pci.txt | 1094 ------------------- Documentation/power/pm_qos_interface.rst | 225 ++++ Documentation/power/pm_qos_interface.txt | 212 ---- Documentation/power/power_supply_class.rst | 282 +++++ Documentation/power/power_supply_class.txt | 231 ---- Documentation/power/powercap/powercap.rst | 257 +++++ Documentation/power/powercap/powercap.txt | 236 ---- Documentation/power/regulator/consumer.rst | 229 ++++ Documentation/power/regulator/consumer.txt | 218 ---- Documentation/power/regulator/design.rst | 38 + Documentation/power/regulator/design.txt | 33 - Documentation/power/regulator/machine.rst | 97 ++ Documentation/power/regulator/machine.txt | 96 -- Documentation/power/regulator/overview.rst | 178 +++ Documentation/power/regulator/overview.txt | 171 --- Documentation/power/regulator/regulator.rst | 32 + Documentation/power/regulator/regulator.txt | 30 - Documentation/power/runtime_pm.rst | 940 ++++++++++++++++ Documentation/power/runtime_pm.txt | 928 ---------------- 
Documentation/power/s2ram.rst | 87 ++ Documentation/power/s2ram.txt | 85 -- Documentation/power/suspend-and-cpuhotplug.rst | 286 +++++ Documentation/power/suspend-and-cpuhotplug.txt | 274 ----- Documentation/power/suspend-and-interrupts.rst | 137 +++ Documentation/power/suspend-and-interrupts.txt | 135 --- Documentation/power/swsusp-and-swap-files.rst | 63 ++ Documentation/power/swsusp-and-swap-files.txt | 60 -- Documentation/power/swsusp-dmcrypt.rst | 140 +++ Documentation/power/swsusp-dmcrypt.txt | 138 --- Documentation/power/swsusp.rst | 501 +++++++++ Documentation/power/swsusp.txt | 446 -------- Documentation/power/tricks.rst | 29 + Documentation/power/tricks.txt | 27 - Documentation/power/userland-swsusp.rst | 191 ++++ Documentation/power/userland-swsusp.txt | 170 --- Documentation/power/video.rst | 213 ++++ Documentation/power/video.txt | 185 ---- Documentation/process/submitting-drivers.rst | 2 +- Documentation/scheduler/sched-energy.txt | 6 +- Documentation/trace/coresight-cpu-debug.txt | 2 +- .../zh_CN/process/submitting-drivers.rst | 2 +- MAINTAINERS | 4 +- arch/x86/Kconfig | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/opp/Kconfig | 2 +- drivers/power/supply/power_supply_core.c | 2 +- include/linux/interrupt.h | 2 +- include/linux/pci.h | 2 +- include/linux/pm.h | 2 +- kernel/power/Kconfig | 6 +- net/wireless/Kconfig | 2 +- 74 files changed, 6544 insertions(+), 6123 deletions(-) create mode 100644 Documentation/power/apm-acpi.rst delete mode 100644 Documentation/power/apm-acpi.txt create mode 100644 Documentation/power/basic-pm-debugging.rst delete mode 100644 Documentation/power/basic-pm-debugging.txt create mode 100644 Documentation/power/charger-manager.rst delete mode 100644 Documentation/power/charger-manager.txt create mode 100644 Documentation/power/drivers-testing.rst delete mode 100644 Documentation/power/drivers-testing.txt create mode 100644 Documentation/power/energy-model.rst delete mode 100644 Documentation/power/energy-model.txt create mode 100644 Documentation/power/freezing-of-tasks.rst delete mode 100644 Documentation/power/freezing-of-tasks.txt create mode 100644 Documentation/power/index.rst create mode 100644 Documentation/power/interface.rst delete mode 100644 Documentation/power/interface.txt create mode 100644 Documentation/power/opp.rst delete mode 100644 Documentation/power/opp.txt create mode 100644 Documentation/power/pci.rst delete mode 100644 Documentation/power/pci.txt create mode 100644 Documentation/power/pm_qos_interface.rst delete mode 100644 Documentation/power/pm_qos_interface.txt create mode 100644 Documentation/power/power_supply_class.rst delete mode 100644 Documentation/power/power_supply_class.txt create mode 100644 Documentation/power/powercap/powercap.rst delete mode 100644 Documentation/power/powercap/powercap.txt create mode 100644 Documentation/power/regulator/consumer.rst delete mode 100644 Documentation/power/regulator/consumer.txt create mode 100644 Documentation/power/regulator/design.rst delete mode 100644 Documentation/power/regulator/design.txt create mode 100644 Documentation/power/regulator/machine.rst delete mode 100644 Documentation/power/regulator/machine.txt create mode 100644 Documentation/power/regulator/overview.rst delete mode 100644 Documentation/power/regulator/overview.txt create mode 100644 Documentation/power/regulator/regulator.rst delete mode 100644 Documentation/power/regulator/regulator.txt create mode 100644 Documentation/power/runtime_pm.rst delete mode 100644 Documentation/power/runtime_pm.txt 
create mode 100644 Documentation/power/s2ram.rst delete mode 100644 Documentation/power/s2ram.txt create mode 100644 Documentation/power/suspend-and-cpuhotplug.rst delete mode 100644 Documentation/power/suspend-and-cpuhotplug.txt create mode 100644 Documentation/power/suspend-and-interrupts.rst delete mode 100644 Documentation/power/suspend-and-interrupts.txt create mode 100644 Documentation/power/swsusp-and-swap-files.rst delete mode 100644 Documentation/power/swsusp-and-swap-files.txt create mode 100644 Documentation/power/swsusp-dmcrypt.rst delete mode 100644 Documentation/power/swsusp-dmcrypt.txt create mode 100644 Documentation/power/swsusp.rst delete mode 100644 Documentation/power/swsusp.txt create mode 100644 Documentation/power/tricks.rst delete mode 100644 Documentation/power/tricks.txt create mode 100644 Documentation/power/userland-swsusp.rst delete mode 100644 Documentation/power/userland-swsusp.txt create mode 100644 Documentation/power/video.rst delete mode 100644 Documentation/power/video.txt (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap index db3b3ff70d84..742dfd966592 100644 --- a/Documentation/ABI/testing/sysfs-class-powercap +++ b/Documentation/ABI/testing/sysfs-class-powercap @@ -5,7 +5,7 @@ Contact: linux-pm@vger.kernel.org Description: The powercap/ class sub directory belongs to the power cap subsystem. Refer to - Documentation/power/powercap/powercap.txt for details. + Documentation/power/powercap/powercap.rst for details. What: /sys/class/powercap/ Date: September 2013 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..7f5ca6e7c4d3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -13,7 +13,7 @@ For ARM64, ONLY "acpi=off", "acpi=on" or "acpi=force" are available - See also Documentation/power/runtime_pm.txt, pci=noacpi + See also Documentation/power/runtime_pm.rst, pci=noacpi acpi_apic_instance= [ACPI, IOAPIC] Format: @@ -223,7 +223,7 @@ acpi_sleep= [HW,ACPI] Sleep options Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, old_ordering, nonvs, sci_force_enable, nobl } - See Documentation/power/video.txt for information on + See Documentation/power/video.rst for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. @@ -4108,7 +4108,7 @@ Specify the offset from the beginning of the partition given by "resume=" at which the swap header is located, in units (needed only for swap files). - See Documentation/power/swsusp-and-swap-files.txt + See Documentation/power/swsusp-and-swap-files.rst resumedelay= [HIBERNATION] Delay (in seconds) to pause before attempting to read the resume files diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt index 073f128af5a7..55193e680250 100644 --- a/Documentation/cpu-freq/core.txt +++ b/Documentation/cpu-freq/core.txt @@ -95,7 +95,7 @@ flags - flags of the cpufreq driver 3. 
CPUFreq Table Generation with Operating Performance Point (OPP) ================================================================== -For details about OPP, see Documentation/power/opp.txt +For details about OPP, see Documentation/power/opp.rst dev_pm_opp_init_cpufreq_table - This function provides a ready to use conversion routine to translate diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index 30835683616a..f66c7b9126ea 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -225,7 +225,7 @@ system-wide transition to a sleep state even though its :c:member:`runtime_auto` flag is clear. For more information about the runtime power management framework, refer to -:file:`Documentation/power/runtime_pm.txt`. +:file:`Documentation/power/runtime_pm.rst`. Calling Drivers to Enter and Leave System Sleep States @@ -728,7 +728,7 @@ it into account in any way. Devices may be defined as IRQ-safe which indicates to the PM core that their runtime PM callbacks may be invoked with disabled interrupts (see -:file:`Documentation/power/runtime_pm.txt` for more information). If an +:file:`Documentation/power/runtime_pm.rst` for more information). If an IRQ-safe device belongs to a PM domain, the runtime PM of the domain will be disallowed, unless the domain itself is defined as IRQ-safe. However, it makes sense to define a PM domain as IRQ-safe only if all the devices in it @@ -795,7 +795,7 @@ so on) and the final state of the device must reflect the "active" runtime PM status in that case. During system-wide resume from a sleep state it's easiest to put devices into -the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`. +the full-power state, as explained in :file:`Documentation/power/runtime_pm.rst`. [Refer to that document for more information regarding this particular issue as well as for information on the device runtime power management framework in general.] diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst index 4a74cf6f2797..2525c3622cae 100644 --- a/Documentation/driver-api/usb/power-management.rst +++ b/Documentation/driver-api/usb/power-management.rst @@ -46,7 +46,7 @@ device is turned off while the system as a whole remains running, we call it a "dynamic suspend" (also known as a "runtime suspend" or "selective suspend"). This document concentrates mostly on how dynamic PM is implemented in the USB subsystem, although system PM is -covered to some extent (see ``Documentation/power/*.txt`` for more +covered to some extent (see ``Documentation/power/*.rst`` for more information about system PM). System PM support is present only if the kernel was built with diff --git a/Documentation/power/apm-acpi.rst b/Documentation/power/apm-acpi.rst new file mode 100644 index 000000000000..5b90d947126d --- /dev/null +++ b/Documentation/power/apm-acpi.rst @@ -0,0 +1,36 @@ +============ +APM or ACPI? +============ + +If you have a relatively recent x86 mobile, desktop, or server system, +odds are it supports either Advanced Power Management (APM) or +Advanced Configuration and Power Interface (ACPI). ACPI is the newer +of the two technologies and puts power management in the hands of the +operating system, allowing for more intelligent power management than +is possible with BIOS controlled APM. 
+ +The best way to determine which, if either, your system supports is to +build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is +enabled by default). If a working ACPI implementation is found, the +ACPI driver will override and disable APM, otherwise the APM driver +will be used. + +No, sorry, you cannot have both ACPI and APM enabled and running at +once. Some people with broken ACPI or broken APM implementations +would like to use both to get a full set of working features, but you +simply cannot mix and match the two. Only one power management +interface can be in control of the machine at once. Think about it.. + +User-space Daemons +------------------ +Both APM and ACPI rely on user-space daemons, apmd and acpid +respectively, to be completely functional. Obtain both of these +daemons from your Linux distribution or from the Internet (see below) +and be sure that they are started sometime in the system boot process. +Go ahead and start both. If ACPI or APM is not available on your +system the associated daemon will exit gracefully. + + ===== ======================================= + apmd http://ftp.debian.org/pool/main/a/apmd/ + acpid http://acpid.sf.net/ + ===== ======================================= diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt deleted file mode 100644 index 6cc423d3662e..000000000000 --- a/Documentation/power/apm-acpi.txt +++ /dev/null @@ -1,32 +0,0 @@ -APM or ACPI? ------------- -If you have a relatively recent x86 mobile, desktop, or server system, -odds are it supports either Advanced Power Management (APM) or -Advanced Configuration and Power Interface (ACPI). ACPI is the newer -of the two technologies and puts power management in the hands of the -operating system, allowing for more intelligent power management than -is possible with BIOS controlled APM. - -The best way to determine which, if either, your system supports is to -build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is -enabled by default). If a working ACPI implementation is found, the -ACPI driver will override and disable APM, otherwise the APM driver -will be used. - -No, sorry, you cannot have both ACPI and APM enabled and running at -once. Some people with broken ACPI or broken APM implementations -would like to use both to get a full set of working features, but you -simply cannot mix and match the two. Only one power management -interface can be in control of the machine at once. Think about it.. - -User-space Daemons ------------------- -Both APM and ACPI rely on user-space daemons, apmd and acpid -respectively, to be completely functional. Obtain both of these -daemons from your Linux distribution or from the Internet (see below) -and be sure that they are started sometime in the system boot process. -Go ahead and start both. If ACPI or APM is not available on your -system the associated daemon will exit gracefully. - - apmd: http://ftp.debian.org/pool/main/a/apmd/ - acpid: http://acpid.sf.net/ diff --git a/Documentation/power/basic-pm-debugging.rst b/Documentation/power/basic-pm-debugging.rst new file mode 100644 index 000000000000..69862e759c30 --- /dev/null +++ b/Documentation/power/basic-pm-debugging.rst @@ -0,0 +1,269 @@ +================================= +Debugging hibernation and suspend +================================= + + (C) 2007 Rafael J. Wysocki , GPL + +1. 
Testing hibernation (aka suspend to disk or STD) +=================================================== + +To check if hibernation works, you can try to hibernate in the "reboot" mode:: + + # echo reboot > /sys/power/disk + # echo disk > /sys/power/state + +and the system should create a hibernation image, reboot, resume and get back to +the command prompt where you have started the transition. If that happens, +hibernation is most likely to work correctly. Still, you need to repeat the +test at least a couple of times in a row for confidence. [This is necessary, +because some problems only show up on a second attempt at suspending and +resuming the system.] Moreover, hibernating in the "reboot" and "shutdown" +modes causes the PM core to skip some platform-related callbacks which on ACPI +systems might be necessary to make hibernation work. Thus, if your machine +fails to hibernate or resume in the "reboot" mode, you should try the +"platform" mode:: + + # echo platform > /sys/power/disk + # echo disk > /sys/power/state + +which is the default and recommended mode of hibernation. + +Unfortunately, the "platform" mode of hibernation does not work on some systems +with broken BIOSes. In such cases the "shutdown" mode of hibernation might +work:: + + # echo shutdown > /sys/power/disk + # echo disk > /sys/power/state + +(it is similar to the "reboot" mode, but it requires you to press the power +button to make the system resume). + +If neither "platform" nor "shutdown" hibernation mode works, you will need to +identify what goes wrong. + +a) Test modes of hibernation +---------------------------- + +To find out why hibernation fails on your system, you can use a special testing +facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then, +there is the file /sys/power/pm_test that can be used to make the hibernation +core run in a test mode. There are 5 test modes available: + +freezer + - test the freezing of processes + +devices + - test the freezing of processes and suspending of devices + +platform + - test the freezing of processes, suspending of devices and platform + global control methods [1]_ + +processors + - test the freezing of processes, suspending of devices, platform + global control methods [1]_ and the disabling of nonboot CPUs + +core + - test the freezing of processes, suspending of devices, platform global + control methods\ [1]_, the disabling of nonboot CPUs and suspending + of platform/system devices + +.. [1] + + the platform global control methods are only available on ACPI systems + and are only tested if the hibernation mode is set to "platform" + +To use one of them it is necessary to write the corresponding string to +/sys/power/pm_test (eg. "devices" to test the freezing of processes and +suspending devices) and issue the standard hibernation commands. For example, +to use the "devices" test mode along with the "platform" mode of hibernation, +you should do the following:: + + # echo devices > /sys/power/pm_test + # echo platform > /sys/power/disk + # echo disk > /sys/power/state + +Then, the kernel will try to freeze processes, suspend devices, wait a few +seconds (5 by default, but configurable by the suspend.pm_test_delay module +parameter), resume devices and thaw processes. If "platform" is written to +/sys/power/pm_test , then after suspending devices the kernel will additionally +invoke the global control methods (eg. ACPI global control methods) used to +prepare the platform firmware for hibernation. 
Next, it will wait a +configurable number of seconds and invoke the platform (eg. ACPI) global +methods used to cancel hibernation etc. + +Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal +hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test +contains a space-separated list of all available tests (including "none" that +represents the normal functionality) in which the current test level is +indicated by square brackets. + +Generally, as you can see, each test level is more "invasive" than the previous +one and the "core" level tests the hardware and drivers as deeply as possible +without creating a hibernation image. Obviously, if the "devices" test fails, +the "platform" test will fail as well and so on. Thus, as a rule of thumb, you +should try the test modes starting from "freezer", through "devices", "platform" +and "processors" up to "core" (repeat the test on each level a couple of times +to make sure that any random factors are avoided). + +If the "freezer" test fails, there is a task that cannot be frozen (in that case +it usually is possible to identify the offending task by analysing the output of +dmesg obtained after the failing test). Failure at this level usually means +that there is a problem with the tasks freezer subsystem that should be +reported. + +If the "devices" test fails, most likely there is a driver that cannot suspend +or resume its device (in the latter case the system may hang or become unstable +after the test, so please take that into consideration). To find this driver, +you can carry out a binary search according to the rules: + +- if the test fails, unload a half of the drivers currently loaded and repeat + (that would probably involve rebooting the system, so always note what drivers + have been loaded before the test), +- if the test succeeds, load a half of the drivers you have unloaded most + recently and repeat. + +Once you have found the failing driver (there can be more than just one of +them), you have to unload it every time before hibernation. In that case please +make sure to report the problem with the driver. + +It is also possible that the "devices" test will still fail after you have +unloaded all modules. In that case, you may want to look in your kernel +configuration for the drivers that can be compiled as modules (and test again +with these drivers compiled as modules). You may also try to use some special +kernel command line options such as "noapic", "noacpi" or even "acpi=off". + +If the "platform" test fails, there is a problem with the handling of the +platform (eg. ACPI) firmware on your system. In that case the "platform" mode +of hibernation is not likely to work. You can try the "shutdown" mode, but that +is rather a poor man's workaround. + +If the "processors" test fails, the disabling/enabling of nonboot CPUs does not +work (of course, this only may be an issue on SMP systems) and the problem +should be reported. In that case you can also try to switch the nonboot CPUs +off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and +see if that works. + +If the "core" test fails, which means that suspending of the system/platform +devices has failed (these devices are suspended on one CPU with interrupts off), +the problem is most probably hardware-related and serious, so it should be +reported. + +A failure of any of the "platform", "processors" or "core" tests may cause your +system to hang or become unstable, so please beware. 
Such a failure usually +indicates a serious problem that very well may be related to the hardware, but +please report it anyway. + +b) Testing minimal configuration +-------------------------------- + +If all of the hibernation test modes work, you can boot the system with the +"init=/bin/bash" command line parameter and attempt to hibernate in the +"reboot", "shutdown" and "platform" modes. If that does not work, there +probably is a problem with a driver statically compiled into the kernel and you +can try to compile more drivers as modules, so that they can be tested +individually. Otherwise, there is a problem with a modular driver and you can +find it by loading a half of the modules you normally use and binary searching +in accordance with the algorithm: +- if there are n modules loaded and the attempt to suspend and resume fails, +unload n/2 of the modules and try again (that would probably involve rebooting +the system), +- if there are n modules loaded and the attempt to suspend and resume succeeds, +load n/2 modules more and try again. + +Again, if you find the offending module(s), it(they) must be unloaded every time +before hibernation, and please report the problem with it(them). + +c) Using the "test_resume" hibernation option +--------------------------------------------- + +/sys/power/disk generally tells the kernel what to do after creating a +hibernation image. One of the available options is "test_resume" which +causes the just created image to be used for immediate restoration. Namely, +after doing:: + + # echo test_resume > /sys/power/disk + # echo disk > /sys/power/state + +a hibernation image will be created and a resume from it will be triggered +immediately without involving the platform firmware in any way. + +That test can be used to check if failures to resume from hibernation are +related to bad interactions with the platform firmware. That is, if the above +works every time, but resume from actual hibernation does not work or is +unreliable, the platform firmware may be responsible for the failures. + +On architectures and platforms that support using different kernels to restore +hibernation images (that is, the kernel used to read the image from storage and +load it into memory is different from the one included in the image) or support +kernel address space randomization, it also can be used to check if failures +to resume may be related to the differences between the restore and image +kernels. + +d) Advanced debugging +--------------------- + +In case that hibernation does not work on your system even in the minimal +configuration and compiling more drivers as modules is not practical or some +modules cannot be unloaded, you can use one of the more advanced debugging +techniques to find the problem. First, if there is a serial port in your box, +you can boot the kernel with the 'no_console_suspend' parameter and try to log +kernel messages using the serial console. This may provide you with some +information about the reasons of the suspend (resume) failure. Alternatively, +it may be possible to use a FireWire port for debugging with firescope +(http://v3.sk/~lkundrak/firescope/). On x86 it is also possible to +use the PM_TRACE mechanism documented in Documentation/power/s2ram.rst . + +2. Testing suspend to RAM (STR) +=============================== + +To verify that the STR works, it is generally more convenient to use the s2ram +tool available from http://suspend.sf.net and documented at +http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK). 
+ +Namely, after writing "freezer", "devices", "platform", "processors", or "core" +into /sys/power/pm_test (available if the kernel is compiled with +CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding +to given string. The STR test modes are defined in the same way as for +hibernation, so please refer to Section 1 for more information about them. In +particular, the "core" test allows you to test everything except for the actual +invocation of the platform firmware in order to put the system into the sleep +state. + +Among other things, the testing with the help of /sys/power/pm_test may allow +you to identify drivers that fail to suspend or resume their devices. They +should be unloaded every time before an STR transition. + +Next, you can follow the instructions at S2RAM_LINK to test the system, but if +it does not work "out of the box", you may need to boot it with +"init=/bin/bash" and test s2ram in the minimal configuration. In that case, +you may be able to search for failing drivers by following the procedure +analogous to the one described in section 1. If you find some failing drivers, +you will have to unload them every time before an STR transition (ie. before +you run s2ram), and please report the problems with them. + +There is a debugfs entry which shows the suspend to RAM statistics. Here is an +example of its output:: + + # mount -t debugfs none /sys/kernel/debug + # cat /sys/kernel/debug/suspend_stats + success: 20 + fail: 5 + failed_freeze: 0 + failed_prepare: 0 + failed_suspend: 5 + failed_suspend_noirq: 0 + failed_resume: 0 + failed_resume_noirq: 0 + failures: + last_failed_dev: alarm + adc + last_failed_errno: -16 + -16 + last_failed_step: suspend + suspend + +Field success means the success number of suspend to RAM, and field fail means +the failure number. Others are the failure number of different steps of suspend +to RAM. suspend_stats just lists the last 2 failed devices, error number and +failed step of suspend. diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt deleted file mode 100644 index 708f87f78a75..000000000000 --- a/Documentation/power/basic-pm-debugging.txt +++ /dev/null @@ -1,254 +0,0 @@ -Debugging hibernation and suspend - (C) 2007 Rafael J. Wysocki , GPL - -1. Testing hibernation (aka suspend to disk or STD) - -To check if hibernation works, you can try to hibernate in the "reboot" mode: - -# echo reboot > /sys/power/disk -# echo disk > /sys/power/state - -and the system should create a hibernation image, reboot, resume and get back to -the command prompt where you have started the transition. If that happens, -hibernation is most likely to work correctly. Still, you need to repeat the -test at least a couple of times in a row for confidence. [This is necessary, -because some problems only show up on a second attempt at suspending and -resuming the system.] Moreover, hibernating in the "reboot" and "shutdown" -modes causes the PM core to skip some platform-related callbacks which on ACPI -systems might be necessary to make hibernation work. Thus, if your machine fails -to hibernate or resume in the "reboot" mode, you should try the "platform" mode: - -# echo platform > /sys/power/disk -# echo disk > /sys/power/state - -which is the default and recommended mode of hibernation. - -Unfortunately, the "platform" mode of hibernation does not work on some systems -with broken BIOSes. 
In such cases the "shutdown" mode of hibernation might -work: - -# echo shutdown > /sys/power/disk -# echo disk > /sys/power/state - -(it is similar to the "reboot" mode, but it requires you to press the power -button to make the system resume). - -If neither "platform" nor "shutdown" hibernation mode works, you will need to -identify what goes wrong. - -a) Test modes of hibernation - -To find out why hibernation fails on your system, you can use a special testing -facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then, -there is the file /sys/power/pm_test that can be used to make the hibernation -core run in a test mode. There are 5 test modes available: - -freezer -- test the freezing of processes - -devices -- test the freezing of processes and suspending of devices - -platform -- test the freezing of processes, suspending of devices and platform - global control methods(*) - -processors -- test the freezing of processes, suspending of devices, platform - global control methods(*) and the disabling of nonboot CPUs - -core -- test the freezing of processes, suspending of devices, platform global - control methods(*), the disabling of nonboot CPUs and suspending of - platform/system devices - -(*) the platform global control methods are only available on ACPI systems - and are only tested if the hibernation mode is set to "platform" - -To use one of them it is necessary to write the corresponding string to -/sys/power/pm_test (eg. "devices" to test the freezing of processes and -suspending devices) and issue the standard hibernation commands. For example, -to use the "devices" test mode along with the "platform" mode of hibernation, -you should do the following: - -# echo devices > /sys/power/pm_test -# echo platform > /sys/power/disk -# echo disk > /sys/power/state - -Then, the kernel will try to freeze processes, suspend devices, wait a few -seconds (5 by default, but configurable by the suspend.pm_test_delay module -parameter), resume devices and thaw processes. If "platform" is written to -/sys/power/pm_test , then after suspending devices the kernel will additionally -invoke the global control methods (eg. ACPI global control methods) used to -prepare the platform firmware for hibernation. Next, it will wait a -configurable number of seconds and invoke the platform (eg. ACPI) global -methods used to cancel hibernation etc. - -Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal -hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test -contains a space-separated list of all available tests (including "none" that -represents the normal functionality) in which the current test level is -indicated by square brackets. - -Generally, as you can see, each test level is more "invasive" than the previous -one and the "core" level tests the hardware and drivers as deeply as possible -without creating a hibernation image. Obviously, if the "devices" test fails, -the "platform" test will fail as well and so on. Thus, as a rule of thumb, you -should try the test modes starting from "freezer", through "devices", "platform" -and "processors" up to "core" (repeat the test on each level a couple of times -to make sure that any random factors are avoided). - -If the "freezer" test fails, there is a task that cannot be frozen (in that case -it usually is possible to identify the offending task by analysing the output of -dmesg obtained after the failing test). 
Failure at this level usually means -that there is a problem with the tasks freezer subsystem that should be -reported. - -If the "devices" test fails, most likely there is a driver that cannot suspend -or resume its device (in the latter case the system may hang or become unstable -after the test, so please take that into consideration). To find this driver, -you can carry out a binary search according to the rules: -- if the test fails, unload a half of the drivers currently loaded and repeat -(that would probably involve rebooting the system, so always note what drivers -have been loaded before the test), -- if the test succeeds, load a half of the drivers you have unloaded most -recently and repeat. - -Once you have found the failing driver (there can be more than just one of -them), you have to unload it every time before hibernation. In that case please -make sure to report the problem with the driver. - -It is also possible that the "devices" test will still fail after you have -unloaded all modules. In that case, you may want to look in your kernel -configuration for the drivers that can be compiled as modules (and test again -with these drivers compiled as modules). You may also try to use some special -kernel command line options such as "noapic", "noacpi" or even "acpi=off". - -If the "platform" test fails, there is a problem with the handling of the -platform (eg. ACPI) firmware on your system. In that case the "platform" mode -of hibernation is not likely to work. You can try the "shutdown" mode, but that -is rather a poor man's workaround. - -If the "processors" test fails, the disabling/enabling of nonboot CPUs does not -work (of course, this only may be an issue on SMP systems) and the problem -should be reported. In that case you can also try to switch the nonboot CPUs -off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and -see if that works. - -If the "core" test fails, which means that suspending of the system/platform -devices has failed (these devices are suspended on one CPU with interrupts off), -the problem is most probably hardware-related and serious, so it should be -reported. - -A failure of any of the "platform", "processors" or "core" tests may cause your -system to hang or become unstable, so please beware. Such a failure usually -indicates a serious problem that very well may be related to the hardware, but -please report it anyway. - -b) Testing minimal configuration - -If all of the hibernation test modes work, you can boot the system with the -"init=/bin/bash" command line parameter and attempt to hibernate in the -"reboot", "shutdown" and "platform" modes. If that does not work, there -probably is a problem with a driver statically compiled into the kernel and you -can try to compile more drivers as modules, so that they can be tested -individually. Otherwise, there is a problem with a modular driver and you can -find it by loading a half of the modules you normally use and binary searching -in accordance with the algorithm: -- if there are n modules loaded and the attempt to suspend and resume fails, -unload n/2 of the modules and try again (that would probably involve rebooting -the system), -- if there are n modules loaded and the attempt to suspend and resume succeeds, -load n/2 modules more and try again. - -Again, if you find the offending module(s), it(they) must be unloaded every time -before hibernation, and please report the problem with it(them). 
- -c) Using the "test_resume" hibernation option - -/sys/power/disk generally tells the kernel what to do after creating a -hibernation image. One of the available options is "test_resume" which -causes the just created image to be used for immediate restoration. Namely, -after doing: - -# echo test_resume > /sys/power/disk -# echo disk > /sys/power/state - -a hibernation image will be created and a resume from it will be triggered -immediately without involving the platform firmware in any way. - -That test can be used to check if failures to resume from hibernation are -related to bad interactions with the platform firmware. That is, if the above -works every time, but resume from actual hibernation does not work or is -unreliable, the platform firmware may be responsible for the failures. - -On architectures and platforms that support using different kernels to restore -hibernation images (that is, the kernel used to read the image from storage and -load it into memory is different from the one included in the image) or support -kernel address space randomization, it also can be used to check if failures -to resume may be related to the differences between the restore and image -kernels. - -d) Advanced debugging - -In case that hibernation does not work on your system even in the minimal -configuration and compiling more drivers as modules is not practical or some -modules cannot be unloaded, you can use one of the more advanced debugging -techniques to find the problem. First, if there is a serial port in your box, -you can boot the kernel with the 'no_console_suspend' parameter and try to log -kernel messages using the serial console. This may provide you with some -information about the reasons of the suspend (resume) failure. Alternatively, -it may be possible to use a FireWire port for debugging with firescope -(http://v3.sk/~lkundrak/firescope/). On x86 it is also possible to -use the PM_TRACE mechanism documented in Documentation/power/s2ram.txt . - -2. Testing suspend to RAM (STR) - -To verify that the STR works, it is generally more convenient to use the s2ram -tool available from http://suspend.sf.net and documented at -http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK). - -Namely, after writing "freezer", "devices", "platform", "processors", or "core" -into /sys/power/pm_test (available if the kernel is compiled with -CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding -to given string. The STR test modes are defined in the same way as for -hibernation, so please refer to Section 1 for more information about them. In -particular, the "core" test allows you to test everything except for the actual -invocation of the platform firmware in order to put the system into the sleep -state. - -Among other things, the testing with the help of /sys/power/pm_test may allow -you to identify drivers that fail to suspend or resume their devices. They -should be unloaded every time before an STR transition. - -Next, you can follow the instructions at S2RAM_LINK to test the system, but if -it does not work "out of the box", you may need to boot it with -"init=/bin/bash" and test s2ram in the minimal configuration. In that case, -you may be able to search for failing drivers by following the procedure -analogous to the one described in section 1. If you find some failing drivers, -you will have to unload them every time before an STR transition (ie. before -you run s2ram), and please report the problems with them. 
-
-There is a debugfs entry which shows the suspend to RAM statistics. Here is an
-example of its output.
-	# mount -t debugfs none /sys/kernel/debug
-	# cat /sys/kernel/debug/suspend_stats
-	success: 20
-	fail: 5
-	failed_freeze: 0
-	failed_prepare: 0
-	failed_suspend: 5
-	failed_suspend_noirq: 0
-	failed_resume: 0
-	failed_resume_noirq: 0
-	failures:
-		last_failed_dev:	alarm
-					adc
-		last_failed_errno:	-16
-					-16
-		last_failed_step:	suspend
-					suspend
-Field success means the success number of suspend to RAM, and field fail means
-the failure number. Others are the failure number of different steps of suspend
-to RAM. suspend_stats just lists the last 2 failed devices, error number and
-failed step of suspend.
diff --git a/Documentation/power/charger-manager.rst b/Documentation/power/charger-manager.rst
new file mode 100644
index 000000000000..84fab9376792
--- /dev/null
+++ b/Documentation/power/charger-manager.rst
@@ -0,0 +1,205 @@
+===============
+Charger Manager
+===============
+
+	(C) 2011 MyungJoo Ham , GPL
+
+Charger Manager provides in-kernel battery charger management for systems
+that require temperature monitoring during the suspend-to-RAM state, where
+each battery may have multiple chargers attached, and where the userland
+wants to look at the aggregated information of the multiple chargers.
+
+Charger Manager is a platform_driver with power-supply-class entries.
+An instance of Charger Manager (a platform-device created with Charger-Manager)
+represents an independent battery with chargers. If there are multiple
+batteries with their own chargers acting independently in a system,
+the system may need multiple instances of Charger Manager.
+
+1. Introduction
+===============
+
+Charger Manager supports the following:
+
+* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
+  A system may have multiple chargers (or power sources) and some of
+  them may be activated at the same time. Each charger may have its
+  own power-supply-class and each power-supply-class can provide
+  different information about the battery status. This framework
+  aggregates charger-related information from multiple sources and
+  shows combined information as a single power-supply-class.
+
+* Support for in suspend-to-RAM polling (with suspend_again callback)
+  While the battery is being charged and the system is in suspend-to-RAM,
+  we may need to monitor the battery health by looking at the ambient or
+  battery temperature. We can accomplish this by waking up the system
+  periodically. However, such a method unnecessarily wakes up devices,
+  tasks, and user processes that are supposed to be kept suspended just
+  to monitor the battery health. That, in turn, incurs unnecessary power
+  consumption and slows down the charging process. Even worse, such peak
+  power consumption can stop chargers in the middle of charging
+  (external power input < device power consumption), which affects not
+  only the charging time, but also the lifespan of the battery.
+
+  Charger Manager provides a function "cm_suspend_again" that can be
+  used as the suspend_again callback of platform_suspend_ops. If the
+  platform requires tasks other than cm_suspend_again, it may implement
+  its own suspend_again callback that calls cm_suspend_again in the
+  middle (see the sketch below). Normally, the platform will need to
+  resume and suspend some devices that are used by Charger Manager.
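+
+  For illustration only (the foo_* names are made up and not part of any
+  in-tree platform), such a wrapper might look like this::
+
+	#include <linux/suspend.h>
+	#include <linux/power/charger-manager.h>
+
+	/* Hypothetical platform code wrapping cm_suspend_again(). */
+	static bool foo_suspend_again(void)
+	{
+		bool suspend_again;
+
+		foo_resume_monitoring_devices();	/* assumed helper */
+		suspend_again = cm_suspend_again();
+		foo_suspend_monitoring_devices();	/* assumed helper */
+
+		return suspend_again;
+	}
+
+	static const struct platform_suspend_ops foo_suspend_ops = {
+		.valid		= suspend_valid_only_mem,
+		.enter		= foo_suspend_enter,	/* assumed */
+		.suspend_again	= foo_suspend_again,
+	};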
+
+* Support for premature full-battery event handling
+  If the battery voltage drops by "fullbatt_vchkdrop_uV" after
+  "fullbatt_vchkdrop_ms" from the full-battery event, the framework
+  restarts charging. This check is also performed while suspended, by
+  setting the wakeup time accordingly and using suspend_again.
+
+* Support for uevent-notify
+  With the charger-related events, the device sends
+  notification to users with UEVENT.
+
+2. Global Charger-Manager Data related to suspend_again
+=======================================================
+In order to set up Charger Manager with the suspend-again feature
+(in-suspend monitoring), the user should provide charger_global_desc
+with setup_charger_manager(`struct charger_global_desc *`).
+This charger_global_desc data for in-suspend monitoring is global
+as the name suggests. Thus, the user needs to provide it only once even
+if there are multiple batteries. If there are multiple batteries, the
+multiple instances of Charger Manager share the same charger_global_desc
+and it will manage in-suspend monitoring for all instances of Charger Manager.
+
+The user needs to provide all three entries of `struct charger_global_desc`
+properly in order to activate in-suspend monitoring:
+
+`char *rtc_name;`
+	The name of the rtc (e.g., "rtc0") used to wake up the system from
+	suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
+	should be able to wake up the system from suspend. Charger Manager
+	saves and restores the alarm value and uses the previously-defined
+	alarm if it is going to go off earlier than Charger Manager so that
+	Charger Manager does not interfere with previously-defined alarms.
+
+`bool (*rtc_only_wakeup)(void);`
+	This callback should let CM know whether
+	the wakeup-from-suspend is caused only by the alarm of "rtc" in the
+	same struct. If any other wakeup source triggered the
+	wakeup, it should return false. If the "rtc" is the only wakeup
+	reason, it should return true.
+
+`bool assume_timer_stops_in_suspend;`
+	If true, Charger Manager assumes that
+	the timer (CM uses jiffies as the timer) stops during suspend. Then, CM
+	assumes that the suspend duration is the same as the alarm length.
+
+
+3. How to set up suspend_again
+==============================
+Charger Manager provides a function "extern bool cm_suspend_again(void)".
+When cm_suspend_again is called, it monitors every battery. The suspend_ops
+callback of the system's platform_suspend_ops can call the cm_suspend_again
+function to know whether Charger Manager wants to suspend again or not.
+If there are no other devices or tasks that want to use the suspend_again
+feature, the platform_suspend_ops may directly refer to cm_suspend_again
+for its suspend_again callback.
+
+cm_suspend_again() returns true (meaning "I want to suspend again")
+if the system was woken up by Charger Manager and the polling
+(in-suspend monitoring) results in "normal".
+
+4. Charger-Manager Data (struct charger_desc)
+=============================================
+For each battery charged independently from other batteries (if a series of
+batteries are charged by a single charger, they are counted as one independent
+battery), an instance of Charger Manager is attached to it. The following
+elements of struct charger_desc describe the battery:
+
+`char *psy_name;`
+	The power-supply-class name of the battery. Default is
+	"battery" if psy_name is NULL. Users can access the psy entries
+	at "/sys/class/power_supply/[psy_name]/".
+
+`enum polling_modes polling_mode;`
+	CM_POLL_DISABLE:
+		do not poll this battery.
+	CM_POLL_ALWAYS:
+		always poll this battery.
+	CM_POLL_EXTERNAL_POWER_ONLY:
+		poll this battery if and only if an external power
+		source is attached.
+	CM_POLL_CHARGING_ONLY:
+		poll this battery if and only if the battery is being charged.
+
+`unsigned int fullbatt_vchkdrop_ms; / unsigned int fullbatt_vchkdrop_uV;`
+	If both have non-zero values, Charger Manager will check the
+	battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
+	charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
+	Manager will try to recharge the battery by disabling and enabling
+	chargers. Recharging on the voltage-drop condition alone (without the
+	delay condition) needs to be implemented with hardware interrupts from
+	fuel gauges or charger devices/chips.
+
+`unsigned int fullbatt_uV;`
+	If specified with a non-zero value, Charger Manager assumes
+	that the battery is full (capacity = 100) if the battery is not being
+	charged and the battery voltage is equal to or greater than
+	fullbatt_uV.
+
+`unsigned int polling_interval_ms;`
+	Required polling interval in ms. Charger Manager will poll
+	this battery every polling_interval_ms or more frequently.
+
+`enum data_source battery_present;`
+	CM_BATTERY_PRESENT:
+		assume that the battery exists.
+	CM_NO_BATTERY:
+		assume that the battery does not exist.
+	CM_FUEL_GAUGE:
+		get battery presence information from the fuel gauge.
+	CM_CHARGER_STAT:
+		get battery presence from the chargers.
+
+`char **psy_charger_stat;`
+	An array ending with NULL that has the power-supply-class names of
+	the chargers. Each power-supply-class should provide "PRESENT" (if
+	battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
+	external power source is attached or not), and "STATUS" (shows whether
+	the battery is {"FULL" or not FULL} or {"FULL", "Charging",
+	"Discharging", "NotCharging"}).
+
+`int num_charger_regulators; / struct regulator_bulk_data *charger_regulators;`
+	Regulators representing the chargers in the form used by the
+	regulator framework's bulk functions.
+
+`char *psy_fuel_gauge;`
+	Power-supply-class name of the fuel gauge.
+
+`int (*temperature_out_of_range)(int *mC); / bool measure_battery_temp;`
+	This callback returns 0 if the temperature is safe for charging,
+	a positive number if it is too hot to charge, and a negative number
+	if it is too cold to charge. Through the variable mC, the callback
+	returns the temperature in 1/1000 of a degree centigrade.
+	The temperature source is either the battery or the ambient
+	environment, according to the value of measure_battery_temp.
+
+
+5. Notify Charger-Manager of charger events: cm_notify_event()
+==============================================================
+If a charger event needs to be delivered to
+Charger Manager, the charger device driver that triggers the event can call
+cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
+In the function, psy is the charger driver's power_supply pointer, which is
+associated with Charger-Manager. The parameter "type"
+is the same as the irq's type (enum cm_event_types). The event message "msg" is
+optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
+
+6. Other Considerations
+=======================
+
+The system should be configured to wake up on charger/battery-related events
+such as battery-pulled-out, charger-pulled-out, charger-inserted,
+DCIN-over/under-voltage, charger-stopped, and others critical to chargers.
+At least the following should wake up the system from a suspend: +a) charger-on/off b) external-power-in/out c) battery-in/out (while charging) + +It is usually accomplished by configuring the PMIC as a wakeup source. diff --git a/Documentation/power/charger-manager.txt b/Documentation/power/charger-manager.txt deleted file mode 100644 index 9ff1105e58d6..000000000000 --- a/Documentation/power/charger-manager.txt +++ /dev/null @@ -1,200 +0,0 @@ -Charger Manager - (C) 2011 MyungJoo Ham , GPL - -Charger Manager provides in-kernel battery charger management that -requires temperature monitoring during suspend-to-RAM state -and where each battery may have multiple chargers attached and the userland -wants to look at the aggregated information of the multiple chargers. - -Charger Manager is a platform_driver with power-supply-class entries. -An instance of Charger Manager (a platform-device created with Charger-Manager) -represents an independent battery with chargers. If there are multiple -batteries with their own chargers acting independently in a system, -the system may need multiple instances of Charger Manager. - -1. Introduction -=============== - -Charger Manager supports the following: - -* Support for multiple chargers (e.g., a device with USB, AC, and solar panels) - A system may have multiple chargers (or power sources) and some of - they may be activated at the same time. Each charger may have its - own power-supply-class and each power-supply-class can provide - different information about the battery status. This framework - aggregates charger-related information from multiple sources and - shows combined information as a single power-supply-class. - -* Support for in suspend-to-RAM polling (with suspend_again callback) - While the battery is being charged and the system is in suspend-to-RAM, - we may need to monitor the battery health by looking at the ambient or - battery temperature. We can accomplish this by waking up the system - periodically. However, such a method wakes up devices unnecessarily for - monitoring the battery health and tasks, and user processes that are - supposed to be kept suspended. That, in turn, incurs unnecessary power - consumption and slow down charging process. Or even, such peak power - consumption can stop chargers in the middle of charging - (external power input < device power consumption), which not - only affects the charging time, but the lifespan of the battery. - - Charger Manager provides a function "cm_suspend_again" that can be - used as suspend_again callback of platform_suspend_ops. If the platform - requires tasks other than cm_suspend_again, it may implement its own - suspend_again callback that calls cm_suspend_again in the middle. - Normally, the platform will need to resume and suspend some devices - that are used by Charger Manager. - -* Support for premature full-battery event handling - If the battery voltage drops by "fullbatt_vchkdrop_uV" after - "fullbatt_vchkdrop_ms" from the full-battery event, the framework - restarts charging. This check is also performed while suspended by - setting wakeup time accordingly and using suspend_again. - -* Support for uevent-notify - With the charger-related events, the device sends - notification to users with UEVENT. - -2. 
Global Charger-Manager Data related with suspend_again -======================================================== -In order to setup Charger Manager with suspend-again feature -(in-suspend monitoring), the user should provide charger_global_desc -with setup_charger_manager(struct charger_global_desc *). -This charger_global_desc data for in-suspend monitoring is global -as the name suggests. Thus, the user needs to provide only once even -if there are multiple batteries. If there are multiple batteries, the -multiple instances of Charger Manager share the same charger_global_desc -and it will manage in-suspend monitoring for all instances of Charger Manager. - -The user needs to provide all the three entries properly in order to activate -in-suspend monitoring: - -struct charger_global_desc { - -char *rtc_name; - : The name of rtc (e.g., "rtc0") used to wakeup the system from - suspend for Charger Manager. The alarm interrupt (AIE) of the rtc - should be able to wake up the system from suspend. Charger Manager - saves and restores the alarm value and use the previously-defined - alarm if it is going to go off earlier than Charger Manager so that - Charger Manager does not interfere with previously-defined alarms. - -bool (*rtc_only_wakeup)(void); - : This callback should let CM know whether - the wakeup-from-suspend is caused only by the alarm of "rtc" in the - same struct. If there is any other wakeup source triggered the - wakeup, it should return false. If the "rtc" is the only wakeup - reason, it should return true. - -bool assume_timer_stops_in_suspend; - : if true, Charger Manager assumes that - the timer (CM uses jiffies as timer) stops during suspend. Then, CM - assumes that the suspend-duration is same as the alarm length. -}; - -3. How to setup suspend_again -============================= -Charger Manager provides a function "extern bool cm_suspend_again(void)". -When cm_suspend_again is called, it monitors every battery. The suspend_ops -callback of the system's platform_suspend_ops can call cm_suspend_again -function to know whether Charger Manager wants to suspend again or not. -If there are no other devices or tasks that want to use suspend_again -feature, the platform_suspend_ops may directly refer to cm_suspend_again -for its suspend_again callback. - -The cm_suspend_again() returns true (meaning "I want to suspend again") -if the system was woken up by Charger Manager and the polling -(in-suspend monitoring) results in "normal". - -4. Charger-Manager Data (struct charger_desc) -============================================= -For each battery charged independently from other batteries (if a series of -batteries are charged by a single charger, they are counted as one independent -battery), an instance of Charger Manager is attached to it. - -struct charger_desc { - -char *psy_name; - : The power-supply-class name of the battery. Default is - "battery" if psy_name is NULL. Users can access the psy entries - at "/sys/class/power_supply/[psy_name]/". - -enum polling_modes polling_mode; - : CM_POLL_DISABLE: do not poll this battery. - CM_POLL_ALWAYS: always poll this battery. - CM_POLL_EXTERNAL_POWER_ONLY: poll this battery if and only if - an external power source is attached. - CM_POLL_CHARGING_ONLY: poll this battery if and only if the - battery is being charged. 
- -unsigned int fullbatt_vchkdrop_ms; -unsigned int fullbatt_vchkdrop_uV; - : If both have non-zero values, Charger Manager will check the - battery voltage drop fullbatt_vchkdrop_ms after the battery is fully - charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger - Manager will try to recharge the battery by disabling and enabling - chargers. Recharge with voltage drop condition only (without delay - condition) is needed to be implemented with hardware interrupts from - fuel gauges or charger devices/chips. - -unsigned int fullbatt_uV; - : If specified with a non-zero value, Charger Manager assumes - that the battery is full (capacity = 100) if the battery is not being - charged and the battery voltage is equal to or greater than - fullbatt_uV. - -unsigned int polling_interval_ms; - : Required polling interval in ms. Charger Manager will poll - this battery every polling_interval_ms or more frequently. - -enum data_source battery_present; - : CM_BATTERY_PRESENT: assume that the battery exists. - CM_NO_BATTERY: assume that the battery does not exists. - CM_FUEL_GAUGE: get battery presence information from fuel gauge. - CM_CHARGER_STAT: get battery presence from chargers. - -char **psy_charger_stat; - : An array ending with NULL that has power-supply-class names of - chargers. Each power-supply-class should provide "PRESENT" (if - battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an - external power source is attached or not), and "STATUS" (shows whether - the battery is {"FULL" or not FULL} or {"FULL", "Charging", - "Discharging", "NotCharging"}). - -int num_charger_regulators; -struct regulator_bulk_data *charger_regulators; - : Regulators representing the chargers in the form for - regulator framework's bulk functions. - -char *psy_fuel_gauge; - : Power-supply-class name of the fuel gauge. - -int (*temperature_out_of_range)(int *mC); -bool measure_battery_temp; - : This callback returns 0 if the temperature is safe for charging, - a positive number if it is too hot to charge, and a negative number - if it is too cold to charge. With the variable mC, the callback returns - the temperature in 1/1000 of centigrade. - The source of temperature can be battery or ambient one according to - the value of measure_battery_temp. -}; - -5. Notify Charger-Manager of charger events: cm_notify_event() -========================================================= -If there is an charger event is required to notify -Charger Manager, a charger device driver that triggers the event can call -cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager. -In the function, psy is the charger driver's power_supply pointer, which is -associated with Charger-Manager. The parameter "type" -is the same as irq's type (enum cm_event_types). The event message "msg" is -optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS". - -6. Other Considerations -======================= - -At the charger/battery-related events such as battery-pulled-out, -charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped, -and others critical to chargers, the system should be configured to wake up. -At least the following should wake up the system from a suspend: -a) charger-on/off b) external-power-in/out c) battery-in/out (while charging) - -It is usually accomplished by configuring the PMIC as a wakeup source. 
diff --git a/Documentation/power/drivers-testing.rst b/Documentation/power/drivers-testing.rst new file mode 100644 index 000000000000..e53f1999fc39 --- /dev/null +++ b/Documentation/power/drivers-testing.rst @@ -0,0 +1,51 @@ +==================================================== +Testing suspend and resume support in device drivers +==================================================== + + (C) 2007 Rafael J. Wysocki , GPL + +1. Preparing the test system +============================ + +Unfortunately, to effectively test the support for the system-wide suspend and +resume transitions in a driver, it is necessary to suspend and resume a fully +functional system with this driver loaded. Moreover, that should be done +several times, preferably several times in a row, and separately for hibernation +(aka suspend to disk or STD) and suspend to RAM (STR), because each of these +cases involves slightly different operations and different interactions with +the machine's BIOS. + +Of course, for this purpose the test system has to be known to suspend and +resume without the driver being tested. Thus, if possible, you should first +resolve all suspend/resume-related problems in the test system before you start +testing the new driver. Please see Documentation/power/basic-pm-debugging.rst +for more information about the debugging of suspend/resume functionality. + +2. Testing the driver +===================== + +Once you have resolved the suspend/resume-related problems with your test system +without the new driver, you are ready to test it: + +a) Build the driver as a module, load it and try the test modes of hibernation + (see: Documentation/power/basic-pm-debugging.rst, 1). + +b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and + "platform" modes (see: Documentation/power/basic-pm-debugging.rst, 1). + +c) Compile the driver directly into the kernel and try the test modes of + hibernation. + +d) Attempt to hibernate with the driver compiled directly into the kernel + in the "reboot", "shutdown" and "platform" modes. + +e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.rst, + 2). [As far as the STR tests are concerned, it should not matter whether or + not the driver is built as a module.] + +f) Attempt to suspend to RAM using the s2ram tool with the driver loaded + (see: Documentation/power/basic-pm-debugging.rst, 2). + +Each of the above tests should be repeated several times and the STD tests +should be mixed with the STR tests. If any of them fails, the driver cannot be +regarded as suspend/resume-safe. diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt deleted file mode 100644 index 638afdf4d6b8..000000000000 --- a/Documentation/power/drivers-testing.txt +++ /dev/null @@ -1,46 +0,0 @@ -Testing suspend and resume support in device drivers - (C) 2007 Rafael J. Wysocki , GPL - -1. Preparing the test system - -Unfortunately, to effectively test the support for the system-wide suspend and -resume transitions in a driver, it is necessary to suspend and resume a fully -functional system with this driver loaded. Moreover, that should be done -several times, preferably several times in a row, and separately for hibernation -(aka suspend to disk or STD) and suspend to RAM (STR), because each of these -cases involves slightly different operations and different interactions with -the machine's BIOS. 
-
-Of course, for this purpose the test system has to be known to suspend and
-resume without the driver being tested. Thus, if possible, you should first
-resolve all suspend/resume-related problems in the test system before you start
-testing the new driver. Please see Documentation/power/basic-pm-debugging.txt
-for more information about the debugging of suspend/resume functionality.
-
-2. Testing the driver
-
-Once you have resolved the suspend/resume-related problems with your test system
-without the new driver, you are ready to test it:
-
-a) Build the driver as a module, load it and try the test modes of hibernation
-   (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
-   "platform" modes (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-c) Compile the driver directly into the kernel and try the test modes of
-   hibernation.
-
-d) Attempt to hibernate with the driver compiled directly into the kernel
-   in the "reboot", "shutdown" and "platform" modes.
-
-e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.txt,
-   2). [As far as the STR tests are concerned, it should not matter whether or
-   not the driver is built as a module.]
-
-f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
-   (see: Documentation/power/basic-pm-debugging.txt, 2).
-
-Each of the above tests should be repeated several times and the STD tests
-should be mixed with the STR tests. If any of them fails, the driver cannot be
-regarded as suspend/resume-safe.
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
new file mode 100644
index 000000000000..90a345d57ae9
--- /dev/null
+++ b/Documentation/power/energy-model.rst
@@ -0,0 +1,147 @@
+====================
+Energy Model of CPUs
+====================
+
+1. Overview
+-----------
+
+The Energy Model (EM) framework serves as an interface between drivers knowing
+the power consumed by CPUs at various performance levels, and the kernel
+subsystems willing to use that information to make energy-aware decisions.
+
+The source of the information about the power consumed by CPUs can vary greatly
+from one platform to another. These power costs can be estimated using
+devicetree data in some cases. In others, the firmware will know better.
+Alternatively, userspace might be best positioned. And so on. To avoid having
+each and every client subsystem re-implement support for each and every
+possible source of information on its own, the EM framework acts as an
+abstraction layer which standardizes the format of power cost tables in the
+kernel, hence avoiding redundant work.
+
+The figure below depicts an example of drivers (Arm-specific here, but the
+approach is applicable to any architecture) providing power costs to the EM
+framework, and interested clients reading the data from it::
+
+       +---------------+  +-----------------+  +---------------+
+       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
+       +---------------+  +-----------------+  +---------------+
+               |                   | em_pd_energy()    |
+               |                   | em_cpu_get()      |
+               +---------+         |         +---------+
+                         |         |         |
+                         v         v         v
+                        +---------------------+
+                        |    Energy Model     |
+                        |      Framework      |
+                        +---------------------+
+                           ^       ^       ^
+                           |       |       | em_register_perf_domain()
+                +----------+       |       +---------+
+                |                  |                 |
+        +---------------+  +---------------+  +--------------+
+        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
+        +---------------+  +---------------+  +--------------+
+                ^                  ^                 ^
+                |                  |                 |
+        +--------------+   +---------------+  +--------------+
+        | Device Tree  |   |   Firmware    |  |      ?       |
+        +--------------+   +---------------+  +--------------+
+
+The EM framework manages power cost tables per 'performance domain' in the
+system. A performance domain is a group of CPUs whose performance is scaled
+together. Performance domains generally have a 1-to-1 mapping with CPUFreq
+policies. All CPUs in a performance domain are required to have the same
+micro-architecture. CPUs in different performance domains can have different
+micro-architectures.
+
+
+2. Core APIs
+------------
+
+2.1 Config options
+^^^^^^^^^^^^^^^^^^
+
+CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
+
+
+2.2 Registration of performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Drivers are expected to register performance domains into the EM framework by
+calling the following API::
+
+  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+			      struct em_data_callback *cb);
+
+Drivers must specify the CPUs of the performance domains using the cpumask
+argument, and provide a callback function returning <frequency, power> tuples
+for each capacity state. The callback function provided by the driver is free
+to fetch data from any relevant location (DT, firmware, ...), and by any means
+deemed necessary. See Section 3. for an example of a driver implementing this
+callback, and kernel/power/energy_model.c for further documentation on this
+API.
+
+
+2.3 Accessing performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Subsystems interested in the energy model of a CPU can retrieve it using the
+em_cpu_get() API. The energy model tables are allocated once upon creation of
+the performance domains, and kept in memory untouched.
+
+The energy consumed by a performance domain can be estimated using the
+em_pd_energy() API. The estimation is performed assuming that the schedutil
+CPUfreq governor is in use.
+
+More details about the above APIs can be found in include/linux/energy_model.h.
+
+
+3. Example driver
+-----------------
+
+This section provides a simple example of a CPUFreq driver registering a
+performance domain in the Energy Model framework using the (fake) 'foo'
+protocol. The driver implements an est_power() function to be provided to the
+EM framework::
+
+  -> drivers/cpufreq/foo_cpufreq.c
+
+  static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
+  {
+	long freq, power;
+
+	/* Use the 'foo' protocol to ceil the frequency */
+	freq = foo_get_freq_ceil(cpu, *KHz);
+	if (freq < 0)
+		return freq;
+
+	/* Estimate the power cost for the CPU at the relevant freq. */
+	power = foo_estimate_power(cpu, freq);
+	if (power < 0)
+		return power;
+
+	/* Return the values to the EM framework */
+	*mW = power;
+	*KHz = freq;
+
+	return 0;
+  }
+
+  static int foo_cpufreq_init(struct cpufreq_policy *policy)
+  {
+	struct em_data_callback em_cb = EM_DATA_CB(est_power);
+	int nr_opp, ret;
+
+	/* Do the actual CPUFreq init work ... */
+	ret = do_foo_cpufreq_init(policy);
+	if (ret)
+		return ret;
+
+	/* Find the number of OPPs for this policy */
+	nr_opp = foo_get_nr_opp(policy);
+
+	/* And register the new performance domain */
+	em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+
+	return 0;
+  }
diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt
deleted file mode 100644
index a2b0ae4c76bd..000000000000
--- a/Documentation/power/energy-model.txt
+++ /dev/null
@@ -1,144 +0,0 @@
- ====================
- Energy Model of CPUs
- ====================
-
-1. Overview
------------
-
-The Energy Model (EM) framework serves as an interface between drivers knowing
-the power consumed by CPUs at various performance levels, and the kernel
-subsystems willing to use that information to make energy-aware decisions.
-
-The source of the information about the power consumed by CPUs can vary greatly
-from one platform to another. These power costs can be estimated using
-devicetree data in some cases. In others, the firmware will know better.
-Alternatively, userspace might be best positioned. And so on. In order to avoid
-each and every client subsystem to re-implement support for each and every
-possible source of information on its own, the EM framework intervenes as an
-abstraction layer which standardizes the format of power cost tables in the
-kernel, hence enabling to avoid redundant work.
-
-The figure below depicts an example of drivers (Arm-specific here, but the
-approach is applicable to any architecture) providing power costs to the EM
-framework, and interested clients reading the data from it.
-
-       +---------------+  +-----------------+  +---------------+
-       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
-       +---------------+  +-----------------+  +---------------+
-               |                   | em_pd_energy()    |
-               |                   | em_cpu_get()      |
-               +---------+         |         +---------+
-                         |         |         |
-                         v         v         v
-                        +---------------------+
-                        |    Energy Model     |
-                        |      Framework      |
-                        +---------------------+
-                           ^       ^       ^
-                           |       |       | em_register_perf_domain()
-                +----------+       |       +---------+
-                |                  |                 |
-        +---------------+  +---------------+  +--------------+
-        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
-        +---------------+  +---------------+  +--------------+
-                ^                  ^                 ^
-                |                  |                 |
-        +--------------+   +---------------+  +--------------+
-        | Device Tree  |   |   Firmware    |  |      ?       |
-        +--------------+   +---------------+  +--------------+
-
-The EM framework manages power cost tables per 'performance domain' in the
-system. A performance domain is a group of CPUs whose performance is scaled
-together. Performance domains generally have a 1-to-1 mapping with CPUFreq
-policies. All CPUs in a performance domain are required to have the same
-micro-architecture. CPUs in different performance domains can have different
-micro-architectures.
-
-
-2. Core APIs
-------------
-
-  2.1 Config options
-
-CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
- - - 2.2 Registration of performance domains - -Drivers are expected to register performance domains into the EM framework by -calling the following API: - - int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb); - -Drivers must specify the CPUs of the performance domains using the cpumask -argument, and provide a callback function returning tuples -for each capacity state. The callback function provided by the driver is free -to fetch data from any relevant location (DT, firmware, ...), and by any mean -deemed necessary. See Section 3. for an example of driver implementing this -callback, and kernel/power/energy_model.c for further documentation on this -API. - - - 2.3 Accessing performance domains - -Subsystems interested in the energy model of a CPU can retrieve it using the -em_cpu_get() API. The energy model tables are allocated once upon creation of -the performance domains, and kept in memory untouched. - -The energy consumed by a performance domain can be estimated using the -em_pd_energy() API. The estimation is performed assuming that the schedutil -CPUfreq governor is in use. - -More details about the above APIs can be found in include/linux/energy_model.h. - - -3. Example driver ------------------ - -This section provides a simple example of a CPUFreq driver registering a -performance domain in the Energy Model framework using the (fake) 'foo' -protocol. The driver implements an est_power() function to be provided to the -EM framework. - - -> drivers/cpufreq/foo_cpufreq.c - -01 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu) -02 { -03 long freq, power; -04 -05 /* Use the 'foo' protocol to ceil the frequency */ -06 freq = foo_get_freq_ceil(cpu, *KHz); -07 if (freq < 0); -08 return freq; -09 -10 /* Estimate the power cost for the CPU at the relevant freq. */ -11 power = foo_estimate_power(cpu, freq); -12 if (power < 0); -13 return power; -14 -15 /* Return the values to the EM framework */ -16 *mW = power; -17 *KHz = freq; -18 -19 return 0; -20 } -21 -22 static int foo_cpufreq_init(struct cpufreq_policy *policy) -23 { -24 struct em_data_callback em_cb = EM_DATA_CB(est_power); -25 int nr_opp, ret; -26 -27 /* Do the actual CPUFreq init work ... */ -28 ret = do_foo_cpufreq_init(policy); -29 if (ret) -30 return ret; -31 -32 /* Find the number of OPPs for this policy */ -33 nr_opp = foo_get_nr_opp(policy); -34 -35 /* And register the new performance domain */ -36 em_register_perf_domain(policy->cpus, nr_opp, &em_cb); -37 -38 return 0; -39 } diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst new file mode 100644 index 000000000000..ef110fe55e82 --- /dev/null +++ b/Documentation/power/freezing-of-tasks.rst @@ -0,0 +1,244 @@ +================= +Freezing of tasks +================= + +(C) 2007 Rafael J. Wysocki , GPL + +I. What is the freezing of tasks? +================================= + +The freezing of tasks is a mechanism by which user space processes and some +kernel threads are controlled during hibernation or system-wide suspend (on some +architectures). + +II. How does it work? +===================== + +There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN +and PF_FREEZER_SKIP (the last one is auxiliary). 
The tasks that have +PF_NOFREEZE unset (all user space processes and some kernel threads) are +regarded as 'freezable' and treated in a special way before the system enters a +suspend state as well as before a hibernation image is created (in what follows +we only consider hibernation, but the description also applies to suspend). + +Namely, as the first step of the hibernation procedure the function +freeze_processes() (defined in kernel/power/process.c) is called. A system-wide +variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate +whether the system is to undergo a freezing operation. And freeze_processes() +sets this variable. After this, it executes try_to_freeze_tasks() that sends a +fake signal to all user space processes, and wakes up all the kernel threads. +All freezable tasks must react to that by calling try_to_freeze(), which +results in a call to __refrigerator() (defined in kernel/freezer.c), which sets +the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes +it loop until PF_FROZEN is cleared for it. Then, we say that the task is +'frozen' and therefore the set of functions handling this mechanism is referred +to as 'the freezer' (these functions are defined in kernel/power/process.c, +kernel/freezer.c & include/linux/freezer.h). User space processes are generally +frozen before kernel threads. + +__refrigerator() must not be called directly. Instead, use the +try_to_freeze() function (defined in include/linux/freezer.h), that checks +if the task is to be frozen and makes the task enter __refrigerator(). + +For user space processes try_to_freeze() is called automatically from the +signal-handling code, but the freezable kernel threads need to call it +explicitly in suitable places or use the wait_event_freezable() or +wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) +that combine interruptible sleep with checking if the task is to be frozen and +calling try_to_freeze(). The main loop of a freezable kernel thread may look +like the following one:: + + set_freezable(); + do { + hub_events(); + wait_event_freezable(khubd_wait, + !list_empty(&hub_event_list) || + kthread_should_stop()); + } while (!kthread_should_stop() || !list_empty(&hub_event_list)); + +(from drivers/usb/core/hub.c::hub_thread()). + +If a freezable kernel thread fails to call try_to_freeze() after the freezer has +initiated a freezing operation, the freezing of tasks will fail and the entire +hibernation operation will be cancelled. For this reason, freezable kernel +threads must call try_to_freeze() somewhere or use one of the +wait_event_freezable() and wait_event_freezable_timeout() macros. + +After the system memory state has been restored from a hibernation image and +devices have been reinitialized, the function thaw_processes() is called in +order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that +have been frozen leave __refrigerator() and continue running. 
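+
+For comparison, below is a minimal sketch (not taken from the kernel tree;
+foo_thread() and foo_do_work() are made-up names) of a freezable kernel
+thread that calls try_to_freeze() explicitly instead of using the
+wait_event_freezable*() helpers::
+
+	static int foo_thread(void *data)
+	{
+		set_freezable();	/* clear PF_NOFREEZE for this thread */
+
+		while (!kthread_should_stop()) {
+			foo_do_work();	/* hypothetical work item */
+
+			/*
+			 * Enter __refrigerator() here if a freezing operation
+			 * is in progress; returns after the thaw.
+			 */
+			try_to_freeze();
+
+			schedule_timeout_interruptible(HZ);
+		}
+
+		return 0;
+	}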
+
+
+Rationale behind the functions dealing with freezing and thawing of tasks
+-------------------------------------------------------------------------
+
+freeze_processes():
+  - freezes only userspace tasks
+
+freeze_kernel_threads():
+  - freezes all tasks (including kernel threads) because we can't freeze
+    kernel threads without freezing userspace tasks
+
+thaw_kernel_threads():
+  - thaws only kernel threads; this is particularly useful if we need to do
+    anything special in between thawing of kernel threads and thawing of
+    userspace tasks, or if we want to postpone the thawing of userspace tasks
+
+thaw_processes():
+  - thaws all tasks (including kernel threads) because we can't thaw userspace
+    tasks without thawing kernel threads
+
+
+III. Which kernel threads are freezable?
+========================================
+
+Kernel threads are not freezable by default. However, a kernel thread may clear
+PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
+directly is not allowed). From this point on it is regarded as freezable
+and must call try_to_freeze() in a suitable place.
+
+IV. Why do we do that?
+======================
+
+Generally speaking, there are a couple of reasons to use the freezing of tasks:
+
+1. The principal reason is to prevent filesystems from being damaged after
+   hibernation. At the moment we have no simple means of checkpointing
+   filesystems, so if there are any modifications made to filesystem data and/or
+   metadata on disks, we cannot bring them back to the state from before the
+   modifications. At the same time each hibernation image contains some
+   filesystem-related information that must be consistent with the state of the
+   on-disk data and metadata after the system memory state has been restored
+   from the image (otherwise the filesystems will be damaged in a nasty way,
+   usually making them almost impossible to repair). We therefore freeze
+   tasks that might cause the on-disk filesystems' data and metadata to be
+   modified after the hibernation image has been created and before the
+   system is finally powered off. The majority of these are user space
+   processes, but if any of the kernel threads may cause something like this
+   to happen, they have to be freezable.
+
+2. Next, to create the hibernation image we need to free a sufficient amount of
+   memory (approximately 50% of available RAM) and we need to do that before
+   devices are deactivated, because we generally need them for swapping out.
+   Then, after the memory for the image has been freed, we don't want tasks
+   to allocate additional memory and we prevent them from doing that by
+   freezing them earlier. [Of course, this also means that device drivers
+   should not allocate substantial amounts of memory from their .suspend()
+   callbacks before hibernation, but this is a separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
+   from interfering with the suspending and resuming of devices. A user space
+   process running on a second CPU while we are suspending devices may, for
+   example, be troublesome and without the freezing of tasks we would need some
+   safeguards against race conditions that might occur in such a case.
+
+Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
+of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
+
+"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
+
+Linus: In many ways, 'at all'.
+
+I **do** realize the IO request queue issues, and that we cannot actually do
+s2ram with some devices in the middle of a DMA. So we want to be able to
+avoid *that*, there's no question about that. And I suspect that stopping
+user threads and then waiting for a sync is practically one of the easier
+ways to do so.
+
+So in practice, the 'at all' may become a 'why freeze kernel threads?' and
+freezing user threads I don't find really objectionable."
+
+Still, there are kernel threads that may want to be freezable. For example, if
+a kernel thread that belongs to a device driver accesses the device directly, it
+in principle needs to know when the device is suspended, so that it doesn't try
+to access it at that time. However, if the kernel thread is freezable, it will
+be frozen before the driver's .suspend() callback is executed and it will be
+thawed after the driver's .resume() callback has run, so it won't be accessing
+the device while it's suspended.
+
+4. Another reason for freezing tasks is to prevent user space processes from
+   realizing that a hibernation (or suspend) operation is taking place. Ideally,
+   user space processes should not notice that such a system-wide operation has
+   occurred and should continue running without any problems after the restore
+   (or resume from suspend). Unfortunately, in the most general case this
+   is quite difficult to achieve without the freezing of tasks. Consider,
+   for example, a process that depends on all CPUs being online while it's
+   running. Since we need to disable nonboot CPUs during the hibernation,
+   if this process is not frozen, it may notice that the number of CPUs has
+   changed and may start to work incorrectly because of that.
+
+V. Are there any problems related to the freezing of tasks?
+===========================================================
+
+Yes, there are.
+
+First of all, the freezing of kernel threads may be tricky if they depend on
+one another. For example, if kernel thread A waits for a completion (in the
+TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
+and B is frozen in the meantime, then A will be blocked until B is thawed, which
+may be undesirable. That's why kernel threads are not freezable by default.
+
+Second, there are the following two problems related to the freezing of user
+space processes:
+
+1. Putting processes into an uninterruptible sleep distorts the load average.
+2. Now that we have FUSE, plus the framework for doing device drivers in
+   userspace, it gets even more complicated because some userspace processes are
+   now doing the sorts of things that kernel threads do
+   (https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
+
+Problem 1. seems to be fixable, although it hasn't been fixed so far. The
+other one is more serious, but it seems that we can work around it by using
+hibernation (and suspend) notifiers (in that case, though, we won't be able to
+prevent user space processes from realizing that the hibernation is taking
+place).
+
+There are also problems that the freezing of tasks tends to expose, although
+they are not directly related to it. For example, if request_firmware() is
+called from a device driver's .resume() routine, it will time out and eventually
+fail, because the userland process that should respond to the request is frozen
+at this point. So, seemingly, the failure is due to the freezing of tasks.
+Suppose, however, that the firmware file is located on a filesystem accessible
+only through another device that hasn't been resumed yet. In that case,
+request_firmware() will fail regardless of whether or not the freezing of tasks
+is used. Consequently, the problem is not really related to the freezing of
+tasks, since it generally exists anyway.
+
+A driver must have all the firmware images it may need in RAM before suspend()
+is called. If keeping them is not practical, for example due to their size, they
+must be requested early enough using the suspend notifier API described in
+Documentation/driver-api/pm/notifiers.rst.
+
+VI. Are there any precautions to be taken to prevent freezing failures?
+=======================================================================
+
+Yes, there are.
+
+First of all, grabbing the 'system_transition_mutex' lock to mutually exclude
+a piece of code from system-wide sleep such as suspend/hibernation is not
+encouraged. If possible, that piece of code must instead hook onto the
+suspend/hibernation notifiers to achieve mutual exclusion. Look at the
+CPU-Hotplug code (kernel/cpu.c) for an example.
+
+However, if that is not feasible, and grabbing 'system_transition_mutex' is
+deemed necessary, directly calling mutex_[un]lock(&system_transition_mutex) is
+strongly discouraged, since that could lead to freezing failures: if the
+suspend/hibernate code has successfully acquired the 'system_transition_mutex'
+lock, the other entity fails to acquire the lock and gets blocked in the
+TASK_UNINTERRUPTIBLE state. As a consequence, the freezer would not be able to
+freeze that task, leading to freezing failure.
+
+The [un]lock_system_sleep() APIs, however, are safe to use in this scenario,
+because they ask the freezer to skip freezing the task, which is anyway
+"frozen enough" as it is blocked on 'system_transition_mutex', which will be
+released only after the entire suspend/hibernation sequence is complete.
+So, to summarize, use [un]lock_system_sleep() instead of directly using
+mutex_[un]lock(&system_transition_mutex). That will prevent freezing failures.
+
+VII. Miscellaneous
+==================
+
+/sys/power/pm_freeze_timeout controls the maximum time it may take to freeze
+all user space processes or all freezable kernel threads, in milliseconds.
+The default value is 20000; the valid range is that of an unsigned integer.
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
deleted file mode 100644
index cd283190855a..000000000000
--- a/Documentation/power/freezing-of-tasks.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Freezing of tasks
-	(C) 2007 Rafael J. Wysocki , GPL
-
-I. What is the freezing of tasks?
-
-The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).
-
-II. How does it work?
-
-There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
-and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have
-PF_NOFREEZE unset (all user space processes and some kernel threads) are
-regarded as 'freezable' and treated in a special way before the system enters a
-suspend state as well as before a hibernation image is created (in what follows
-we only consider hibernation, but the description also applies to suspend).
-
-Namely, as the first step of the hibernation procedure the function
-freeze_processes() (defined in kernel/power/process.c) is called.
A system-wide -variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate -whether the system is to undergo a freezing operation. And freeze_processes() -sets this variable. After this, it executes try_to_freeze_tasks() that sends a -fake signal to all user space processes, and wakes up all the kernel threads. -All freezable tasks must react to that by calling try_to_freeze(), which -results in a call to __refrigerator() (defined in kernel/freezer.c), which sets -the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes -it loop until PF_FROZEN is cleared for it. Then, we say that the task is -'frozen' and therefore the set of functions handling this mechanism is referred -to as 'the freezer' (these functions are defined in kernel/power/process.c, -kernel/freezer.c & include/linux/freezer.h). User space processes are generally -frozen before kernel threads. - -__refrigerator() must not be called directly. Instead, use the -try_to_freeze() function (defined in include/linux/freezer.h), that checks -if the task is to be frozen and makes the task enter __refrigerator(). - -For user space processes try_to_freeze() is called automatically from the -signal-handling code, but the freezable kernel threads need to call it -explicitly in suitable places or use the wait_event_freezable() or -wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) -that combine interruptible sleep with checking if the task is to be frozen and -calling try_to_freeze(). The main loop of a freezable kernel thread may look -like the following one: - - set_freezable(); - do { - hub_events(); - wait_event_freezable(khubd_wait, - !list_empty(&hub_event_list) || - kthread_should_stop()); - } while (!kthread_should_stop() || !list_empty(&hub_event_list)); - -(from drivers/usb/core/hub.c::hub_thread()). - -If a freezable kernel thread fails to call try_to_freeze() after the freezer has -initiated a freezing operation, the freezing of tasks will fail and the entire -hibernation operation will be cancelled. For this reason, freezable kernel -threads must call try_to_freeze() somewhere or use one of the -wait_event_freezable() and wait_event_freezable_timeout() macros. - -After the system memory state has been restored from a hibernation image and -devices have been reinitialized, the function thaw_processes() is called in -order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that -have been frozen leave __refrigerator() and continue running. - - -Rationale behind the functions dealing with freezing and thawing of tasks: -------------------------------------------------------------------------- - -freeze_processes(): - - freezes only userspace tasks - -freeze_kernel_threads(): - - freezes all tasks (including kernel threads) because we can't freeze - kernel threads without freezing userspace tasks - -thaw_kernel_threads(): - - thaws only kernel threads; this is particularly useful if we need to do - anything special in between thawing of kernel threads and thawing of - userspace tasks, or if we want to postpone the thawing of userspace tasks - -thaw_processes(): - - thaws all tasks (including kernel threads) because we can't thaw userspace - tasks without thawing kernel threads - - -III. Which kernel threads are freezable? - -Kernel threads are not freezable by default. However, a kernel thread may clear -PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE -directly is not allowed). 
From this point it is regarded as freezable -and must call try_to_freeze() in a suitable place. - -IV. Why do we do that? - -Generally speaking, there is a couple of reasons to use the freezing of tasks: - -1. The principal reason is to prevent filesystems from being damaged after -hibernation. At the moment we have no simple means of checkpointing -filesystems, so if there are any modifications made to filesystem data and/or -metadata on disks, we cannot bring them back to the state from before the -modifications. At the same time each hibernation image contains some -filesystem-related information that must be consistent with the state of the -on-disk data and metadata after the system memory state has been restored from -the image (otherwise the filesystems will be damaged in a nasty way, usually -making them almost impossible to repair). We therefore freeze tasks that might -cause the on-disk filesystems' data and metadata to be modified after the -hibernation image has been created and before the system is finally powered off. -The majority of these are user space processes, but if any of the kernel threads -may cause something like this to happen, they have to be freezable. - -2. Next, to create the hibernation image we need to free a sufficient amount of -memory (approximately 50% of available RAM) and we need to do that before -devices are deactivated, because we generally need them for swapping out. Then, -after the memory for the image has been freed, we don't want tasks to allocate -additional memory and we prevent them from doing that by freezing them earlier. -[Of course, this also means that device drivers should not allocate substantial -amounts of memory from their .suspend() callbacks before hibernation, but this -is a separate issue.] - -3. The third reason is to prevent user space processes and some kernel threads -from interfering with the suspending and resuming of devices. A user space -process running on a second CPU while we are suspending devices may, for -example, be troublesome and without the freezing of tasks we would need some -safeguards against race conditions that might occur in such a case. - -Although Linus Torvalds doesn't like the freezing of tasks, he said this in one -of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608): - -"RJW:> Why we freeze tasks at all or why we freeze kernel threads? - -Linus: In many ways, 'at all'. - -I _do_ realize the IO request queue issues, and that we cannot actually do -s2ram with some devices in the middle of a DMA. So we want to be able to -avoid *that*, there's no question about that. And I suspect that stopping -user threads and then waiting for a sync is practically one of the easier -ways to do so. - -So in practice, the 'at all' may become a 'why freeze kernel threads?' and -freezing user threads I don't find really objectionable." - -Still, there are kernel threads that may want to be freezable. For example, if -a kernel thread that belongs to a device driver accesses the device directly, it -in principle needs to know when the device is suspended, so that it doesn't try -to access it at that time. However, if the kernel thread is freezable, it will -be frozen before the driver's .suspend() callback is executed and it will be -thawed after the driver's .resume() callback has run, so it won't be accessing -the device while it's suspended. - -4. Another reason for freezing tasks is to prevent user space processes from -realizing that hibernation (or suspend) operation takes place. 
Ideally, user -space processes should not notice that such a system-wide operation has occurred -and should continue running without any problems after the restore (or resume -from suspend). Unfortunately, in the most general case this is quite difficult -to achieve without the freezing of tasks. Consider, for example, a process -that depends on all CPUs being online while it's running. Since we need to -disable nonboot CPUs during the hibernation, if this process is not frozen, it -may notice that the number of CPUs has changed and may start to work incorrectly -because of that. - -V. Are there any problems related to the freezing of tasks? - -Yes, there are. - -First of all, the freezing of kernel threads may be tricky if they depend one -on another. For example, if kernel thread A waits for a completion (in the -TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B -and B is frozen in the meantime, then A will be blocked until B is thawed, which -may be undesirable. That's why kernel threads are not freezable by default. - -Second, there are the following two problems related to the freezing of user -space processes: -1. Putting processes into an uninterruptible sleep distorts the load average. -2. Now that we have FUSE, plus the framework for doing device drivers in -userspace, it gets even more complicated because some userspace processes are -now doing the sorts of things that kernel threads do -(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html). - -The problem 1. seems to be fixable, although it hasn't been fixed so far. The -other one is more serious, but it seems that we can work around it by using -hibernation (and suspend) notifiers (in that case, though, we won't be able to -avoid the realization by the user space processes that the hibernation is taking -place). - -There are also problems that the freezing of tasks tends to expose, although -they are not directly related to it. For example, if request_firmware() is -called from a device driver's .resume() routine, it will timeout and eventually -fail, because the user land process that should respond to the request is frozen -at this point. So, seemingly, the failure is due to the freezing of tasks. -Suppose, however, that the firmware file is located on a filesystem accessible -only through another device that hasn't been resumed yet. In that case, -request_firmware() will fail regardless of whether or not the freezing of tasks -is used. Consequently, the problem is not really related to the freezing of -tasks, since it generally exists anyway. - -A driver must have all firmwares it may need in RAM before suspend() is called. -If keeping them is not practical, for example due to their size, they must be -requested early enough using the suspend notifier API described in -Documentation/driver-api/pm/notifiers.rst. - -VI. Are there any precautions to be taken to prevent freezing failures? - -Yes, there are. - -First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code -from system-wide sleep such as suspend/hibernation is not encouraged. -If possible, that piece of code must instead hook onto the suspend/hibernation -notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code -(kernel/cpu.c) for an example. 
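To make the notifier-based approach concrete, here is a minimal sketch; my_dev_pm_callback() and my_pm_nb are invented names, while register_pm_notifier() and the PM_* events are the actual interfaces declared in include/linux/suspend.h::

    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/notifier.h>
    #include <linux/suspend.h>

    /* Hypothetical handler: runs before tasks are frozen and after they
     * are thawed, so it can quiesce and reactivate the driver without
     * the driver's kernel thread having to be freezable. */
    static int my_dev_pm_callback(struct notifier_block *nb,
                                  unsigned long action, void *ptr)
    {
            switch (action) {
            case PM_HIBERNATION_PREPARE:
            case PM_SUSPEND_PREPARE:
                    /* stop activity; tasks are not frozen yet */
                    break;
            case PM_POST_HIBERNATION:
            case PM_POST_SUSPEND:
                    /* resume activity; tasks have been thawed */
                    break;
            }
            return NOTIFY_DONE;
    }

    static struct notifier_block my_pm_nb = {
            .notifier_call = my_dev_pm_callback,
    };

    static int __init my_dev_init(void)
    {
            return register_pm_notifier(&my_pm_nb);
    }
    module_init(my_dev_init);

Because the notifier is invoked by the suspend/hibernation core itself, before user space is frozen and after it is thawed, no additional locking against system-wide sleep is needed.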
-
-However, if that is not feasible and grabbing 'system_transition_mutex' is deemed
-necessary, do not call mutex_[un]lock(&system_transition_mutex) directly, since
-doing so can lead to freezing failures: if the suspend/hibernate code has already
-acquired 'system_transition_mutex', the other entity blocks on the lock in the
-TASK_UNINTERRUPTIBLE state. As a consequence, the freezer cannot freeze that
-task, and the freezing of tasks fails.
-
-The [un]lock_system_sleep() APIs, on the other hand, are safe to use in this
-scenario: they ask the freezer to skip freezing the calling task, since it is
-already "frozen enough" while blocked on 'system_transition_mutex', which is
-released only after the entire suspend/hibernation sequence is complete.
-To summarize, use [un]lock_system_sleep() instead of calling
-mutex_[un]lock(&system_transition_mutex) directly, and freezing failures of this
-kind will be avoided.
-
-VII. Miscellaneous
-/sys/power/pm_freeze_timeout controls how long, at most, the freezer may take to
-freeze all user space processes or all freezable kernel threads, in milliseconds.
-The default value is 20000; any unsigned integer value is accepted.
diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst
new file mode 100644
index 000000000000..20415f21e48a
--- /dev/null
+++ b/Documentation/power/index.rst
@@ -0,0 +1,46 @@
+:orphan:
+
+================
+Power Management
+================
+
+.. toctree::
+   :maxdepth: 1
+
+   apm-acpi
+   basic-pm-debugging
+   charger-manager
+   drivers-testing
+   energy-model
+   freezing-of-tasks
+   interface
+   opp
+   pci
+   pm_qos_interface
+   power_supply_class
+   runtime_pm
+   s2ram
+   suspend-and-cpuhotplug
+   suspend-and-interrupts
+   swsusp-and-swap-files
+   swsusp-dmcrypt
+   swsusp
+   video
+   tricks
+
+   userland-swsusp
+
+   powercap/powercap
+
+   regulator/consumer
+   regulator/design
+   regulator/machine
+   regulator/overview
+   regulator/regulator
+
+.. only:: subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/power/interface.rst b/Documentation/power/interface.rst
new file mode 100644
index 000000000000..8d270ed27228
--- /dev/null
+++ b/Documentation/power/interface.rst
@@ -0,0 +1,79 @@
+===========================================
+Power Management Interface for System Sleep
+===========================================
+
+Copyright (c) 2016 Intel Corp., Rafael J. Wysocki
+
+The power management subsystem provides userspace with a unified sysfs interface
+for system sleep regardless of the underlying system architecture or platform.
+The interface is located in the /sys/power/ directory (assuming that sysfs is
+mounted at /sys).
+
+/sys/power/state is the system sleep state control file.
+
+Reading from it returns a list of supported sleep states, encoded as:
+
+- 'freeze' (Suspend-to-Idle)
+- 'standby' (Power-On Suspend)
+- 'mem' (Suspend-to-RAM)
+- 'disk' (Suspend-to-Disk)
+
+Suspend-to-Idle is always supported. Suspend-to-Disk is also always supported,
+as long as the kernel has been configured to support hibernation at all
+(i.e. CONFIG_HIBERNATION is set in the kernel configuration file). Support
+for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
+platform.
+
+If one of the strings listed in /sys/power/state is written to it, the system
+will attempt to transition into the corresponding sleep state. Refer to
+Documentation/admin-guide/pm/sleep-states.rst for a description of each of
+those states.
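As a user space illustration of writing one of these strings (not part of the patch above; request_suspend() is a made-up helper, while /sys/power/state and the 'mem' string are exactly as documented)::

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Request Suspend-to-RAM by writing "mem" to /sys/power/state.
     * Needs sufficient privileges; the write does not return until the
     * system has resumed (or the transition failed with errno set). */
    static int request_suspend(const char *state)
    {
            int fd = open("/sys/power/state", O_WRONLY);
            int ret = 0;

            if (fd < 0) {
                    perror("open /sys/power/state");
                    return -1;
            }
            if (write(fd, state, strlen(state)) < 0) {
                    perror("sleep state transition failed");
                    ret = -1;
            }
            close(fd);
            return ret;
    }

    int main(void)
    {
            return request_suspend("mem") ? 1 : 0;
    }

Reading the same file first (with a plain read()) shows which of the strings above the running kernel actually supports.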
+ +/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk). +Specifically, it tells the kernel what to do after creating a hibernation image. + +Reading from it returns a list of supported options encoded as: + +- 'platform' (put the system into sleep using a platform-provided method) +- 'shutdown' (shut the system down) +- 'reboot' (reboot the system) +- 'suspend' (trigger a Suspend-to-RAM transition) +- 'test_resume' (resume-after-hibernation test mode) + +The currently selected option is printed in square brackets. + +The 'platform' option is only available if the platform provides a special +mechanism to put the system to sleep after creating a hibernation image (ACPI +does that, for example). The 'suspend' option is available if Suspend-to-RAM +is supported. Refer to Documentation/power/basic-pm-debugging.rst for the +description of the 'test_resume' option. + +To select an option, write the string representing it to /sys/power/disk. + +/sys/power/image_size controls the size of hibernation images. + +It can be written a string representing a non-negative integer that will be +used as a best-effort upper limit of the image size, in bytes. The hibernation +core will do its best to ensure that the image size will not exceed that number. +However, if that turns out to be impossible to achieve, a hibernation image will +still be created and its size will be as small as possible. In particular, +writing '0' to this file will enforce hibernation images to be as small as +possible. + +Reading from this file returns the current image size limit, which is set to +around 2/5 of available RAM by default. + +/sys/power/pm_trace controls the PM trace mechanism saving the last suspend +or resume event point in the RTC across reboots. + +It helps to debug hard lockups or reboots due to device driver failures that +occur during system suspend or resume (which is more common) more effectively. + +If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume +event point in turn will be stored in the RTC memory (overwriting the actual +RTC information), so it will survive a system crash if one occurs right after +storing it and it can be used later to identify the driver that caused the crash +to happen (see Documentation/power/s2ram.rst for more information). + +Initially it contains '0' which may be changed to '1' by writing a string +representing a nonzero integer into it. diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt deleted file mode 100644 index 27df7f98668a..000000000000 --- a/Documentation/power/interface.txt +++ /dev/null @@ -1,77 +0,0 @@ -Power Management Interface for System Sleep - -Copyright (c) 2016 Intel Corp., Rafael J. Wysocki - -The power management subsystem provides userspace with a unified sysfs interface -for system sleep regardless of the underlying system architecture or platform. -The interface is located in the /sys/power/ directory (assuming that sysfs is -mounted at /sys). - -/sys/power/state is the system sleep state control file. - -Reading from it returns a list of supported sleep states, encoded as: - -'freeze' (Suspend-to-Idle) -'standby' (Power-On Suspend) -'mem' (Suspend-to-RAM) -'disk' (Suspend-to-Disk) - -Suspend-to-Idle is always supported. Suspend-to-Disk is always supported -too as long the kernel has been configured to support hibernation at all -(ie. CONFIG_HIBERNATION is set in the kernel configuration file). 
Support -for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the -platform. - -If one of the strings listed in /sys/power/state is written to it, the system -will attempt to transition into the corresponding sleep state. Refer to -Documentation/admin-guide/pm/sleep-states.rst for a description of each of -those states. - -/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk). -Specifically, it tells the kernel what to do after creating a hibernation image. - -Reading from it returns a list of supported options encoded as: - -'platform' (put the system into sleep using a platform-provided method) -'shutdown' (shut the system down) -'reboot' (reboot the system) -'suspend' (trigger a Suspend-to-RAM transition) -'test_resume' (resume-after-hibernation test mode) - -The currently selected option is printed in square brackets. - -The 'platform' option is only available if the platform provides a special -mechanism to put the system to sleep after creating a hibernation image (ACPI -does that, for example). The 'suspend' option is available if Suspend-to-RAM -is supported. Refer to Documentation/power/basic-pm-debugging.txt for the -description of the 'test_resume' option. - -To select an option, write the string representing it to /sys/power/disk. - -/sys/power/image_size controls the size of hibernation images. - -It can be written a string representing a non-negative integer that will be -used as a best-effort upper limit of the image size, in bytes. The hibernation -core will do its best to ensure that the image size will not exceed that number. -However, if that turns out to be impossible to achieve, a hibernation image will -still be created and its size will be as small as possible. In particular, -writing '0' to this file will enforce hibernation images to be as small as -possible. - -Reading from this file returns the current image size limit, which is set to -around 2/5 of available RAM by default. - -/sys/power/pm_trace controls the PM trace mechanism saving the last suspend -or resume event point in the RTC across reboots. - -It helps to debug hard lockups or reboots due to device driver failures that -occur during system suspend or resume (which is more common) more effectively. - -If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume -event point in turn will be stored in the RTC memory (overwriting the actual -RTC information), so it will survive a system crash if one occurs right after -storing it and it can be used later to identify the driver that caused the crash -to happen (see Documentation/power/s2ram.txt for more information). - -Initially it contains '0' which may be changed to '1' by writing a string -representing a nonzero integer into it. diff --git a/Documentation/power/opp.rst b/Documentation/power/opp.rst new file mode 100644 index 000000000000..b3cf1def9dee --- /dev/null +++ b/Documentation/power/opp.rst @@ -0,0 +1,379 @@ +========================================== +Operating Performance Points (OPP) Library +========================================== + +(C) 2009-2010 Nishanth Menon , Texas Instruments Incorporated + +.. Contents + + 1. Introduction + 2. Initial OPP List Registration + 3. OPP Search Functions + 4. OPP Availability Control Functions + 5. OPP Data Retrieval Functions + 6. Data Structures + +1. Introduction +=============== + +1.1 What is an Operating Performance Point (OPP)? 
+-------------------------------------------------
+
+Today's complex SoCs consist of multiple sub-modules working in conjunction.
+In an operational system executing varied use cases, not all modules in the SoC
+need to function at their highest performing frequency all the time. To
+facilitate this, sub-modules in a SoC are grouped into domains, allowing some
+domains to run at lower voltage and frequency while other domains run at
+higher voltage/frequency pairs.
+
+The set of discrete tuples consisting of frequency and voltage pairs that
+the device will support per domain is called the set of Operating Performance
+Points, or OPPs.
+
+As an example:
+
+Let us consider an MPU device which supports the following:
+{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
+{1GHz at minimum voltage of 1.3V}
+
+We can represent these as three OPPs as the following {Hz, uV} tuples:
+
+- {300000000, 1000000}
+- {800000000, 1200000}
+- {1000000000, 1300000}
+
+1.2 Operating Performance Points Library
+----------------------------------------
+
+The OPP library provides a set of helper functions to organize and query the
+OPP information. The library is located in drivers/base/power/opp.c and the
+header is located in include/linux/pm_opp.h. The OPP library can be enabled by
+setting CONFIG_PM_OPP in the power management menuconfig menu. The OPP library
+depends on CONFIG_PM, as certain SoC frameworks, such as Texas Instruments'
+OMAP framework, allow the system to optionally boot at a certain OPP without
+needing cpufreq.
+
+Typical usage of the OPP library is as follows::
+
+ (users)        -> registers a set of default OPPs              -> (library)
+ SoC framework  -> modifies on required cases certain OPPs      -> OPP layer
+                -> queries to search/retrieve information       ->
+
+The OPP layer expects each domain to be represented by a unique device pointer.
+The SoC framework registers a set of initial OPPs per device with the OPP
+layer. This list is expected to be optimally small, typically around 5 OPPs per
+device. This initial list contains a set of OPPs that the framework expects to
+be safely enabled by default in the system.
+
+Note on OPP Availability
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+As the system proceeds to operate, the SoC framework may choose to make certain
+OPPs available or not available on each device based on various external
+factors. Example usage: Thermal management or other exceptional situations
+where the SoC framework might choose to disable a higher frequency OPP to
+safely continue operations until that OPP can be re-enabled, if possible.
+
+The OPP library supports this concept in its implementation. The following
+operational functions operate only on available opps:
+opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
+
+dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which
+can then be used for the dev_pm_opp_enable/disable functions to make an opp
+available as required.
+
+WARNING: Users of the OPP library should refresh their availability count using
+get_opp_count if the dev_pm_opp_enable/disable functions are invoked for a
+device. The exact mechanism to trigger these, or the notification mechanism to
+other dependent subsystems such as cpufreq, is left to the discretion of the
+SoC-specific framework that uses the OPP library. Similar care needs to be
+taken to refresh the cpufreq table after these operations.
+
+2. Initial OPP List Registration
+================================
+The SoC implementation calls the dev_pm_opp_add function iteratively to add
+OPPs per device. It is expected that the SoC framework will register the OPP
+entries optimally; typical numbers are less than 5 per device. The list
+generated by registering the OPPs is maintained by the OPP library throughout
+the device operation. The SoC framework can subsequently control the
+availability of the OPPs dynamically using the dev_pm_opp_enable / disable
+functions.
+
+dev_pm_opp_add
+    Add a new OPP for a specific domain represented by the device pointer.
+    The OPP is defined using the frequency and voltage. Once added, the OPP
+    is assumed to be available and control of its availability can be done
+    with the dev_pm_opp_enable/disable functions. The OPP library internally
+    stores and manages this information in the opp struct. This function may
+    be used by the SoC framework to define an optimal list as per the
+    demands of the SoC usage environment.
+
+    WARNING:
+        Do not use this function in interrupt context.
+
+    Example::
+
+      soc_pm_init()
+      {
+          /* Do things */
+          r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
+          if (r) {
+              pr_err("%s: unable to register mpu opp (%d)\n",
+                     __func__, r);
+              goto no_cpufreq;
+          }
+          /* Do cpufreq things */
+      no_cpufreq:
+          /* Do remaining things */
+      }
+
+3. OPP Search Functions
+=======================
+High-level frameworks such as cpufreq operate on frequencies. To map the
+frequency back to the corresponding OPP, the OPP library provides handy
+functions to search the OPP list that it internally manages. These search
+functions return the matching pointer representing the opp if a match is
+found, else return an error. These errors are expected to be handled by
+standard error checks such as IS_ERR() and appropriate actions taken by the
+caller.
+
+Callers of these functions shall call dev_pm_opp_put() after they have used the
+OPP. Otherwise the memory for the OPP will never get freed, resulting in a
+memory leak.
+
+dev_pm_opp_find_freq_exact
+    Search for an OPP based on an *exact* frequency and
+    availability. This function is especially useful to enable an OPP which
+    is not available by default.
+    Example: In a case when the SoC framework detects a situation where a
+    higher frequency could be made available, it can use this function to
+    find the OPP prior to calling dev_pm_opp_enable to actually make
+    it available::
+
+      opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+      /* don't operate on the pointer.. just do a sanity check.. */
+      if (IS_ERR(opp)) {
+          pr_err("frequency not disabled!\n");
+          /* trigger appropriate actions.. */
+      } else {
+          dev_pm_opp_enable(dev, 1000000000);
+          dev_pm_opp_put(opp);
+      }
+
+    NOTE:
+        This is the only search function that operates on OPPs which are
+        not available.
+
+dev_pm_opp_find_freq_floor
+    Search for an available OPP which is *at most* the
+    provided frequency. This function is useful while searching for a
+    lesser match OR operating on OPP information in the order of decreasing
+    frequency.
+    Example: To find the highest opp for a device::
+
+      freq = ULONG_MAX;
+      opp = dev_pm_opp_find_freq_floor(dev, &freq);
+      if (!IS_ERR(opp))
+          dev_pm_opp_put(opp);
+
+dev_pm_opp_find_freq_ceil
+    Search for an available OPP which is *at least* the
+    provided frequency. This function is useful while searching for a
+    higher match OR operating on OPP information in the order of increasing
+    frequency.
+    Example 1: To find the lowest opp for a device::
+
+      freq = 0;
+      opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+      if (!IS_ERR(opp))
+          dev_pm_opp_put(opp);
+
+    Example 2: A simplified implementation of a SoC cpufreq_driver->target::
+
+      soc_cpufreq_target(..)
+      {
+          /* Do stuff like policy checks etc. */
+          /* Find the best frequency match for the req */
+          opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+          if (!IS_ERR(opp)) {
+              dev_pm_opp_put(opp);
+              soc_switch_to_freq_voltage(freq);
+          } else {
+              /* do something when we can't satisfy the req */
+          }
+          /* do other stuff */
+      }
+
+4. OPP Availability Control Functions
+=====================================
+A default OPP list registered with the OPP library may not cater to all
+possible situations. The OPP library provides a set of functions to modify the
+availability of an OPP within the OPP list. This allows SoC frameworks to have
+fine-grained dynamic control of which sets of OPPs are operationally available.
+These functions are intended to *temporarily* remove an OPP in conditions such
+as thermal considerations (e.g. don't use OPPx until the temperature drops).
+
+WARNING:
+    Do not use these functions in interrupt context.
+
+dev_pm_opp_enable
+    Make an OPP available for operation.
+    Example: Let's say that the 1GHz OPP is to be made available only if the
+    SoC temperature is lower than a certain threshold. The SoC framework
+    implementation might choose to do something as follows::
+
+      if (cur_temp < temp_low_thresh) {
+          /* Enable 1GHz if it was disabled */
+          opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+          /* just error check */
+          if (!IS_ERR(opp)) {
+              ret = dev_pm_opp_enable(dev, 1000000000);
+              dev_pm_opp_put(opp);
+          } else {
+              goto try_something_else;
+          }
+      }
+
+dev_pm_opp_disable
+    Make an OPP unavailable for operation.
+    Example: Let's say that the 1GHz OPP is to be disabled if the
+    temperature exceeds a threshold value. The SoC framework implementation
+    might choose to do something as follows::
+
+      if (cur_temp > temp_high_thresh) {
+          /* Disable 1GHz if it was enabled */
+          opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
+          /* just error check */
+          if (!IS_ERR(opp)) {
+              ret = dev_pm_opp_disable(dev, 1000000000);
+              dev_pm_opp_put(opp);
+          } else {
+              goto try_something_else;
+          }
+      }
+
+5. OPP Data Retrieval Functions
+===============================
+Since the OPP library abstracts away the OPP information, a set of functions to
+pull information from the OPP structure is necessary. Once an OPP pointer is
+retrieved using the search functions, the following functions can be used by
+the SoC framework to retrieve the information represented inside the OPP layer.
+
+dev_pm_opp_get_voltage
+    Retrieve the voltage represented by the opp pointer.
+    Example: At a cpufreq transition to a different frequency, the SoC
+    framework needs to set the voltage represented by the OPP, using
+    the regulator framework, on the Power Management chip providing the
+    voltage::
+
+      soc_switch_to_freq_voltage(freq)
+      {
+          /* do things */
+          opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+          if (IS_ERR(opp))
+              return;
+          v = dev_pm_opp_get_voltage(opp);
+          dev_pm_opp_put(opp);
+          if (v)
+              regulator_set_voltage(.., v);
+          /* do other things */
+      }
+
+dev_pm_opp_get_freq
+    Retrieve the freq represented by the opp pointer.
+    Example: Let's say the SoC framework uses a couple of helper functions
+    through which we could pass opp pointers instead of additional
+    parameters to handle quite a bit of data::
+
+      soc_cpufreq_target(..)
+      {
+          /* do things.. */
+          max_freq = ULONG_MAX;
+          max_opp = dev_pm_opp_find_freq_floor(dev, &max_freq);
+          requested_opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+          if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
+              r = soc_test_validity(max_opp, requested_opp);
+          if (!IS_ERR(max_opp))
+              dev_pm_opp_put(max_opp);
+          if (!IS_ERR(requested_opp))
+              dev_pm_opp_put(requested_opp);
+          /* do other things */
+      }
+      soc_test_validity(..)
+      {
+          if (dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
+              return -EINVAL;
+          if (dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
+              return -EINVAL;
+          /* do things.. */
+      }
+
+dev_pm_opp_get_opp_count
+    Retrieve the number of available opps for a device.
+    Example: Let's say a co-processor in the SoC needs to know the available
+    frequencies in a table; the main processor can notify it as follows::
+
+      soc_notify_coproc_available_frequencies()
+      {
+          /* Do things */
+          num_available = dev_pm_opp_get_opp_count(dev);
+          speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
+          /* populate the table in increasing order */
+          freq = 0;
+          i = 0;
+          while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
+              speeds[i] = freq;
+              freq++;
+              i++;
+              dev_pm_opp_put(opp);
+          }
+
+          soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
+          /* Do other things */
+      }
+
+6. Data Structures
+==================
+Typically an SoC contains multiple voltage domains which are variable. Each
+domain is represented by a device pointer. The relationship to OPP can be
+represented as follows::
+
+ SoC
+  |- device 1
+  |     |- opp 1 (availability, freq, voltage)
+  |     |- opp 2 ..
+  ...   ...
+  |     `- opp n ..
+  |- device 2
+  ...
+  `- device m
+
+The OPP library maintains an internal list that the SoC framework populates and
+that is accessed by the various functions described above. However, the
+structures representing the actual OPPs and domains are internal to the OPP
+library itself, to allow for suitable abstraction reusable across systems.
+
+struct dev_pm_opp
+    The internal data structure of the OPP library which is used to
+    represent an OPP. In addition to the freq, voltage, and availability
+    information, it also contains internal bookkeeping information required
+    for the OPP library to operate on. A pointer to this structure is
+    provided back to users such as the SoC framework to be used as an
+    identifier for the OPP in interactions with the OPP layer.
+
+    WARNING:
+        The struct dev_pm_opp pointer should not be parsed or modified
+        by the users. The defaults for an instance are populated by
+        dev_pm_opp_add, but the availability of the OPP can be modified
+        by the dev_pm_opp_enable/disable functions.
+
+struct device
+    This is used to identify a domain to the OPP layer. The
+    nature of the device and its implementation is left to the user of
+    the OPP library, such as the SoC framework.
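Tying sections 2 through 5 together, a hedged end-to-end sketch follows; mpu_opp_demo(), mpu_dev and the example frequencies are made up, while the dev_pm_opp_* calls are the functions documented above::

    #include <linux/device.h>
    #include <linux/err.h>
    #include <linux/pm_opp.h>

    /* Hypothetical setup: register two OPPs for mpu_dev, then look up
     * the best OPP for a requested frequency and read its voltage. */
    static int mpu_opp_demo(struct device *mpu_dev)
    {
            struct dev_pm_opp *opp;
            unsigned long freq = 900000000; /* requested: 900 MHz */
            unsigned long volt;
            int ret;

            ret = dev_pm_opp_add(mpu_dev, 800000000, 1200000);
            if (ret)
                    return ret;
            ret = dev_pm_opp_add(mpu_dev, 1000000000, 1300000);
            if (ret)
                    return ret;

            /* Round the request up to the nearest available OPP (1 GHz). */
            opp = dev_pm_opp_find_freq_ceil(mpu_dev, &freq);
            if (IS_ERR(opp))
                    return PTR_ERR(opp);

            volt = dev_pm_opp_get_voltage(opp); /* 1300000 uV */
            dev_pm_opp_put(opp);                /* drop the reference */

            dev_info(mpu_dev, "chose %lu Hz at %lu uV\n", freq, volt);
            return 0;
    }

The resulting freq/volt pair is what platform code would program into the clock and regulator frameworks, as in the soc_switch_to_freq_voltage() example above.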
+ +Overall, in a simplistic view, the data structure operations is represented as +following:: + + Initialization / modification: + +-----+ /- dev_pm_opp_enable + dev_pm_opp_add --> | opp | <------- + | +-----+ \- dev_pm_opp_disable + \-------> domain_info(device) + + Search functions: + /-- dev_pm_opp_find_freq_ceil ---\ +-----+ + domain_info<---- dev_pm_opp_find_freq_exact -----> | opp | + \-- dev_pm_opp_find_freq_floor ---/ +-----+ + + Retrieval functions: + +-----+ /- dev_pm_opp_get_voltage + | opp | <--- + +-----+ \- dev_pm_opp_get_freq + + domain_info <- dev_pm_opp_get_opp_count diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt deleted file mode 100644 index 0c007e250cd1..000000000000 --- a/Documentation/power/opp.txt +++ /dev/null @@ -1,342 +0,0 @@ -Operating Performance Points (OPP) Library -========================================== - -(C) 2009-2010 Nishanth Menon , Texas Instruments Incorporated - -Contents --------- -1. Introduction -2. Initial OPP List Registration -3. OPP Search Functions -4. OPP Availability Control Functions -5. OPP Data Retrieval Functions -6. Data Structures - -1. Introduction -=============== -1.1 What is an Operating Performance Point (OPP)? - -Complex SoCs of today consists of a multiple sub-modules working in conjunction. -In an operational system executing varied use cases, not all modules in the SoC -need to function at their highest performing frequency all the time. To -facilitate this, sub-modules in a SoC are grouped into domains, allowing some -domains to run at lower voltage and frequency while other domains run at -voltage/frequency pairs that are higher. - -The set of discrete tuples consisting of frequency and voltage pairs that -the device will support per domain are called Operating Performance Points or -OPPs. - -As an example: -Let us consider an MPU device which supports the following: -{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V}, -{1GHz at minimum voltage of 1.3V} - -We can represent these as three OPPs as the following {Hz, uV} tuples: -{300000000, 1000000} -{800000000, 1200000} -{1000000000, 1300000} - -1.2 Operating Performance Points Library - -OPP library provides a set of helper functions to organize and query the OPP -information. The library is located in drivers/base/power/opp.c and the header -is located in include/linux/pm_opp.h. OPP library can be enabled by enabling -CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on -CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to -optionally boot at a certain OPP without needing cpufreq. - -Typical usage of the OPP library is as follows: -(users) -> registers a set of default OPPs -> (library) -SoC framework -> modifies on required cases certain OPPs -> OPP layer - -> queries to search/retrieve information -> - -OPP layer expects each domain to be represented by a unique device pointer. SoC -framework registers a set of initial OPPs per device with the OPP layer. This -list is expected to be an optimally small number typically around 5 per device. -This initial list contains a set of OPPs that the framework expects to be safely -enabled by default in the system. - -Note on OPP Availability: ------------------------- -As the system proceeds to operate, SoC framework may choose to make certain -OPPs available or not available on each device based on various external -factors. 
Example usage: Thermal management or other exceptional situations where -SoC framework might choose to disable a higher frequency OPP to safely continue -operations until that OPP could be re-enabled if possible. - -OPP library facilitates this concept in it's implementation. The following -operational functions operate only on available opps: -opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count - -dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then -be used for dev_pm_opp_enable/disable functions to make an opp available as required. - -WARNING: Users of OPP library should refresh their availability count using -get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the -exact mechanism to trigger these or the notification mechanism to other -dependent subsystems such as cpufreq are left to the discretion of the SoC -specific framework which uses the OPP library. Similar care needs to be taken -care to refresh the cpufreq table in cases of these operations. - -2. Initial OPP List Registration -================================ -The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per -device. It is expected that the SoC framework will register the OPP entries -optimally- typical numbers range to be less than 5. The list generated by -registering the OPPs is maintained by OPP library throughout the device -operation. The SoC framework can subsequently control the availability of the -OPPs dynamically using the dev_pm_opp_enable / disable functions. - -dev_pm_opp_add - Add a new OPP for a specific domain represented by the device pointer. - The OPP is defined using the frequency and voltage. Once added, the OPP - is assumed to be available and control of it's availability can be done - with the dev_pm_opp_enable/disable functions. OPP library internally stores - and manages this information in the opp struct. This function may be - used by SoC framework to define a optimal list as per the demands of - SoC usage environment. - - WARNING: Do not use this function in interrupt context. - - Example: - soc_pm_init() - { - /* Do things */ - r = dev_pm_opp_add(mpu_dev, 1000000, 900000); - if (!r) { - pr_err("%s: unable to register mpu opp(%d)\n", r); - goto no_cpufreq; - } - /* Do cpufreq things */ - no_cpufreq: - /* Do remaining things */ - } - -3. OPP Search Functions -======================= -High level framework such as cpufreq operates on frequencies. To map the -frequency back to the corresponding OPP, OPP library provides handy functions -to search the OPP list that OPP library internally manages. These search -functions return the matching pointer representing the opp if a match is -found, else returns error. These errors are expected to be handled by standard -error checks such as IS_ERR() and appropriate actions taken by the caller. - -Callers of these functions shall call dev_pm_opp_put() after they have used the -OPP. Otherwise the memory for the OPP will never get freed and result in -memleak. - -dev_pm_opp_find_freq_exact - Search for an OPP based on an *exact* frequency and - availability. This function is especially useful to enable an OPP which - is not available by default. - Example: In a case when SoC framework detects a situation where a - higher frequency could be made available, it can use this function to - find the OPP prior to call the dev_pm_opp_enable to actually make it available. 
- opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false); - dev_pm_opp_put(opp); - /* dont operate on the pointer.. just do a sanity check.. */ - if (IS_ERR(opp)) { - pr_err("frequency not disabled!\n"); - /* trigger appropriate actions.. */ - } else { - dev_pm_opp_enable(dev,1000000000); - } - - NOTE: This is the only search function that operates on OPPs which are - not available. - -dev_pm_opp_find_freq_floor - Search for an available OPP which is *at most* the - provided frequency. This function is useful while searching for a lesser - match OR operating on OPP information in the order of decreasing - frequency. - Example: To find the highest opp for a device: - freq = ULONG_MAX; - opp = dev_pm_opp_find_freq_floor(dev, &freq); - dev_pm_opp_put(opp); - -dev_pm_opp_find_freq_ceil - Search for an available OPP which is *at least* the - provided frequency. This function is useful while searching for a - higher match OR operating on OPP information in the order of increasing - frequency. - Example 1: To find the lowest opp for a device: - freq = 0; - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - dev_pm_opp_put(opp); - Example 2: A simplified implementation of a SoC cpufreq_driver->target: - soc_cpufreq_target(..) - { - /* Do stuff like policy checks etc. */ - /* Find the best frequency match for the req */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - dev_pm_opp_put(opp); - if (!IS_ERR(opp)) - soc_switch_to_freq_voltage(freq); - else - /* do something when we can't satisfy the req */ - /* do other stuff */ - } - -4. OPP Availability Control Functions -===================================== -A default OPP list registered with the OPP library may not cater to all possible -situation. The OPP library provides a set of functions to modify the -availability of a OPP within the OPP list. This allows SoC frameworks to have -fine grained dynamic control of which sets of OPPs are operationally available. -These functions are intended to *temporarily* remove an OPP in conditions such -as thermal considerations (e.g. don't use OPPx until the temperature drops). - -WARNING: Do not use these functions in interrupt context. - -dev_pm_opp_enable - Make a OPP available for operation. - Example: Lets say that 1GHz OPP is to be made available only if the - SoC temperature is lower than a certain threshold. The SoC framework - implementation might choose to do something as follows: - if (cur_temp < temp_low_thresh) { - /* Enable 1GHz if it was disabled */ - opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false); - dev_pm_opp_put(opp); - /* just error check */ - if (!IS_ERR(opp)) - ret = dev_pm_opp_enable(dev, 1000000000); - else - goto try_something_else; - } - -dev_pm_opp_disable - Make an OPP to be not available for operation - Example: Lets say that 1GHz OPP is to be disabled if the temperature - exceeds a threshold value. The SoC framework implementation might - choose to do something as follows: - if (cur_temp > temp_high_thresh) { - /* Disable 1GHz if it was enabled */ - opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true); - dev_pm_opp_put(opp); - /* just error check */ - if (!IS_ERR(opp)) - ret = dev_pm_opp_disable(dev, 1000000000); - else - goto try_something_else; - } - -5. OPP Data Retrieval Functions -=============================== -Since OPP library abstracts away the OPP information, a set of functions to pull -information from the OPP structure is necessary. 
Once an OPP pointer is -retrieved using the search functions, the following functions can be used by SoC -framework to retrieve the information represented inside the OPP layer. - -dev_pm_opp_get_voltage - Retrieve the voltage represented by the opp pointer. - Example: At a cpufreq transition to a different frequency, SoC - framework requires to set the voltage represented by the OPP using - the regulator framework to the Power Management chip providing the - voltage. - soc_switch_to_freq_voltage(freq) - { - /* do things */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - v = dev_pm_opp_get_voltage(opp); - dev_pm_opp_put(opp); - if (v) - regulator_set_voltage(.., v); - /* do other things */ - } - -dev_pm_opp_get_freq - Retrieve the freq represented by the opp pointer. - Example: Lets say the SoC framework uses a couple of helper functions - we could pass opp pointers instead of doing additional parameters to - handle quiet a bit of data parameters. - soc_cpufreq_target(..) - { - /* do things.. */ - max_freq = ULONG_MAX; - max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq); - requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq); - if (!IS_ERR(max_opp) && !IS_ERR(requested_opp)) - r = soc_test_validity(max_opp, requested_opp); - dev_pm_opp_put(max_opp); - dev_pm_opp_put(requested_opp); - /* do other things */ - } - soc_test_validity(..) - { - if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp)) - return -EINVAL; - if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp)) - return -EINVAL; - /* do things.. */ - } - -dev_pm_opp_get_opp_count - Retrieve the number of available opps for a device - Example: Lets say a co-processor in the SoC needs to know the available - frequencies in a table, the main processor can notify as following: - soc_notify_coproc_available_frequencies() - { - /* Do things */ - num_available = dev_pm_opp_get_opp_count(dev); - speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL); - /* populate the table in increasing order */ - freq = 0; - while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) { - speeds[i] = freq; - freq++; - i++; - dev_pm_opp_put(opp); - } - - soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available); - /* Do other things */ - } - -6. Data Structures -================== -Typically an SoC contains multiple voltage domains which are variable. Each -domain is represented by a device pointer. The relationship to OPP can be -represented as follows: -SoC - |- device 1 - | |- opp 1 (availability, freq, voltage) - | |- opp 2 .. - ... ... - | `- opp n .. - |- device 2 - ... - `- device m - -OPP library maintains a internal list that the SoC framework populates and -accessed by various functions as described above. However, the structures -representing the actual OPPs and domains are internal to the OPP library itself -to allow for suitable abstraction reusable across systems. - -struct dev_pm_opp - The internal data structure of OPP library which is used to - represent an OPP. In addition to the freq, voltage, availability - information, it also contains internal book keeping information required - for the OPP library to operate on. Pointer to this structure is - provided back to the users such as SoC framework to be used as a - identifier for OPP in the interactions with OPP layer. - - WARNING: The struct dev_pm_opp pointer should not be parsed or modified by the - users. 
The defaults of for an instance is populated by dev_pm_opp_add, but the - availability of the OPP can be modified by dev_pm_opp_enable/disable functions. - -struct device - This is used to identify a domain to the OPP layer. The - nature of the device and it's implementation is left to the user of - OPP library such as the SoC framework. - -Overall, in a simplistic view, the data structure operations is represented as -following: - -Initialization / modification: - +-----+ /- dev_pm_opp_enable -dev_pm_opp_add --> | opp | <------- - | +-----+ \- dev_pm_opp_disable - \-------> domain_info(device) - -Search functions: - /-- dev_pm_opp_find_freq_ceil ---\ +-----+ -domain_info<---- dev_pm_opp_find_freq_exact -----> | opp | - \-- dev_pm_opp_find_freq_floor ---/ +-----+ - -Retrieval functions: -+-----+ /- dev_pm_opp_get_voltage -| opp | <--- -+-----+ \- dev_pm_opp_get_freq - -domain_info <- dev_pm_opp_get_opp_count diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst new file mode 100644 index 000000000000..0e2ef7429304 --- /dev/null +++ b/Documentation/power/pci.rst @@ -0,0 +1,1135 @@ +==================== +PCI Power Management +==================== + +Copyright (c) 2010 Rafael J. Wysocki , Novell Inc. + +An overview of concepts and the Linux kernel's interfaces related to PCI power +management. Based on previous work by Patrick Mochel +(and others). + +This document only covers the aspects of power management specific to PCI +devices. For general description of the kernel's interfaces related to device +power management refer to Documentation/driver-api/pm/devices.rst and +Documentation/power/runtime_pm.rst. + +.. contents: + + 1. Hardware and Platform Support for PCI Power Management + 2. PCI Subsystem and Device Power Management + 3. PCI Device Drivers and Power Management + 4. Resources + + +1. Hardware and Platform Support for PCI Power Management +========================================================= + +1.1. Native and Platform-Based Power Management +----------------------------------------------- + +In general, power management is a feature allowing one to save energy by putting +devices into states in which they draw less power (low-power states) at the +price of reduced functionality or performance. + +Usually, a device is put into a low-power state when it is underutilized or +completely inactive. However, when it is necessary to use the device once +again, it has to be put back into the "fully functional" state (full-power +state). This may happen when there are some data for the device to handle or +as a result of an external event requiring the device to be active, which may +be signaled by the device itself. + +PCI devices may be put into low-power states in two ways, by using the device +capabilities introduced by the PCI Bus Power Management Interface Specification, +or with the help of platform firmware, such as an ACPI BIOS. In the first +approach, that is referred to as the native PCI power management (native PCI PM) +in what follows, the device power state is changed as a result of writing a +specific value into one of its standard configuration registers. The second +approach requires the platform firmware to provide special methods that may be +used by the kernel to change the device's power state. + +Devices supporting the native PCI PM usually can generate wakeup signals called +Power Management Events (PMEs) to let the kernel know about external events +requiring the device to be active. 
After receiving a PME the kernel is supposed +to put the device that sent it into the full-power state. However, the PCI Bus +Power Management Interface Specification doesn't define any standard method of +delivering the PME from the device to the CPU and the operating system kernel. +It is assumed that the platform firmware will perform this task and therefore, +even though a PCI device is set up to generate PMEs, it also may be necessary to +prepare the platform firmware for notifying the CPU of the PMEs coming from the +device (e.g. by generating interrupts). + +In turn, if the methods provided by the platform firmware are used for changing +the power state of a device, usually the platform also provides a method for +preparing the device to generate wakeup signals. In that case, however, it +often also is necessary to prepare the device for generating PMEs using the +native PCI PM mechanism, because the method provided by the platform depends on +that. + +Thus in many situations both the native and the platform-based power management +mechanisms have to be used simultaneously to obtain the desired result. + +1.2. Native PCI Power Management +-------------------------------- + +The PCI Bus Power Management Interface Specification (PCI PM Spec) was +introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a +standard interface for performing various operations related to power +management. + +The implementation of the PCI PM Spec is optional for conventional PCI devices, +but it is mandatory for PCI Express devices. If a device supports the PCI PM +Spec, it has an 8 byte power management capability field in its PCI +configuration space. This field is used to describe and control the standard +features related to the native PCI power management. + +The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses +(B0-B3). The higher the number, the less power is drawn by the device or bus +in that state. However, the higher the number, the longer the latency for +the device or bus to return to the full-power state (D0 or B0, respectively). + +There are two variants of the D3 state defined by the specification. The first +one is D3hot, referred to as the software accessible D3, because devices can be +programmed to go into it. The second one, D3cold, is the state that PCI devices +are in when the supply voltage (Vcc) is removed from them. It is not possible +to program a PCI device to go into D3cold, although there may be a programmable +interface for putting the bus the device is on into a state in which Vcc is +removed from all devices on the bus. + +PCI bus power management, however, is not supported by the Linux kernel at the +time of this writing and therefore it is not covered by this document. + +Note that every PCI device can be in the full-power state (D0) or in D3cold, +regardless of whether or not it implements the PCI PM Spec. In addition to +that, if the PCI PM Spec is implemented by the device, it must support D3hot +as well as D0. The support for the D1 and D2 power states is optional. + +PCI devices supporting the PCI PM Spec can be programmed to go to any of the +supported low-power states (except for D3cold). While in D1-D3hot the +standard configuration registers of the device must be accessible to software +(i.e. the device is required to respond to PCI configuration accesses), although +its I/O and memory spaces are then disabled. This allows the device to be +programmatically put into D0. 
Thus the kernel can switch the device back and +forth between D0 and the supported low-power states (except for D3cold) and the +possible power state transitions the device can undergo are the following: + ++----------------------------+ +| Current State | New State | ++----------------------------+ +| D0 | D1, D2, D3 | ++----------------------------+ +| D1 | D2, D3 | ++----------------------------+ +| D2 | D3 | ++----------------------------+ +| D1, D2, D3 | D0 | ++----------------------------+ + +The transition from D3cold to D0 occurs when the supply voltage is provided to +the device (i.e. power is restored). In that case the device returns to D0 with +a full power-on reset sequence and the power-on defaults are restored to the +device by hardware just as at initial power up. + +PCI devices supporting the PCI PM Spec can be programmed to generate PMEs +while in a low-power state (D1-D3), but they are not required to be capable +of generating PMEs from all supported low-power states. In particular, the +capability of generating PMEs from D3cold is optional and depends on the +presence of additional voltage (3.3Vaux) allowing the device to remain +sufficiently active to generate a wakeup signal. + +1.3. ACPI Device Power Management +--------------------------------- + +The platform firmware support for the power management of PCI devices is +system-specific. However, if the system in question is compliant with the +Advanced Configuration and Power Interface (ACPI) Specification, like the +majority of x86-based systems, it is supposed to implement device power +management interfaces defined by the ACPI standard. + +For this purpose the ACPI BIOS provides special functions called "control +methods" that may be executed by the kernel to perform specific tasks, such as +putting a device into a low-power state. These control methods are encoded +using special byte-code language called the ACPI Machine Language (AML) and +stored in the machine's BIOS. The kernel loads them from the BIOS and executes +them as needed using an AML interpreter that translates the AML byte code into +computations and memory or I/O space accesses. This way, in theory, a BIOS +writer can provide the kernel with a means to perform actions depending +on the system design in a system-specific fashion. + +ACPI control methods may be divided into global control methods, that are not +associated with any particular devices, and device control methods, that have +to be defined separately for each device supposed to be handled with the help of +the platform. This means, in particular, that ACPI device control methods can +only be used to handle devices that the BIOS writer knew about in advance. The +ACPI methods used for device power management fall into that category. + +The ACPI specification assumes that devices can be in one of four power states +labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM +D0-D3 states (although the difference between D3hot and D3cold is not taken +into account by ACPI). Moreover, for each power state of a device there is a +set of power resources that have to be enabled for the device to be put into +that state. These power resources are controlled (i.e. enabled or disabled) +with the help of their own control methods, _ON and _OFF, that have to be +defined individually for each of them. 
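Drivers normally do not drive this machinery (the native PCI PM registers or the ACPI methods) themselves; they ask the PCI core for a target power state and let it choose the mechanism. A minimal hedged sketch, with demo_d_state_cycle() being an invented helper while pci_set_power_state(), pci_save_state() and pci_restore_state() are the standard PCI core helpers::

    #include <linux/pci.h>

    /* Sketch: request a low-power state for a device we own, then bring
     * it back to full power. The PCI core picks the native PCI PM and/or
     * platform firmware mechanics behind these calls. */
    static void demo_d_state_cycle(struct pci_dev *pdev)
    {
            pci_save_state(pdev);                   /* save standard config registers */
            pci_set_power_state(pdev, PCI_D3hot);   /* D0 -> D3hot */

            /* ... device is idle ... */

            pci_set_power_state(pdev, PCI_D0);      /* back to full power */
            pci_restore_state(pdev);                /* restore config registers */
    }

Error handling is omitted for brevity.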
+ +To put a device into the ACPI power state Dx (where x is a number between 0 and +3 inclusive) the kernel is supposed to (1) enable the power resources required +by the device in this state using their _ON control methods and (2) execute the +_PSx control method defined for the device. In addition to that, if the device +is going to be put into a low-power state (D1-D3) and is supposed to generate +wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI +3.0) control method defined for it has to be executed before _PSx. Power +resources that are not required by the device in the target power state and are +not required any more by any other device should be disabled (by executing their +_OFF control methods). If the current power state of the device is D3, it can +only be put into D0 this way. + +However, quite often the power states of devices are changed during a +system-wide transition into a sleep state or back into the working state. ACPI +defines four system sleep states, S1, S2, S3, and S4, and denotes the system +working state as S0. In general, the target system sleep (or working) state +determines the highest power (lowest number) state the device can be put +into and the kernel is supposed to obtain this information by executing the +device's _SxD control method (where x is a number between 0 and 4 inclusive). +If the device is required to wake up the system from the target sleep state, the +lowest power (highest number) state it can be put into is also determined by the +target state of the system. The kernel is then supposed to use the device's +_SxW control method to obtain the number of that state. It also is supposed to +use the device's _PRW control method to learn which power resources need to be +enabled for the device to be able to generate wakeup signals. + +1.4. Wakeup Signaling +--------------------- + +Wakeup signals generated by PCI devices, either as native PCI PMEs, or as +a result of the execution of the _DSW (or _PSW) ACPI control method before +putting the device into a low-power state, have to be caught and handled as +appropriate. If they are sent while the system is in the working state +(ACPI S0), they should be translated into interrupts so that the kernel can +put the devices generating them into the full-power state and take care of the +events that triggered them. In turn, if they are sent while the system is +sleeping, they should cause the system's core logic to trigger wakeup. + +On ACPI-based systems wakeup signals sent by conventional PCI devices are +converted into ACPI General-Purpose Events (GPEs) which are hardware signals +from the system core logic generated in response to various events that need to +be acted upon. Every GPE is associated with one or more sources of potentially +interesting events. In particular, a GPE may be associated with a PCI device +capable of signaling wakeup. The information on the connections between GPEs +and event sources is recorded in the system's ACPI BIOS from where it can be +read by the kernel. + +If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE +associated with it (if there is one) is triggered. The GPEs associated with PCI +bridges may also be triggered in response to a wakeup signal from one of the +devices below the bridge (this also is the case for root bridges) and, for +example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be +handled this way. + +A GPE may be triggered when the system is sleeping (i.e. 
when it is in one of +the ACPI S1-S4 states), in which case system wakeup is started by its core logic +(the device that was the source of the signal causing the system wakeup to occur +may be identified later). The GPEs used in such situations are referred to as +wakeup GPEs. + +Usually, however, GPEs are also triggered when the system is in the working +state (ACPI S0) and in that case the system's core logic generates a System +Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI +handler identifies the GPE that caused the interrupt to be generated which, +in turn, allows the kernel to identify the source of the event (that may be +a PCI device signaling wakeup). The GPEs used for notifying the kernel of +events occurring while the system is in the working state are referred to as +runtime GPEs. + +Unfortunately, there is no standard way of handling wakeup signals sent by +conventional PCI devices on systems that are not ACPI-based, but there is one +for PCI Express devices. Namely, the PCI Express Base Specification introduced +a native mechanism for converting native PCI PMEs into interrupts generated by +root ports. For conventional PCI devices native PMEs are out-of-band, so they +are routed separately and they need not pass through bridges (in principle they +may be routed directly to the system's core logic), but for PCI Express devices +they are in-band messages that have to pass through the PCI Express hierarchy, +including the root port on the path from the device to the Root Complex. Thus +it was possible to introduce a mechanism by which a root port generates an +interrupt whenever it receives a PME message from one of the devices below it. +The PCI Express Requester ID of the device that sent the PME message is then +recorded in one of the root port's configuration registers from where it may be +read by the interrupt handler allowing the device to be identified. [PME +messages sent by PCI Express endpoints integrated with the Root Complex don't +pass through root ports, but instead they cause a Root Complex Event Collector +(if there is one) to generate interrupts.] + +In principle the native PCI Express PME signaling may also be used on ACPI-based +systems along with the GPEs, but to use it the kernel has to ask the system's +ACPI BIOS to release control of root port configuration registers. The ACPI +BIOS, however, is not required to allow the kernel to control these registers +and if it doesn't do that, the kernel must not modify their contents. Of course +the native PCI Express PME signaling cannot be used by the kernel in that case. + + +2. PCI Subsystem and Device Power Management +============================================ + +2.1. Device Power Management Callbacks +-------------------------------------- + +The PCI Subsystem participates in the power management of PCI devices in a +number of ways. First of all, it provides an intermediate code layer between +the device power management core (PM core) and PCI device drivers. 
+Specifically, the pm field of the PCI subsystem's struct bus_type object, +pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing +pointers to several device power management callbacks:: + + const struct dev_pm_ops pci_dev_pm_ops = { + .prepare = pci_pm_prepare, + .complete = pci_pm_complete, + .suspend = pci_pm_suspend, + .resume = pci_pm_resume, + .freeze = pci_pm_freeze, + .thaw = pci_pm_thaw, + .poweroff = pci_pm_poweroff, + .restore = pci_pm_restore, + .suspend_noirq = pci_pm_suspend_noirq, + .resume_noirq = pci_pm_resume_noirq, + .freeze_noirq = pci_pm_freeze_noirq, + .thaw_noirq = pci_pm_thaw_noirq, + .poweroff_noirq = pci_pm_poweroff_noirq, + .restore_noirq = pci_pm_restore_noirq, + .runtime_suspend = pci_pm_runtime_suspend, + .runtime_resume = pci_pm_runtime_resume, + .runtime_idle = pci_pm_runtime_idle, + }; + +These callbacks are executed by the PM core in various situations related to +device power management and they, in turn, execute power management callbacks +provided by PCI device drivers. They also perform power management operations +involving some standard configuration registers of PCI devices that device +drivers need not know or care about. + +The structure representing a PCI device, struct pci_dev, contains several fields +that these callbacks operate on:: + + struct pci_dev { + ... + pci_power_t current_state; /* Current operating state. */ + int pm_cap; /* PM capability offset in the + configuration space */ + unsigned int pme_support:5; /* Bitmask of states from which PME# + can be generated */ + unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */ + unsigned int d1_support:1; /* Low power state D1 is supported */ + unsigned int d2_support:1; /* Low power state D2 is supported */ + unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ + unsigned int wakeup_prepared:1; /* Device prepared for wake up */ + unsigned int d3_delay; /* D3->D0 transition time in ms */ + ... + }; + +They also indirectly use some fields of the struct device that is embedded in +struct pci_dev. + +2.2. Device Initialization +-------------------------- + +The PCI subsystem's first task related to device power management is to +prepare the device for power management and initialize the fields of struct +pci_dev used for this purpose. This happens in two functions defined in +drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). + +The first of these functions checks if the device supports native PCI PM +and if that's the case the offset of its power management capability structure +in the configuration space is stored in the pm_cap field of the device's struct +pci_dev object. Next, the function checks which PCI low-power states are +supported by the device and from which low-power states the device can generate +native PCI PMEs. The power management fields of the device's struct pci_dev and +the struct device embedded in it are updated accordingly and the generation of +PMEs by the device is disabled. + +The second function checks if the device can be prepared to signal wakeup with +the help of the platform firmware, such as the ACPI BIOS. If that is the case, +the function updates the wakeup fields in struct device embedded in the +device's struct pci_dev and uses the firmware-provided method to prevent the +device from signaling wakeup. + +At this point the device is ready for power management. 
For driverless devices, +however, this functionality is limited to a few basic operations carried out +during system-wide transitions to a sleep state and back to the working state. + +2.3. Runtime Device Power Management +------------------------------------ + +The PCI subsystem plays a vital role in the runtime power management of PCI +devices. For this purpose it uses the general runtime power management +(runtime PM) framework described in Documentation/power/runtime_pm.rst. +Namely, it provides subsystem-level callbacks:: + + pci_pm_runtime_suspend() + pci_pm_runtime_resume() + pci_pm_runtime_idle() + +that are executed by the core runtime PM routines. It also implements the +entire mechanics necessary for handling runtime wakeup signals from PCI devices +in low-power states, which at the time of this writing works for both the native +PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in +Section 1. + +First, a PCI device is put into a low-power state, or suspended, with the help +of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call +pci_pm_runtime_suspend() to do the actual job. For this to work, the device's +driver has to provide a pm->runtime_suspend() callback (see below), which is +run by pci_pm_runtime_suspend() as the first action. If the driver's callback +returns successfully, the device's standard configuration registers are saved, +the device is prepared to generate wakeup signals and, finally, it is put into +the target low-power state. + +The low-power state to put the device into is the lowest-power (highest number) +state from which it can signal wakeup. The exact method of signaling wakeup is +system-dependent and is determined by the PCI subsystem on the basis of the +reported capabilities of the device and the platform firmware. To prepare the +device for signaling wakeup and put it into the selected low-power state, the +PCI subsystem can use the platform firmware as well as the device's native PCI +PM capabilities, if supported. + +It is expected that the device driver's pm->runtime_suspend() callback will +not attempt to prepare the device for signaling wakeup or to put it into a +low-power state. The driver ought to leave these tasks to the PCI subsystem +that has all of the information necessary to perform them. + +A suspended device is brought back into the "active" state, or resumed, +with the help of pm_request_resume() or pm_runtime_resume() which both call +pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's +driver provides a pm->runtime_resume() callback (see below). However, before +the driver's callback is executed, pci_pm_runtime_resume() brings the device +back into the full-power state, prevents it from signaling wakeup while in that +state and restores its standard configuration registers. Thus the driver's +callback need not worry about the PCI-specific aspects of the device resume. + +Note that generally pci_pm_runtime_resume() may be called in two different +situations. First, it may be called at the request of the device's driver, for +example if there are some data for it to process. Second, it may be called +as a result of a wakeup signal from the device itself (this sometimes is +referred to as "remote wakeup"). Of course, for this purpose the wakeup signal +is handled in one of the ways described in Section 1 and finally converted into +a notification for the PCI subsystem after the source device has been +identified. 
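+
+As an illustration only, the runtime PM callbacks of a hypothetical driver
+(all foo_* names and the struct foo_priv type are invented for this sketch)
+might therefore be as simple as::
+
+	/* Quiesce the device only; the PCI subsystem arms wakeup and
+	 * selects the low-power state afterwards. */
+	static int foo_runtime_suspend(struct device *dev)
+	{
+		struct foo_priv *priv = pci_get_drvdata(to_pci_dev(dev));
+
+		foo_stop_dma(priv);	/* driver-specific quiescing (made up) */
+		return 0;
+	}
+
+	/* Reprogram the device only; it is already back in D0 with its
+	 * standard configuration registers restored when this runs. */
+	static int foo_runtime_resume(struct device *dev)
+	{
+		struct foo_priv *priv = pci_get_drvdata(to_pci_dev(dev));
+
+		foo_start_dma(priv);	/* driver-specific restart (made up) */
+		return 0;
+	}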
+
+The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
+and pm_request_idle(), executes the device driver's pm->runtime_idle()
+callback, if defined, and if that callback doesn't return an error code (or is
+not present at all), suspends the device with the help of pm_runtime_suspend().
+Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
+example, it is called right after the device has just been resumed), in which
+case it is expected to suspend the device if that makes sense.  Usually,
+however, the PCI subsystem doesn't know whether the device really can be
+suspended, so it lets the device's driver decide by running its
+pm->runtime_idle() callback.
+
+2.4. System-Wide Power Transitions
+----------------------------------
+
+There are a few different types of system-wide power transitions, described in
+Documentation/driver-api/pm/devices.rst.  Each of them requires devices to be
+handled in a specific way and the PM core executes subsystem-level power
+management callbacks for this purpose.  They are executed in phases such that
+each phase involves executing the same subsystem-level callback for every
+device belonging to the given subsystem before the next phase begins.  These
+phases always run after tasks have been frozen.
+
+2.4.1. System Suspend
+^^^^^^^^^^^^^^^^^^^^^
+
+When the system is going into a sleep state in which the contents of memory
+will be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
+
+	prepare, suspend, suspend_noirq.
+
+The following PCI bus type's callbacks, respectively, are used in these
+phases::
+
+	pci_pm_prepare()
+	pci_pm_suspend()
+	pci_pm_suspend_noirq()
+
+The pci_pm_prepare() routine first puts the device into the "fully functional"
+state with the help of pm_runtime_resume().  Then, it executes the device
+driver's pm->prepare() callback if defined (i.e. if the driver's struct
+dev_pm_ops object is present and the prepare pointer in that object is valid).
+
+The pci_pm_suspend() routine first checks if the device's driver implements
+legacy PCI suspend routines (see Section 3), in which case the driver's legacy
+suspend callback is executed, if present, and its result is returned.  Next,
+if the device's driver doesn't provide a struct dev_pm_ops object (containing
+pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
+simply turns off the device's bus master capability and runs
+pcibios_disable_device() to disable it, unless the device is a bridge (PCI
+bridges are ignored by this routine).  Next, the device driver's pm->suspend()
+callback is executed, if defined, and its result is returned if it fails.
+Finally, pci_fixup_device() is called to apply hardware suspend quirks related
+to the device if necessary.
+
+Note that the suspend phase is carried out asynchronously for PCI devices, so
+the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
+devices that don't depend on each other in a known way (i.e. none of the paths
+in the device tree from the root bridge to a leaf device contains both of
+them).
+
+The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
+been called, which means that the device driver's interrupt handler won't be
+invoked while this routine is running.  It first checks if the device's driver
+implements legacy PCI suspend routines (see Section 3), in which case the
+legacy late suspend routine is called and its result is returned (the standard
+configuration registers of the device are saved if the driver's callback
+hasn't done that).  Second, if the device driver's struct dev_pm_ops object is
+not present, the device's standard configuration registers are saved and the
+routine returns success.  Otherwise the device driver's pm->suspend_noirq()
+callback is executed, if present, and its result is returned if it fails.
+Next, if the device's standard configuration registers haven't been saved yet
+(one of the device driver's callbacks executed before might have done that),
+pci_pm_suspend_noirq() saves them, prepares the device to signal wakeup (if
+necessary) and puts it into a low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup while the system is in the target sleep
+state.  Just like in the runtime PM case described above, the mechanism of
+signaling wakeup is system-dependent and determined by the PCI subsystem, which
+is also responsible for preparing the device to signal wakeup from the system's
+target sleep state as appropriate.
+
+PCI device drivers (that don't implement legacy power management callbacks) are
+generally not expected to prepare devices for signaling wakeup or to put them
+into low-power states.  However, if one of the driver's suspend callbacks
+(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
+registers, pci_pm_suspend_noirq() will assume that the device has been prepared
+to signal wakeup and put into a low-power state by the driver (the driver is
+then assumed to have used the helper functions provided by the PCI subsystem for
+this purpose).  PCI device drivers are not encouraged to do that, but in some
+rare cases doing that in the driver may be the optimum approach.
+
+2.4.2. System Resume
+^^^^^^^^^^^^^^^^^^^^
+
+When the system is undergoing a transition from a sleep state in which the
+contents of memory have been preserved, such as one of the ACPI sleep states
+S1-S3, into the working state (ACPI S0), the phases are:
+
+	resume_noirq, resume, complete.
+
+The following PCI bus type's callbacks, respectively, are executed in these
+phases::
+
+	pci_pm_resume_noirq()
+	pci_pm_resume()
+	pci_pm_complete()
+
+The pci_pm_resume_noirq() routine first puts the device into the full-power
+state, restores its standard configuration registers and applies early resume
+hardware quirks related to the device, if necessary.  This is done
+unconditionally, regardless of whether or not the device's driver implements
+legacy PCI power management callbacks (this way all PCI devices are in the
+full-power state and their standard configuration registers have been restored
+when their interrupt handlers are invoked for the first time during resume,
+which allows the kernel to avoid problems with the handling of shared
+interrupts by drivers whose devices are still suspended).  If legacy PCI power
+management callbacks (see Section 3) are implemented by the device's driver,
+the legacy early resume callback is executed and its result is returned.
+Otherwise, the device driver's pm->resume_noirq() callback is executed, if
+defined, and its result is returned.
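+
+For example, because of the ordering just described, a driver's
+pm->resume_noirq() callback (rarely needed in practice, see Section 3.1.8) may
+safely touch the device's registers; a hypothetical sketch (the foo_* names
+and registers are made up)::
+
+	static int foo_resume_noirq(struct device *dev)
+	{
+		struct foo_priv *priv = pci_get_drvdata(to_pci_dev(dev));
+
+		/* The device is in D0 and its configuration space has been
+		 * restored by the PCI core at this point, so MMIO accesses
+		 * are safe even though interrupts are still disabled. */
+		writel(FOO_INT_DISABLE, priv->mmio + FOO_REG_INT_CTL);
+		return 0;
+	}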
+
+The pci_pm_resume() routine first checks if the device's standard configuration
+registers have been restored and restores them if that's not the case (this
+only is necessary in the error path during a failing suspend).  Next, resume
+hardware quirks related to the device are applied, if necessary, and if the
+device's driver implements legacy PCI power management callbacks (see
+Section 3), the driver's legacy resume callback is executed and its result is
+returned.  Otherwise, the device's wakeup signaling mechanisms are blocked and
+its driver's pm->resume() callback is executed, if defined (the callback's
+result is then returned).
+
+The resume phase is carried out asynchronously for PCI devices, like the
+suspend phase described above, which means that if two PCI devices don't depend
+on each other in a known way, the pci_pm_resume() routine may be executed for
+both of them in parallel.
+
+The pci_pm_complete() routine only executes the device driver's pm->complete()
+callback, if defined.
+
+2.4.3. System Hibernation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+System hibernation is more complicated than system suspend, because it requires
+a system image to be created and written into a persistent storage medium.  The
+image is created atomically and all devices are quiesced, or frozen, before
+that happens.
+
+The freezing of devices is carried out after enough memory has been freed (at
+the time of this writing the image creation requires at least 50% of system RAM
+to be free) in the following three phases:
+
+	prepare, freeze, freeze_noirq
+
+that correspond to the PCI bus type's callbacks::
+
+	pci_pm_prepare()
+	pci_pm_freeze()
+	pci_pm_freeze_noirq()
+
+This means that the prepare phase is exactly the same as for system suspend.
+The other two phases, however, are different.
+
+The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
+the device driver's pm->freeze() callback, if defined, instead of
+pm->suspend(), and it doesn't apply the suspend-related hardware quirks.  It
+is executed asynchronously for different PCI devices that don't depend on each
+other in a known way.
+
+The pci_pm_freeze_noirq() routine, in turn, is similar to
+pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
+routine instead of pm->suspend_noirq().  It also doesn't attempt to prepare
+the device for signaling wakeup or to put it into a low-power state.  Still,
+it saves the device's standard configuration registers if they haven't been
+saved by one of the driver's callbacks.
+
+Once the image has been created, it has to be saved.  However, at this point
+all devices are frozen and they cannot handle I/O, while their ability to
+handle I/O is obviously necessary for the image saving.  Thus they have to be
+brought back to the fully functional state and this is done in the following
+phases:
+
+	thaw_noirq, thaw, complete
+
+using the following PCI bus type's callbacks::
+
+	pci_pm_thaw_noirq()
+	pci_pm_thaw()
+	pci_pm_complete()
+
+respectively.
+
+The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
+but it doesn't put the device into the full power state and doesn't attempt to
+restore its standard configuration registers.  It also executes the device
+driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
+
+The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
+driver's pm->thaw() callback instead of pm->resume().  It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The complete phase is the same as for system resume.
+
+After saving the image, devices need to be powered down before the system can
+enter the target sleep state (ACPI S4 for ACPI-based systems).  This is done in
+three phases:
+
+	prepare, poweroff, poweroff_noirq
+
+where the prepare phase is exactly the same as for system suspend.  The other
+two phases are analogous to the suspend and suspend_noirq phases, respectively.
+The PCI subsystem-level callbacks they correspond to::
+
+	pci_pm_poweroff()
+	pci_pm_poweroff_noirq()
+
+work in analogy with pci_pm_suspend() and pci_pm_suspend_noirq(),
+respectively, although they don't attempt to save the device's standard
+configuration registers.
+
+2.4.4. System Restore
+^^^^^^^^^^^^^^^^^^^^^
+
+System restore requires a hibernation image to be loaded into memory and the
+pre-hibernation memory contents to be restored before the pre-hibernation
+system activity can be resumed.
+
+As described in Documentation/driver-api/pm/devices.rst, the hibernation image
+is loaded into memory by a fresh instance of the kernel, called the boot
+kernel, which in turn is loaded and run by a boot loader in the usual way.
+After the boot kernel has loaded the image, it needs to replace its own code
+and data with the code and data of the "hibernated" kernel stored within the
+image, called the image kernel.  For this purpose all devices are frozen just
+like before creating the image during hibernation, in the
+
+	prepare, freeze, freeze_noirq
+
+phases described above.  However, the devices affected by these phases are only
+those having drivers in the boot kernel; other devices will still be in
+whatever state the boot loader left them.
+
+Should the restoration of the pre-hibernation memory contents fail, the boot
+kernel would go through the "thawing" procedure described above, using the
+thaw_noirq, thaw, and complete phases (that will only affect the devices having
+drivers in the boot kernel), and then continue running normally.
+
+If the pre-hibernation memory contents are restored successfully, which is the
+usual situation, control is passed to the image kernel, which then becomes
+responsible for bringing the system back to the working state.  To achieve
+this, it must restore the devices' pre-hibernation functionality, which is
+done much like waking up from the memory sleep state, although it involves
+different phases:
+
+	restore_noirq, restore, complete
+
+The first two of these are analogous to the resume_noirq and resume phases
+described above, respectively, and correspond to the following PCI subsystem
+callbacks::
+
+	pci_pm_restore_noirq()
+	pci_pm_restore()
+
+These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
+respectively, but they execute the device driver's pm->restore_noirq() and
+pm->restore() callbacks, if available.
+
+The complete phase is carried out in exactly the same way as during system
+resume.
+
+
+3. PCI Device Drivers and Power Management
+==========================================
+
+3.1. Power Management Callbacks
+-------------------------------
+
+PCI device drivers participate in power management by providing callbacks to
+be executed by the PCI subsystem's power management routines described above
+and by controlling the runtime power management of their devices.
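+
+For orientation, a sketch of how such callbacks are typically wired up,
+using the recommended dev_pm_ops-based approach described below (all foo_*
+names are invented for this example)::
+
+	static const struct dev_pm_ops foo_pm_ops = {
+		SET_SYSTEM_SLEEP_PM_OPS(foo_suspend, foo_resume)
+		SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
+	};
+
+	static struct pci_driver foo_pci_driver = {
+		.name		= "foo",
+		.id_table	= foo_pci_ids,
+		.probe		= foo_probe,
+		.remove		= foo_remove,
+		.driver.pm	= &foo_pm_ops,
+	};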
+
+At the time of this writing there are two ways to define power management
+callbacks for a PCI device driver: the recommended one, based on using a
+dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and
+the "legacy" one, in which the .suspend(), .suspend_late(), .resume_early(),
+and .resume() callbacks from struct pci_driver are used.  The legacy approach,
+however, doesn't allow one to define runtime power management callbacks and is
+not really suitable for any new drivers.  Therefore it is not covered by this
+document (refer to the source code to learn more about it).
+
+It is recommended that all PCI device drivers define a struct dev_pm_ops object
+containing pointers to power management (PM) callbacks that will be executed by
+the PCI subsystem's PM routines in various circumstances.  A pointer to the
+driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
+its struct pci_driver object.  Once that has happened, the "legacy" PM callbacks
+in struct pci_driver are ignored (even if they are not NULL).
+
+The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
+defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
+subsystem will handle the device in a simplified default manner.  If they are
+defined, though, they are expected to behave as described in the following
+subsections.
+
+3.1.1. prepare()
+^^^^^^^^^^^^^^^^
+
+The prepare() callback is executed during system suspend, during hibernation
+(when a hibernation image is about to be created), during power-off after
+saving a hibernation image and during system restore, when a hibernation image
+has just been loaded into memory.
+
+This callback is only necessary if the driver's device has children that in
+general may be registered at any time.  In that case the role of the prepare()
+callback is to prevent new children of the device from being registered until
+one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
+
+In addition to that the prepare() callback may carry out some operations
+preparing the device to be suspended, although it should not allocate memory
+(if additional memory is required to suspend the device, it has to be
+preallocated earlier, for example in a suspend/hibernate notifier as described
+in Documentation/driver-api/pm/notifiers.rst).
+
+3.1.2. suspend()
+^^^^^^^^^^^^^^^^
+
+The suspend() callback is only executed during system suspend, after prepare()
+callbacks have been executed for all devices in the system.
+
+This callback is expected to quiesce the device and prepare it to be put into
+a low-power state by the PCI subsystem.  It is not required (in fact, it is
+not even recommended) that a PCI driver's suspend() callback save the standard
+configuration registers of the device, prepare it for waking up the system, or
+put it into a low-power state.  All of these operations can very well be taken
+care of by the PCI subsystem, without the driver's participation.
+
+However, in some rare cases it is convenient to carry out these operations in
+a PCI driver.  Then, pci_save_state(), pci_prepare_to_sleep(), and
+pci_set_power_state() should be used to save the device's standard
+configuration registers, to prepare it for system wakeup (if necessary), and
+to put it into a low-power state, respectively.  Moreover, if the driver calls
+pci_save_state(), the PCI subsystem will execute neither pci_prepare_to_sleep()
+nor pci_set_power_state() for its device, so the driver is then responsible
+for handling the device as appropriate.
+
+While the suspend() callback is being executed, the driver's interrupt handler
+can be invoked to handle an interrupt from the device, so all suspend-related
+operations relying on the driver's ability to handle interrupts should be
+carried out in this callback.
+
+3.1.3. suspend_noirq()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The suspend_noirq() callback is only executed during system suspend, after
+suspend() callbacks have been executed for all devices in the system and
+after device interrupts have been disabled by the PM core.
+
+The difference between suspend_noirq() and suspend() is that the driver's
+interrupt handler will not be invoked while suspend_noirq() is running.  Thus
+suspend_noirq() can carry out operations that would cause race conditions to
+arise if they were performed in suspend().
+
+3.1.4. freeze()
+^^^^^^^^^^^^^^^
+
+The freeze() callback is hibernation-specific and is executed in two
+situations: during hibernation, after prepare() callbacks have been executed
+for all devices in preparation for the creation of a system image, and during
+restore, after a system image has been loaded into memory from persistent
+storage and the prepare() callbacks have been executed for all devices.
+
+The role of this callback is analogous to the role of the suspend() callback
+described above.  In fact, they only need to be different in the rare cases
+when the driver takes responsibility for putting the device into a low-power
+state.
+
+In those cases the freeze() callback should not prepare the device for system
+wakeup or put it into a low-power state.  Still, either it or freeze_noirq()
+should save the device's standard configuration registers using
+pci_save_state().
+
+3.1.5. freeze_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The freeze_noirq() callback is hibernation-specific.  It is executed during
+hibernation, after prepare() and freeze() callbacks have been executed for all
+devices in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory and after prepare() and
+freeze() callbacks have been executed for all devices.  It is always executed
+after device interrupts have been disabled by the PM core.
+
+The role of this callback is analogous to the role of the suspend_noirq()
+callback described above and it very rarely is necessary to define
+freeze_noirq().
+
+The difference between freeze_noirq() and freeze() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.6. poweroff()
+^^^^^^^^^^^^^^^^^
+
+The poweroff() callback is hibernation-specific.  It is executed when the
+system is about to be powered off after saving a hibernation image to
+persistent storage.  prepare() callbacks are executed for all devices before
+poweroff() is called.
+
+The role of this callback is analogous to the role of the suspend() and
+freeze() callbacks described above, although it does not need to save the
+contents of the device's registers.  In particular, if the driver wants to put
+the device into a low-power state itself instead of allowing the PCI subsystem
+to do that, the poweroff() callback should use pci_prepare_to_sleep() and
+pci_set_power_state() to prepare the device for system wakeup and to put it
+into a low-power state, respectively, but it need not save the device's
+standard configuration registers.
+
+3.1.7. poweroff_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The poweroff_noirq() callback is hibernation-specific.  It is executed after
+poweroff() callbacks have been executed for all devices in the system.
+
+The role of this callback is analogous to the role of the suspend_noirq() and
+freeze_noirq() callbacks described above, but it does not need to save the
+contents of the device's registers.
+
+The difference between poweroff_noirq() and poweroff() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.8. resume_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The resume_noirq() callback is only executed during system resume, after the
+PM core has enabled the non-boot CPUs.  The driver's interrupt handler will
+not be invoked while resume_noirq() is running, so this callback can carry out
+operations that might race with the interrupt handler.
+
+Since the PCI subsystem unconditionally puts all devices into the full power
+state in the resume_noirq phase of system resume and restores their standard
+configuration registers, resume_noirq() is usually not necessary.  In general
+it should only be used for performing operations that would lead to race
+conditions if carried out by resume().
+
+3.1.9. resume()
+^^^^^^^^^^^^^^^
+
+The resume() callback is only executed during system resume, after
+resume_noirq() callbacks have been executed for all devices in the system and
+device interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-suspend configuration of
+the device and bringing it back to the fully functional state.  The device
+should be able to process I/O in the usual way after resume() has returned.
+
+3.1.10. thaw_noirq()
+^^^^^^^^^^^^^^^^^^^^
+
+The thaw_noirq() callback is hibernation-specific.  It is executed after a
+system image has been created and the non-boot CPUs have been enabled by the
+PM core, in the thaw_noirq phase of hibernation.  It also may be executed if
+the loading of a hibernation image fails during system restore (it is then
+executed after enabling the non-boot CPUs).  The driver's interrupt handler
+will not be invoked while thaw_noirq() is running.
+
+The role of this callback is analogous to the role of resume_noirq().  The
+difference between these two callbacks is that thaw_noirq() is executed after
+freeze() and freeze_noirq(), so in general it does not need to modify the
+contents of the device's registers.
+
+3.1.11. thaw()
+^^^^^^^^^^^^^^
+
+The thaw() callback is hibernation-specific.  It is executed after thaw_noirq()
+callbacks have been executed for all devices in the system and after device
+interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-freeze configuration of
+the device, so that it will work in the usual way after thaw() has returned.
+
+3.1.12. restore_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The restore_noirq() callback is hibernation-specific.  It is executed in the
+restore_noirq phase of hibernation, when the boot kernel has passed control to
+the image kernel and the non-boot CPUs have been enabled by the image kernel's
+PM core.
+ +This callback is analogous to resume_noirq() with the exception that it cannot +make any assumption on the previous state of the device, even if the BIOS (or +generally the platform firmware) is known to preserve that state over a +suspend-resume cycle. + +For the vast majority of PCI device drivers there is no difference between +resume_noirq() and restore_noirq(). + +3.1.13. restore() +^^^^^^^^^^^^^^^^^ + +The restore() callback is hibernation-specific. It is executed after +restore_noirq() callbacks have been executed for all devices in the system and +after the PM core has enabled device drivers' interrupt handlers to be invoked. + +This callback is analogous to resume(), just like restore_noirq() is analogous +to resume_noirq(). Consequently, the difference between restore_noirq() and +restore() is analogous to the difference between resume_noirq() and resume(). + +For the vast majority of PCI device drivers there is no difference between +resume() and restore(). + +3.1.14. complete() +^^^^^^^^^^^^^^^^^^ + +The complete() callback is executed in the following situations: + + - during system resume, after resume() callbacks have been executed for all + devices, + - during hibernation, before saving the system image, after thaw() callbacks + have been executed for all devices, + - during system restore, when the system is going back to its pre-hibernation + state, after restore() callbacks have been executed for all devices. + +It also may be executed if the loading of a hibernation image into memory fails +(in that case it is run after thaw() callbacks have been executed for all +devices that have drivers in the boot kernel). + +This callback is entirely optional, although it may be necessary if the +prepare() callback performs operations that need to be reversed. + +3.1.15. runtime_suspend() +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_suspend() callback is specific to device runtime power management +(runtime PM). It is executed by the PM core's runtime PM framework when the +device is about to be suspended (i.e. quiesced and put into a low-power state) +at run time. + +This callback is responsible for freezing the device and preparing it to be +put into a low-power state, but it must allow the PCI subsystem to perform all +of the PCI-specific actions necessary for suspending the device. + +3.1.16. runtime_resume() +^^^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_resume() callback is specific to device runtime PM. It is executed +by the PM core's runtime PM framework when the device is about to be resumed +(i.e. put into the full-power state and programmed to process I/O normally) at +run time. + +This callback is responsible for restoring the normal functionality of the +device after it has been put into the full-power state by the PCI subsystem. +The device is expected to be able to process I/O in the usual way after +runtime_resume() has returned. + +3.1.17. runtime_idle() +^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_idle() callback is specific to device runtime PM. It is executed +by the PM core's runtime PM framework whenever it may be desirable to suspend +the device according to the PM core's information. In particular, it is +automatically executed right after runtime_resume() has returned in case the +resume of the device has happened as a result of a spurious event. 
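+
+A minimal sketch of what such a callback might look like (the foo_* names are
+made up for this example)::
+
+	static int foo_runtime_idle(struct device *dev)
+	{
+		struct foo_priv *priv = pci_get_drvdata(to_pci_dev(dev));
+
+		/* A non-zero return value blocks the suspend. */
+		if (foo_has_pending_work(priv))
+			return -EBUSY;
+
+		/* Returning 0 lets the PCI subsystem suspend the device. */
+		return 0;
+	}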
+
+The runtime_idle() callback is optional; if it is not implemented, or if it
+returns 0, the PCI subsystem will call pm_runtime_suspend() for the device,
+which in turn will cause the driver's runtime_suspend() callback to be
+executed.
+
+3.1.18. Pointing Multiple Callback Pointers to One Routine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Although in principle each of the callbacks described in the previous
+subsections can be defined as a separate function, it often is convenient to
+point two or more members of struct dev_pm_ops to the same routine.  There are
+a few convenience macros that can be used for this purpose.
+
+The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
+suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
+members and one resume routine pointed to by the .resume(), .thaw(), and
+.restore() members.  The other function pointers in this struct dev_pm_ops are
+unset.
+
+The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
+additionally sets the .runtime_resume() pointer to the same value as
+.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
+the same value as .suspend() (and .freeze() and .poweroff()).
+
+The SET_SYSTEM_SLEEP_PM_OPS macro can be used inside of a declaration of
+struct dev_pm_ops to indicate that one suspend routine is to be pointed to by
+the .suspend(), .freeze(), and .poweroff() members and one resume routine is
+to be pointed to by the .resume(), .thaw(), and .restore() members.
+
+3.1.19. Driver Flags for Power Management
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The PM core allows device drivers to set flags that influence the handling of
+power management for the devices by the core itself and by middle layer code
+including the PCI bus type.  The flags should be set once at the driver probe
+time with the help of the dev_pm_set_driver_flags() function and they should
+not be updated directly afterwards.
+
+The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the
+direct-complete mechanism, which allows device suspend/resume callbacks to be
+skipped if the device is in runtime suspend when the system suspend starts.
+That also affects all of the ancestors of the device, so this flag should only
+be used if absolutely necessary.
+
+The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
+positive value from pci_pm_prepare() if the ->prepare callback provided by the
+driver of the device returns a positive value.  That allows the driver to opt
+out from using the direct-complete mechanism dynamically.
+
+The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
+perspective the device can be safely left in runtime suspend during system
+suspend.  That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
+to skip resuming the device from runtime suspend unless there are PCI-specific
+reasons for doing that.  Also, it causes pci_pm_suspend_late/noirq(),
+pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
+if the device remains in runtime suspend at the beginning of the "late" phase
+of the system-wide transition under way.  Moreover, if the device is in
+runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its
+runtime power management status will be changed to "active" (as it is going to
+be put into D0 going forward), but if it is in runtime suspend in
+pci_pm_thaw_noirq(), the function will set the power.direct_complete flag for
+it (to make the PM core skip the subsequent "thaw" callbacks for it) and
+return.
+
+Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
+device to be left in suspend after system-wide transitions to the working
+state.  This flag is checked by the PM core, but the PCI bus type informs the
+PM core which devices may be left in suspend from its perspective (that
+happens during the "noirq" phase of system-wide suspend and analogous
+transitions).  Next, it uses the dev_pm_may_skip_resume() helper to decide
+whether or not to return from pci_pm_resume_noirq() early: the PM core will
+skip the remaining resume callbacks for the device during the transition under
+way and will set its runtime PM status to "suspended" if
+dev_pm_may_skip_resume() returns "true" for it.
+
+3.2. Device Runtime Power Management
+------------------------------------
+
+In addition to providing device power management callbacks PCI device drivers
+are responsible for controlling the runtime power management (runtime PM) of
+their devices.
+
+The PCI device runtime PM is optional, but it is recommended that PCI device
+drivers implement it at least in the cases where there is a reliable way of
+verifying that the device is not used (like when the network cable is detached
+from an Ethernet adapter or there are no devices attached to a USB controller).
+
+To support the PCI runtime PM the driver first needs to implement the
+runtime_suspend() and runtime_resume() callbacks.  It also may need to
+implement the runtime_idle() callback to prevent the device from being
+suspended again every time right after the runtime_resume() callback has
+returned (alternatively, the runtime_suspend() callback will have to check if
+the device should really be suspended and return -EAGAIN if that is not the
+case).
+
+The runtime PM of PCI devices is enabled by default by the PCI core.  PCI
+device drivers do not need to enable it and should not attempt to do so.
+However, it is blocked by pci_pm_init(), which runs the pm_runtime_forbid()
+helper function.  In addition to that, the runtime PM usage counter of
+each PCI device is incremented by local_pci_probe() before executing the
+probe callback provided by the device's driver.
+
+If a PCI driver implements the runtime PM callbacks and intends to use the
+runtime PM framework provided by the PM core and the PCI subsystem, it needs
+to decrement the device's runtime PM usage counter in its probe callback
+function.  If it doesn't do that, the counter will always be different from
+zero for the device and it will never be runtime-suspended.  The simplest
+way to do that is by calling pm_runtime_put_noidle(), but if the driver
+wants to schedule an autosuspend right away, for example, it may call
+pm_runtime_put_autosuspend() instead for this purpose.  Generally, it
+just needs to call a function that decrements the device's usage counter
+from its probe routine to make runtime PM work for the device, as shown in
+the sketch below.
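+
+A sketch of a probe routine doing that (the foo_* names are made up; the
+dev_pm_set_driver_flags() line is optional and only included as an example of
+setting driver flags at probe time, as described in Section 3.1.19)::
+
+	static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+	{
+		int error;
+
+		error = foo_init_device(pdev);	/* driver-specific setup (made up) */
+		if (error)
+			return error;
+
+		dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_SUSPEND);
+
+		/* Balance the usage counter incremented by local_pci_probe()
+		 * so that the device can be runtime-suspended later. */
+		pm_runtime_put_noidle(&pdev->dev);
+		return 0;
+	}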
+
+It is important to remember that the driver's runtime_suspend() callback
+may be executed right after the usage counter has been decremented, because
+user space may already have used sysfs to run the pm_runtime_allow() helper
+function, which unblocks the runtime PM of the device, so the driver must
+be prepared to cope with that.
+
+The driver itself should not call pm_runtime_allow(), though.  Instead, it
+should let user space or some platform-specific code do that (user space can
+do it via sysfs as stated above), but it must be prepared to handle the
+runtime PM of the device correctly as soon as pm_runtime_allow() is called
+(which may happen at any time, even before the driver is loaded).
+
+When the driver's remove callback runs, it has to balance the decrement of
+the device's runtime PM usage counter made at probe time.  For this reason,
+if it has decremented the counter in its probe callback, it must run
+pm_runtime_get_noresume() in its remove callback.  [Since the core carries
+out a runtime resume of the device and bumps up the device's usage counter
+before running the driver's remove callback, the runtime PM of the device
+is effectively disabled for the duration of the remove execution and all
+runtime PM helper functions incrementing the device's usage counter are
+then effectively equivalent to pm_runtime_get_noresume().]
+
+The runtime PM framework works by processing requests to suspend or resume
+devices, or to check if they are idle (in which case it is reasonable to
+subsequently request that they be suspended).  These requests are represented
+by work items put into the power management workqueue, pm_wq.  Although there
+are a few situations in which power management requests are automatically
+queued by the PM core (for example, after processing a request to resume a
+device the PM core automatically queues a request to check if the device is
+idle), device drivers are generally responsible for queuing power management
+requests for their devices.  For this purpose they should use the runtime PM
+helper functions provided by the PM core, discussed in
+Documentation/power/runtime_pm.rst.
+
+Devices can also be suspended and resumed synchronously, without placing a
+request into pm_wq.  In the majority of cases this also is done by their
+drivers, using helper functions provided by the PM core for this purpose.
+
+For more information on the runtime PM of devices refer to
+Documentation/power/runtime_pm.rst.
+
+
+4. Resources
+============
+
+PCI Local Bus Specification, Rev. 3.0
+
+PCI Bus Power Management Interface Specification, Rev. 1.2
+
+Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
+
+PCI Express Base Specification, Rev. 2.0
+
+Documentation/driver-api/pm/devices.rst
+
+Documentation/power/runtime_pm.rst
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
deleted file mode 100644
index 8eaf9ee24d43..000000000000
--- a/Documentation/power/pci.txt
+++ /dev/null
@@ -1,1094 +0,0 @@
-PCI Power Management
-
-Copyright (c) 2010 Rafael J. Wysocki, Novell Inc.
-
-An overview of concepts and the Linux kernel's interfaces related to PCI power
-management.  Based on previous work by Patrick Mochel
-(and others).
-
-This document only covers the aspects of power management specific to PCI
-devices.  For general description of the kernel's interfaces related to device
-power management refer to Documentation/driver-api/pm/devices.rst and
-Documentation/power/runtime_pm.txt.
- ---------------------------------------------------------------------------- - -1. Hardware and Platform Support for PCI Power Management -2. PCI Subsystem and Device Power Management -3. PCI Device Drivers and Power Management -4. Resources - - -1. Hardware and Platform Support for PCI Power Management -========================================================= - -1.1. Native and Platform-Based Power Management ------------------------------------------------ -In general, power management is a feature allowing one to save energy by putting -devices into states in which they draw less power (low-power states) at the -price of reduced functionality or performance. - -Usually, a device is put into a low-power state when it is underutilized or -completely inactive. However, when it is necessary to use the device once -again, it has to be put back into the "fully functional" state (full-power -state). This may happen when there are some data for the device to handle or -as a result of an external event requiring the device to be active, which may -be signaled by the device itself. - -PCI devices may be put into low-power states in two ways, by using the device -capabilities introduced by the PCI Bus Power Management Interface Specification, -or with the help of platform firmware, such as an ACPI BIOS. In the first -approach, that is referred to as the native PCI power management (native PCI PM) -in what follows, the device power state is changed as a result of writing a -specific value into one of its standard configuration registers. The second -approach requires the platform firmware to provide special methods that may be -used by the kernel to change the device's power state. - -Devices supporting the native PCI PM usually can generate wakeup signals called -Power Management Events (PMEs) to let the kernel know about external events -requiring the device to be active. After receiving a PME the kernel is supposed -to put the device that sent it into the full-power state. However, the PCI Bus -Power Management Interface Specification doesn't define any standard method of -delivering the PME from the device to the CPU and the operating system kernel. -It is assumed that the platform firmware will perform this task and therefore, -even though a PCI device is set up to generate PMEs, it also may be necessary to -prepare the platform firmware for notifying the CPU of the PMEs coming from the -device (e.g. by generating interrupts). - -In turn, if the methods provided by the platform firmware are used for changing -the power state of a device, usually the platform also provides a method for -preparing the device to generate wakeup signals. In that case, however, it -often also is necessary to prepare the device for generating PMEs using the -native PCI PM mechanism, because the method provided by the platform depends on -that. - -Thus in many situations both the native and the platform-based power management -mechanisms have to be used simultaneously to obtain the desired result. - -1.2. Native PCI Power Management --------------------------------- -The PCI Bus Power Management Interface Specification (PCI PM Spec) was -introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a -standard interface for performing various operations related to power -management. - -The implementation of the PCI PM Spec is optional for conventional PCI devices, -but it is mandatory for PCI Express devices. 
If a device supports the PCI PM -Spec, it has an 8 byte power management capability field in its PCI -configuration space. This field is used to describe and control the standard -features related to the native PCI power management. - -The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses -(B0-B3). The higher the number, the less power is drawn by the device or bus -in that state. However, the higher the number, the longer the latency for -the device or bus to return to the full-power state (D0 or B0, respectively). - -There are two variants of the D3 state defined by the specification. The first -one is D3hot, referred to as the software accessible D3, because devices can be -programmed to go into it. The second one, D3cold, is the state that PCI devices -are in when the supply voltage (Vcc) is removed from them. It is not possible -to program a PCI device to go into D3cold, although there may be a programmable -interface for putting the bus the device is on into a state in which Vcc is -removed from all devices on the bus. - -PCI bus power management, however, is not supported by the Linux kernel at the -time of this writing and therefore it is not covered by this document. - -Note that every PCI device can be in the full-power state (D0) or in D3cold, -regardless of whether or not it implements the PCI PM Spec. In addition to -that, if the PCI PM Spec is implemented by the device, it must support D3hot -as well as D0. The support for the D1 and D2 power states is optional. - -PCI devices supporting the PCI PM Spec can be programmed to go to any of the -supported low-power states (except for D3cold). While in D1-D3hot the -standard configuration registers of the device must be accessible to software -(i.e. the device is required to respond to PCI configuration accesses), although -its I/O and memory spaces are then disabled. This allows the device to be -programmatically put into D0. Thus the kernel can switch the device back and -forth between D0 and the supported low-power states (except for D3cold) and the -possible power state transitions the device can undergo are the following: - -+----------------------------+ -| Current State | New State | -+----------------------------+ -| D0 | D1, D2, D3 | -+----------------------------+ -| D1 | D2, D3 | -+----------------------------+ -| D2 | D3 | -+----------------------------+ -| D1, D2, D3 | D0 | -+----------------------------+ - -The transition from D3cold to D0 occurs when the supply voltage is provided to -the device (i.e. power is restored). In that case the device returns to D0 with -a full power-on reset sequence and the power-on defaults are restored to the -device by hardware just as at initial power up. - -PCI devices supporting the PCI PM Spec can be programmed to generate PMEs -while in a low-power state (D1-D3), but they are not required to be capable -of generating PMEs from all supported low-power states. In particular, the -capability of generating PMEs from D3cold is optional and depends on the -presence of additional voltage (3.3Vaux) allowing the device to remain -sufficiently active to generate a wakeup signal. - -1.3. ACPI Device Power Management ---------------------------------- -The platform firmware support for the power management of PCI devices is -system-specific. 
However, if the system in question is compliant with the -Advanced Configuration and Power Interface (ACPI) Specification, like the -majority of x86-based systems, it is supposed to implement device power -management interfaces defined by the ACPI standard. - -For this purpose the ACPI BIOS provides special functions called "control -methods" that may be executed by the kernel to perform specific tasks, such as -putting a device into a low-power state. These control methods are encoded -using special byte-code language called the ACPI Machine Language (AML) and -stored in the machine's BIOS. The kernel loads them from the BIOS and executes -them as needed using an AML interpreter that translates the AML byte code into -computations and memory or I/O space accesses. This way, in theory, a BIOS -writer can provide the kernel with a means to perform actions depending -on the system design in a system-specific fashion. - -ACPI control methods may be divided into global control methods, that are not -associated with any particular devices, and device control methods, that have -to be defined separately for each device supposed to be handled with the help of -the platform. This means, in particular, that ACPI device control methods can -only be used to handle devices that the BIOS writer knew about in advance. The -ACPI methods used for device power management fall into that category. - -The ACPI specification assumes that devices can be in one of four power states -labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM -D0-D3 states (although the difference between D3hot and D3cold is not taken -into account by ACPI). Moreover, for each power state of a device there is a -set of power resources that have to be enabled for the device to be put into -that state. These power resources are controlled (i.e. enabled or disabled) -with the help of their own control methods, _ON and _OFF, that have to be -defined individually for each of them. - -To put a device into the ACPI power state Dx (where x is a number between 0 and -3 inclusive) the kernel is supposed to (1) enable the power resources required -by the device in this state using their _ON control methods and (2) execute the -_PSx control method defined for the device. In addition to that, if the device -is going to be put into a low-power state (D1-D3) and is supposed to generate -wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI -3.0) control method defined for it has to be executed before _PSx. Power -resources that are not required by the device in the target power state and are -not required any more by any other device should be disabled (by executing their -_OFF control methods). If the current power state of the device is D3, it can -only be put into D0 this way. - -However, quite often the power states of devices are changed during a -system-wide transition into a sleep state or back into the working state. ACPI -defines four system sleep states, S1, S2, S3, and S4, and denotes the system -working state as S0. In general, the target system sleep (or working) state -determines the highest power (lowest number) state the device can be put -into and the kernel is supposed to obtain this information by executing the -device's _SxD control method (where x is a number between 0 and 4 inclusive). -If the device is required to wake up the system from the target sleep state, the -lowest power (highest number) state it can be put into is also determined by the -target state of the system. 
The kernel is then supposed to use the device's -_SxW control method to obtain the number of that state. It also is supposed to -use the device's _PRW control method to learn which power resources need to be -enabled for the device to be able to generate wakeup signals. - -1.4. Wakeup Signaling ---------------------- -Wakeup signals generated by PCI devices, either as native PCI PMEs, or as -a result of the execution of the _DSW (or _PSW) ACPI control method before -putting the device into a low-power state, have to be caught and handled as -appropriate. If they are sent while the system is in the working state -(ACPI S0), they should be translated into interrupts so that the kernel can -put the devices generating them into the full-power state and take care of the -events that triggered them. In turn, if they are sent while the system is -sleeping, they should cause the system's core logic to trigger wakeup. - -On ACPI-based systems wakeup signals sent by conventional PCI devices are -converted into ACPI General-Purpose Events (GPEs) which are hardware signals -from the system core logic generated in response to various events that need to -be acted upon. Every GPE is associated with one or more sources of potentially -interesting events. In particular, a GPE may be associated with a PCI device -capable of signaling wakeup. The information on the connections between GPEs -and event sources is recorded in the system's ACPI BIOS from where it can be -read by the kernel. - -If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE -associated with it (if there is one) is triggered. The GPEs associated with PCI -bridges may also be triggered in response to a wakeup signal from one of the -devices below the bridge (this also is the case for root bridges) and, for -example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be -handled this way. - -A GPE may be triggered when the system is sleeping (i.e. when it is in one of -the ACPI S1-S4 states), in which case system wakeup is started by its core logic -(the device that was the source of the signal causing the system wakeup to occur -may be identified later). The GPEs used in such situations are referred to as -wakeup GPEs. - -Usually, however, GPEs are also triggered when the system is in the working -state (ACPI S0) and in that case the system's core logic generates a System -Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI -handler identifies the GPE that caused the interrupt to be generated which, -in turn, allows the kernel to identify the source of the event (that may be -a PCI device signaling wakeup). The GPEs used for notifying the kernel of -events occurring while the system is in the working state are referred to as -runtime GPEs. - -Unfortunately, there is no standard way of handling wakeup signals sent by -conventional PCI devices on systems that are not ACPI-based, but there is one -for PCI Express devices. Namely, the PCI Express Base Specification introduced -a native mechanism for converting native PCI PMEs into interrupts generated by -root ports. For conventional PCI devices native PMEs are out-of-band, so they -are routed separately and they need not pass through bridges (in principle they -may be routed directly to the system's core logic), but for PCI Express devices -they are in-band messages that have to pass through the PCI Express hierarchy, -including the root port on the path from the device to the Root Complex. 
Thus -it was possible to introduce a mechanism by which a root port generates an -interrupt whenever it receives a PME message from one of the devices below it. -The PCI Express Requester ID of the device that sent the PME message is then -recorded in one of the root port's configuration registers from where it may be -read by the interrupt handler allowing the device to be identified. [PME -messages sent by PCI Express endpoints integrated with the Root Complex don't -pass through root ports, but instead they cause a Root Complex Event Collector -(if there is one) to generate interrupts.] - -In principle the native PCI Express PME signaling may also be used on ACPI-based -systems along with the GPEs, but to use it the kernel has to ask the system's -ACPI BIOS to release control of root port configuration registers. The ACPI -BIOS, however, is not required to allow the kernel to control these registers -and if it doesn't do that, the kernel must not modify their contents. Of course -the native PCI Express PME signaling cannot be used by the kernel in that case. - - -2. PCI Subsystem and Device Power Management -============================================ - -2.1. Device Power Management Callbacks --------------------------------------- -The PCI Subsystem participates in the power management of PCI devices in a -number of ways. First of all, it provides an intermediate code layer between -the device power management core (PM core) and PCI device drivers. -Specifically, the pm field of the PCI subsystem's struct bus_type object, -pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing -pointers to several device power management callbacks: - -const struct dev_pm_ops pci_dev_pm_ops = { - .prepare = pci_pm_prepare, - .complete = pci_pm_complete, - .suspend = pci_pm_suspend, - .resume = pci_pm_resume, - .freeze = pci_pm_freeze, - .thaw = pci_pm_thaw, - .poweroff = pci_pm_poweroff, - .restore = pci_pm_restore, - .suspend_noirq = pci_pm_suspend_noirq, - .resume_noirq = pci_pm_resume_noirq, - .freeze_noirq = pci_pm_freeze_noirq, - .thaw_noirq = pci_pm_thaw_noirq, - .poweroff_noirq = pci_pm_poweroff_noirq, - .restore_noirq = pci_pm_restore_noirq, - .runtime_suspend = pci_pm_runtime_suspend, - .runtime_resume = pci_pm_runtime_resume, - .runtime_idle = pci_pm_runtime_idle, -}; - -These callbacks are executed by the PM core in various situations related to -device power management and they, in turn, execute power management callbacks -provided by PCI device drivers. They also perform power management operations -involving some standard configuration registers of PCI devices that device -drivers need not know or care about. - -The structure representing a PCI device, struct pci_dev, contains several fields -that these callbacks operate on: - -struct pci_dev { - ... - pci_power_t current_state; /* Current operating state. */ - int pm_cap; /* PM capability offset in the - configuration space */ - unsigned int pme_support:5; /* Bitmask of states from which PME# - can be generated */ - unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */ - unsigned int d1_support:1; /* Low power state D1 is supported */ - unsigned int d2_support:1; /* Low power state D2 is supported */ - unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ - unsigned int wakeup_prepared:1; /* Device prepared for wake up */ - unsigned int d3_delay; /* D3->D0 transition time in ms */ - ... -}; - -They also indirectly use some fields of the struct device that is embedded in -struct pci_dev. - -2.2. 
Device Initialization --------------------------- -The PCI subsystem's first task related to device power management is to -prepare the device for power management and initialize the fields of struct -pci_dev used for this purpose. This happens in two functions defined in -drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). - -The first of these functions checks if the device supports native PCI PM -and if that's the case the offset of its power management capability structure -in the configuration space is stored in the pm_cap field of the device's struct -pci_dev object. Next, the function checks which PCI low-power states are -supported by the device and from which low-power states the device can generate -native PCI PMEs. The power management fields of the device's struct pci_dev and -the struct device embedded in it are updated accordingly and the generation of -PMEs by the device is disabled. - -The second function checks if the device can be prepared to signal wakeup with -the help of the platform firmware, such as the ACPI BIOS. If that is the case, -the function updates the wakeup fields in struct device embedded in the -device's struct pci_dev and uses the firmware-provided method to prevent the -device from signaling wakeup. - -At this point the device is ready for power management. For driverless devices, -however, this functionality is limited to a few basic operations carried out -during system-wide transitions to a sleep state and back to the working state. - -2.3. Runtime Device Power Management ------------------------------------- -The PCI subsystem plays a vital role in the runtime power management of PCI -devices. For this purpose it uses the general runtime power management -(runtime PM) framework described in Documentation/power/runtime_pm.txt. -Namely, it provides subsystem-level callbacks: - - pci_pm_runtime_suspend() - pci_pm_runtime_resume() - pci_pm_runtime_idle() - -that are executed by the core runtime PM routines. It also implements the -entire mechanics necessary for handling runtime wakeup signals from PCI devices -in low-power states, which at the time of this writing works for both the native -PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in -Section 1. - -First, a PCI device is put into a low-power state, or suspended, with the help -of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call -pci_pm_runtime_suspend() to do the actual job. For this to work, the device's -driver has to provide a pm->runtime_suspend() callback (see below), which is -run by pci_pm_runtime_suspend() as the first action. If the driver's callback -returns successfully, the device's standard configuration registers are saved, -the device is prepared to generate wakeup signals and, finally, it is put into -the target low-power state. - -The low-power state to put the device into is the lowest-power (highest number) -state from which it can signal wakeup. The exact method of signaling wakeup is -system-dependent and is determined by the PCI subsystem on the basis of the -reported capabilities of the device and the platform firmware. To prepare the -device for signaling wakeup and put it into the selected low-power state, the -PCI subsystem can use the platform firmware as well as the device's native PCI -PM capabilities, if supported. - -It is expected that the device driver's pm->runtime_suspend() callback will -not attempt to prepare the device for signaling wakeup or to put it into a -low-power state. 
The driver ought to leave these tasks to the PCI subsystem
-that has all of the information necessary to perform them.
-
-A suspended device is brought back into the "active" state, or resumed,
-with the help of pm_request_resume() or pm_runtime_resume() which both call
-pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's
-driver provides a pm->runtime_resume() callback (see below). However, before
-the driver's callback is executed, pci_pm_runtime_resume() brings the device
-back into the full-power state, prevents it from signaling wakeup while in that
-state and restores its standard configuration registers. Thus the driver's
-callback need not worry about the PCI-specific aspects of the device resume.
-
-Note that generally pci_pm_runtime_resume() may be called in two different
-situations. First, it may be called at the request of the device's driver, for
-example if there are some data for it to process. Second, it may be called
-as a result of a wakeup signal from the device itself (this sometimes is
-referred to as "remote wakeup"). Of course, for this purpose the wakeup signal
-is handled in one of the ways described in Section 1 and finally converted into
-a notification for the PCI subsystem after the source device has been
-identified.
-
-The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
-and pm_request_idle(), executes the device driver's pm->runtime_idle()
-callback, if defined, and if that callback doesn't return an error code (or is
-not present at all), suspends the device with the help of pm_runtime_suspend().
-Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
-example, it is called right after the device has been resumed), in which
-cases it is expected to suspend the device if that makes sense. Usually,
-however, the PCI subsystem doesn't know whether the device really can be
-suspended, so it lets the device's driver decide by running its
-pm->runtime_idle() callback.
-
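-As an illustration of this flow, a driver's I/O submission path might bracket
-hardware accesses with the runtime PM helpers as in the sketch below. This is
-only a sketch: the mydrv_* and priv names are hypothetical, error handling is
-minimal, and the helpers are documented in Documentation/power/runtime_pm.txt.
-
-	static int mydrv_start_io(struct mydrv_priv *priv)
-	{
-		struct device *dev = &priv->pdev->dev;
-		int err;
-
-		/* Resume the device if it is suspended; this ends up in
-		 * pci_pm_runtime_resume() and hence in the driver's
-		 * pm->runtime_resume() callback. */
-		err = pm_runtime_get_sync(dev);
-		if (err < 0) {
-			pm_runtime_put_noidle(dev);
-			return err;
-		}
-
-		/* ... program the hardware and start the transfer ... */
-
-		/* Drop the reference when done; the PM core may then run
-		 * pci_pm_runtime_idle() and suspend the device again. */
-		pm_runtime_put(dev);
-		return 0;
-	}
-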
-2.4. System-Wide Power Transitions
-----------------------------------
-There are a few different types of system-wide power transitions, described in
-Documentation/driver-api/pm/devices.rst. Each of them requires devices to be handled
-in a specific way and the PM core executes subsystem-level power management
-callbacks for this purpose. They are executed in phases such that each phase
-involves executing the same subsystem-level callback for every device belonging
-to the given subsystem before the next phase begins. These phases always run
-after tasks have been frozen.
-
-2.4.1. System Suspend
-
-When the system is going into a sleep state in which the contents of memory will
-be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
-
-	prepare, suspend, suspend_noirq.
-
-The following PCI bus type's callbacks, respectively, are used in these phases:
-
-	pci_pm_prepare()
-	pci_pm_suspend()
-	pci_pm_suspend_noirq()
-
-The pci_pm_prepare() routine first puts the device into the "fully functional"
-state with the help of pm_runtime_resume(). Then, it executes the device
-driver's pm->prepare() callback if defined (i.e. if the driver's struct
-dev_pm_ops object is present and the prepare pointer in that object is valid).
-
-The pci_pm_suspend() routine first checks if the device's driver implements
-legacy PCI suspend routines (see Section 3), in which case the driver's legacy
-suspend callback is executed, if present, and its result is returned. Next, if
-the device's driver doesn't provide a struct dev_pm_ops object (containing
-pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
-simply turns off the device's bus master capability and runs
-pcibios_disable_device() to disable it, unless the device is a bridge (PCI
-bridges are ignored by this routine). Then, the device driver's pm->suspend()
-callback is executed, if defined, and its result is returned if it fails.
-Finally, pci_fixup_device() is called to apply hardware suspend quirks related
-to the device if necessary.
-
-Note that the suspend phase is carried out asynchronously for PCI devices, so
-the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
-devices that don't depend on each other in a known way (i.e. none of the paths
-in the device tree from the root bridge to a leaf device contains both of them).
-
-The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
-been called, which means that the device driver's interrupt handler won't be
-invoked while this routine is running. It first checks if the device's driver
-implements legacy PCI suspend routines (see Section 3), in which case the legacy
-late suspend routine is called and its result is returned (the standard
-configuration registers of the device are saved if the driver's callback hasn't
-done that). Second, if the device driver's struct dev_pm_ops object is not
-present, the device's standard configuration registers are saved and the routine
-returns success. Otherwise the device driver's pm->suspend_noirq() callback is
-executed, if present, and its result is returned if it fails. Next, if the
-device's standard configuration registers haven't been saved yet (one of the
-device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
-saves them, prepares the device to signal wakeup (if necessary) and puts it into
-a low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup while the system is in the target sleep
-state. Just like in the runtime PM case described above, the mechanism of
-signaling wakeup is system-dependent and determined by the PCI subsystem, which
-is also responsible for preparing the device to signal wakeup from the system's
-target sleep state as appropriate.
-
-PCI device drivers (that don't implement legacy power management callbacks) are
-generally not expected to prepare devices for signaling wakeup or to put them
-into low-power states. However, if one of the driver's suspend callbacks
-(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
-registers, pci_pm_suspend_noirq() will assume that the device has been prepared
-to signal wakeup and put into a low-power state by the driver (the driver is
-then assumed to have used the helper functions provided by the PCI subsystem for
-this purpose). PCI device drivers are not encouraged to do that, but in some
-rare cases doing that in the driver may be the optimum approach.
-
-2.4.2. System Resume
-
-When the system is undergoing a transition from a sleep state in which the
-contents of memory have been preserved, such as one of the ACPI sleep states
-S1-S3, into the working state (ACPI S0), the phases are:
-
-	resume_noirq, resume, complete.
-
-The following PCI bus type's callbacks, respectively, are executed in these
-phases:
-
-	pci_pm_resume_noirq()
-	pci_pm_resume()
-	pci_pm_complete()
-
-The pci_pm_resume_noirq() routine first puts the device into the full-power
-state, restores its standard configuration registers and applies early resume
-hardware quirks related to the device, if necessary. This is done
-unconditionally, regardless of whether or not the device's driver implements
-legacy PCI power management callbacks (this way all PCI devices are in the
-full-power state and their standard configuration registers have been restored
-when their interrupt handlers are invoked for the first time during resume,
-which allows the kernel to avoid problems with the handling of shared interrupts
-by drivers whose devices are still suspended). If legacy PCI power management
-callbacks (see Section 3) are implemented by the device's driver, the legacy
-early resume callback is executed and its result is returned. Otherwise, the
-device driver's pm->resume_noirq() callback is executed, if defined, and its
-result is returned.
-
-The pci_pm_resume() routine first checks if the device's standard configuration
-registers have been restored and restores them if that's not the case (this
-is only necessary in the error path during a failing suspend). Next, resume
-hardware quirks related to the device are applied, if necessary, and if the
-device's driver implements legacy PCI power management callbacks (see
-Section 3), the driver's legacy resume callback is executed and its result is
-returned. Otherwise, the device's wakeup signaling mechanisms are blocked and
-its driver's pm->resume() callback is executed, if defined (the callback's
-result is then returned).
-
-The resume phase is carried out asynchronously for PCI devices, like the
-suspend phase described above, which means that if two PCI devices don't depend
-on each other in a known way, the pci_pm_resume() routine may be executed for
-both of them in parallel.
-
-The pci_pm_complete() routine only executes the device driver's pm->complete()
-callback, if defined.
-
-2.4.3. System Hibernation
-
-System hibernation is more complicated than system suspend, because it requires
-a system image to be created and written into a persistent storage medium. The
-image is created atomically and all devices are quiesced, or frozen, before that
-happens.
-
-The freezing of devices is carried out after enough memory has been freed (at
-the time of this writing the image creation requires at least 50% of system RAM
-to be free) in the following three phases:
-
-	prepare, freeze, freeze_noirq
-
-that correspond to the PCI bus type's callbacks:
-
-	pci_pm_prepare()
-	pci_pm_freeze()
-	pci_pm_freeze_noirq()
-
-This means that the prepare phase is exactly the same as for system suspend.
-The other two phases, however, are different.
-
-The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
-the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
-and it doesn't apply the suspend-related hardware quirks. It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The pci_pm_freeze_noirq() routine, in turn, is similar to
-pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
-routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the
-device for signaling wakeup or to put it into a low-power state.
Still, it saves
-the device's standard configuration registers if they haven't been saved by one
-of the driver's callbacks.
-
-Once the image has been created, it has to be saved. However, at this point all
-devices are frozen and they cannot handle I/O, while their ability to handle
-I/O is obviously necessary for saving the image. Thus they have to be brought
-back to the fully functional state and this is done in the following phases:
-
-	thaw_noirq, thaw, complete
-
-using the following PCI bus type's callbacks:
-
-	pci_pm_thaw_noirq()
-	pci_pm_thaw()
-	pci_pm_complete()
-
-respectively.
-
-The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
-but it doesn't put the device into the full power state and doesn't attempt to
-restore its standard configuration registers. It also executes the device
-driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
-
-The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
-driver's pm->thaw() callback instead of pm->resume(). It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The complete phase is the same as for system resume.
-
-After saving the image, devices need to be powered down before the system can
-enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in
-three phases:
-
-	prepare, poweroff, poweroff_noirq
-
-where the prepare phase is exactly the same as for system suspend. The other
-two phases are analogous to the suspend and suspend_noirq phases, respectively.
-The PCI subsystem-level callbacks they correspond to,
-
-	pci_pm_poweroff()
-	pci_pm_poweroff_noirq()
-
-work in analogy with pci_pm_suspend() and pci_pm_suspend_noirq(), respectively,
-although they don't attempt to save the device's standard configuration
-registers.
-
-2.4.4. System Restore
-
-System restore requires a hibernation image to be loaded into memory and the
-pre-hibernation memory contents to be restored before the pre-hibernation system
-activity can be resumed.
-
-As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
-into memory by a fresh instance of the kernel, called the boot kernel, which in
-turn is loaded and run by a boot loader in the usual way. After the boot kernel
-has loaded the image, it needs to replace its own code and data with the code
-and data of the "hibernated" kernel stored within the image, called the image
-kernel. For this purpose all devices are frozen just like before creating
-the image during hibernation, in the
-
-	prepare, freeze, freeze_noirq
-
-phases described above. However, the devices affected by these phases are only
-those having drivers in the boot kernel; other devices will still be in whatever
-state the boot loader left them.
-
-Should the restoration of the pre-hibernation memory contents fail, the boot
-kernel would go through the "thawing" procedure described above, using the
-thaw_noirq, thaw, and complete phases (that will only affect the devices having
-drivers in the boot kernel), and then continue running normally.
-
-If the pre-hibernation memory contents are restored successfully, which is the
-usual situation, control is passed to the image kernel, which then becomes
-responsible for bringing the system back to the working state.
To achieve this, -it must restore the devices' pre-hibernation functionality, which is done much -like waking up from the memory sleep state, although it involves different -phases: - - restore_noirq, restore, complete - -The first two of these are analogous to the resume_noirq and resume phases -described above, respectively, and correspond to the following PCI subsystem -callbacks: - - pci_pm_restore_noirq() - pci_pm_restore() - -These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(), -respectively, but they execute the device driver's pm->restore_noirq() and -pm->restore() callbacks, if available. - -The complete phase is carried out in exactly the same way as during system -resume. - - -3. PCI Device Drivers and Power Management -========================================== - -3.1. Power Management Callbacks -------------------------------- -PCI device drivers participate in power management by providing callbacks to be -executed by the PCI subsystem's power management routines described above and by -controlling the runtime power management of their devices. - -At the time of this writing there are two ways to define power management -callbacks for a PCI device driver, the recommended one, based on using a -dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the -"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and -.resume() callbacks from struct pci_driver are used. The legacy approach, -however, doesn't allow one to define runtime power management callbacks and is -not really suitable for any new drivers. Therefore it is not covered by this -document (refer to the source code to learn more about it). - -It is recommended that all PCI device drivers define a struct dev_pm_ops object -containing pointers to power management (PM) callbacks that will be executed by -the PCI subsystem's PM routines in various circumstances. A pointer to the -driver's struct dev_pm_ops object has to be assigned to the driver.pm field in -its struct pci_driver object. Once that has happened, the "legacy" PM callbacks -in struct pci_driver are ignored (even if they are not NULL). - -The PM callbacks in struct dev_pm_ops are not mandatory and if they are not -defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI -subsystem will handle the device in a simplified default manner. If they are -defined, though, they are expected to behave as described in the following -subsections. - -3.1.1. prepare() - -The prepare() callback is executed during system suspend, during hibernation -(when a hibernation image is about to be created), during power-off after -saving a hibernation image and during system restore, when a hibernation image -has just been loaded into memory. - -This callback is only necessary if the driver's device has children that in -general may be registered at any time. In that case the role of the prepare() -callback is to prevent new children of the device from being registered until -one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run. - -In addition to that the prepare() callback may carry out some operations -preparing the device to be suspended, although it should not allocate memory -(if additional memory is required to suspend the device, it has to be -preallocated earlier, for example in a suspend/hibernate notifier as described -in Documentation/driver-api/pm/notifiers.rst). - -3.1.2. 
suspend()
-
-The suspend() callback is only executed during system suspend, after prepare()
-callbacks have been executed for all devices in the system.
-
-This callback is expected to quiesce the device and prepare it to be put into a
-low-power state by the PCI subsystem. It is not required (in fact, it is not
-even recommended) that a PCI driver's suspend() callback save the standard
-configuration registers of the device, prepare it for waking up the system, or
-put it into a low-power state. All of these operations can very well be taken
-care of by the PCI subsystem, without the driver's participation.
-
-However, in some rare cases it is convenient to carry out these operations in
-a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and
-pci_set_power_state() should be used to save the device's standard configuration
-registers, to prepare it for system wakeup (if necessary), and to put it into a
-low-power state, respectively. Moreover, if the driver calls pci_save_state(),
-the PCI subsystem will not execute either pci_prepare_to_sleep(), or
-pci_set_power_state() for its device, so the driver is then responsible for
-handling the device as appropriate (a sketch of this rare, driver-managed
-variant follows after subsection 3.1.4 below).
-
-While the suspend() callback is being executed, the driver's interrupt handler
-can be invoked to handle an interrupt from the device, so all suspend-related
-operations relying on the driver's ability to handle interrupts should be
-carried out in this callback.
-
-3.1.3. suspend_noirq()
-
-The suspend_noirq() callback is only executed during system suspend, after
-suspend() callbacks have been executed for all devices in the system and
-after device interrupts have been disabled by the PM core.
-
-The difference between suspend_noirq() and suspend() is that the driver's
-interrupt handler will not be invoked while suspend_noirq() is running. Thus
-suspend_noirq() can carry out operations that would cause race conditions to
-arise if they were performed in suspend().
-
-3.1.4. freeze()
-
-The freeze() callback is hibernation-specific and is executed in two situations,
-during hibernation, after prepare() callbacks have been executed for all devices
-in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory from persistent storage and the
-prepare() callbacks have been executed for all devices.
-
-The role of this callback is analogous to the role of the suspend() callback
-described above. In fact, they only need to be different in the rare cases when
-the driver takes the responsibility for putting the device into a low-power
-state.
-
-In those cases the freeze() callback should not prepare the device for system
-wakeup or put it into a low-power state. Still, either it or freeze_noirq()
-should save the device's standard configuration registers using pci_save_state().
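-
-For the rare drivers that do take over these operations, the division of labor
-described in subsections 3.1.2 and 3.1.4 might look like the following sketch
-(the mydrv_* names are hypothetical and the device-quiescing details are
-omitted):
-
-	static int mydrv_suspend(struct device *dev)
-	{
-		struct pci_dev *pdev = to_pci_dev(dev);
-
-		/* ... quiesce the device: stop DMA, mask interrupts ... */
-
-		/* Saving the state tells the PCI subsystem that the driver
-		 * takes care of wakeup and the low-power transition itself. */
-		pci_save_state(pdev);
-		pci_prepare_to_sleep(pdev);	/* arm wakeup, enter low power */
-
-		return 0;
-	}
-
-	static int mydrv_freeze(struct device *dev)
-	{
-		struct pci_dev *pdev = to_pci_dev(dev);
-
-		/* ... quiesce the device ... */
-
-		/* Only save the registers; no wakeup preparation and no
-		 * power state change while the hibernation image is made. */
-		pci_save_state(pdev);
-
-		return 0;
-	}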
-
-3.1.5. freeze_noirq()
-
-The freeze_noirq() callback is hibernation-specific. It is executed during
-hibernation, after prepare() and freeze() callbacks have been executed for all
-devices in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory and after prepare() and
-freeze() callbacks have been executed for all devices. It is always executed
-after device interrupts have been disabled by the PM core.
-
-The role of this callback is analogous to the role of the suspend_noirq()
-callback described above and it is very rarely necessary to define
-freeze_noirq().
-
-The difference between freeze_noirq() and freeze() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.6. poweroff()
-
-The poweroff() callback is hibernation-specific. It is executed when the system
-is about to be powered off after saving a hibernation image to persistent
-storage. prepare() callbacks are executed for all devices before poweroff() is
-called.
-
-The role of this callback is analogous to the role of the suspend() and freeze()
-callbacks described above, although it does not need to save the contents of
-the device's registers. In particular, if the driver wants to put the device
-into a low-power state itself instead of allowing the PCI subsystem to do that,
-the poweroff() callback should use pci_prepare_to_sleep() and
-pci_set_power_state() to prepare the device for system wakeup and to put it
-into a low-power state, respectively, but it need not save the device's standard
-configuration registers.
-
-3.1.7. poweroff_noirq()
-
-The poweroff_noirq() callback is hibernation-specific. It is executed after
-poweroff() callbacks have been executed for all devices in the system.
-
-The role of this callback is analogous to the role of the suspend_noirq() and
-freeze_noirq() callbacks described above, but it does not need to save the
-contents of the device's registers.
-
-The difference between poweroff_noirq() and poweroff() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.8. resume_noirq()
-
-The resume_noirq() callback is only executed during system resume, after the
-PM core has enabled the non-boot CPUs. The driver's interrupt handler will not
-be invoked while resume_noirq() is running, so this callback can carry out
-operations that might race with the interrupt handler.
-
-Since the PCI subsystem unconditionally puts all devices into the full power
-state in the resume_noirq phase of system resume and restores their standard
-configuration registers, resume_noirq() is usually not necessary. In general
-it should only be used for performing operations that would lead to race
-conditions if carried out by resume().
-
-3.1.9. resume()
-
-The resume() callback is only executed during system resume, after
-resume_noirq() callbacks have been executed for all devices in the system and
-device interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-suspend configuration of the
-device and bringing it back to the fully functional state. The device should be
-able to process I/O in a usual way after resume() has returned.
-
-3.1.10. thaw_noirq()
-
-The thaw_noirq() callback is hibernation-specific. It is executed after a
-system image has been created and the non-boot CPUs have been enabled by the PM
-core, in the thaw_noirq phase of hibernation. It also may be executed if the
-loading of a hibernation image fails during system restore (it is then executed
-after enabling the non-boot CPUs). The driver's interrupt handler will not be
-invoked while thaw_noirq() is running.
-
-The role of this callback is analogous to the role of resume_noirq(). The
-difference between these two callbacks is that thaw_noirq() is executed after
-freeze() and freeze_noirq(), so in general it does not need to modify the
-contents of the device's registers.
-
-3.1.11. thaw()
-
-The thaw() callback is hibernation-specific. It is executed after thaw_noirq()
-callbacks have been executed for all devices in the system and after device
-interrupts have been enabled by the PM core.
- -This callback is responsible for restoring the pre-freeze configuration of -the device, so that it will work in a usual way after thaw() has returned. - -3.1.12. restore_noirq() - -The restore_noirq() callback is hibernation-specific. It is executed in the -restore_noirq phase of hibernation, when the boot kernel has passed control to -the image kernel and the non-boot CPUs have been enabled by the image kernel's -PM core. - -This callback is analogous to resume_noirq() with the exception that it cannot -make any assumption on the previous state of the device, even if the BIOS (or -generally the platform firmware) is known to preserve that state over a -suspend-resume cycle. - -For the vast majority of PCI device drivers there is no difference between -resume_noirq() and restore_noirq(). - -3.1.13. restore() - -The restore() callback is hibernation-specific. It is executed after -restore_noirq() callbacks have been executed for all devices in the system and -after the PM core has enabled device drivers' interrupt handlers to be invoked. - -This callback is analogous to resume(), just like restore_noirq() is analogous -to resume_noirq(). Consequently, the difference between restore_noirq() and -restore() is analogous to the difference between resume_noirq() and resume(). - -For the vast majority of PCI device drivers there is no difference between -resume() and restore(). - -3.1.14. complete() - -The complete() callback is executed in the following situations: - - during system resume, after resume() callbacks have been executed for all - devices, - - during hibernation, before saving the system image, after thaw() callbacks - have been executed for all devices, - - during system restore, when the system is going back to its pre-hibernation - state, after restore() callbacks have been executed for all devices. -It also may be executed if the loading of a hibernation image into memory fails -(in that case it is run after thaw() callbacks have been executed for all -devices that have drivers in the boot kernel). - -This callback is entirely optional, although it may be necessary if the -prepare() callback performs operations that need to be reversed. - -3.1.15. runtime_suspend() - -The runtime_suspend() callback is specific to device runtime power management -(runtime PM). It is executed by the PM core's runtime PM framework when the -device is about to be suspended (i.e. quiesced and put into a low-power state) -at run time. - -This callback is responsible for freezing the device and preparing it to be -put into a low-power state, but it must allow the PCI subsystem to perform all -of the PCI-specific actions necessary for suspending the device. - -3.1.16. runtime_resume() - -The runtime_resume() callback is specific to device runtime PM. It is executed -by the PM core's runtime PM framework when the device is about to be resumed -(i.e. put into the full-power state and programmed to process I/O normally) at -run time. - -This callback is responsible for restoring the normal functionality of the -device after it has been put into the full-power state by the PCI subsystem. -The device is expected to be able to process I/O in the usual way after -runtime_resume() has returned. - -3.1.17. runtime_idle() - -The runtime_idle() callback is specific to device runtime PM. It is executed -by the PM core's runtime PM framework whenever it may be desirable to suspend -the device according to the PM core's information. 
In particular, it is
-automatically executed right after runtime_resume() has returned in case the
-resume of the device has happened as a result of a spurious event.
-
-This callback is optional, but if it is not implemented or if it returns 0, the
-PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
-cause the driver's runtime_suspend() callback to be executed.
-
-3.1.18. Pointing Multiple Callback Pointers to One Routine
-
-Although in principle each of the callbacks described in the previous
-subsections can be defined as a separate function, it often is convenient to
-point two or more members of struct dev_pm_ops to the same routine. There are
-a few convenience macros that can be used for this purpose.
-
-The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
-suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
-members and one resume routine pointed to by the .resume(), .thaw(), and
-.restore() members. The other function pointers in this struct dev_pm_ops are
-unset.
-
-The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
-additionally sets the .runtime_resume() pointer to the same value as
-.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
-the same value as .suspend() (and .freeze() and .poweroff()).
-
-The SET_SYSTEM_SLEEP_PM_OPS macro can be used inside a declaration of struct
-dev_pm_ops to indicate that one suspend routine is to be pointed to by the
-.suspend(), .freeze(), and .poweroff() members and one resume routine is to
-be pointed to by the .resume(), .thaw(), and .restore() members.
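-
-For example, a driver that only distinguishes between "suspend" and "resume"
-might wire up its callbacks and its struct pci_driver object as in the sketch
-below (the mydrv_* names are hypothetical):
-
-	static SIMPLE_DEV_PM_OPS(mydrv_pm_ops, mydrv_suspend, mydrv_resume);
-
-	static struct pci_driver mydrv_pci_driver = {
-		.name		= "mydrv",
-		.id_table	= mydrv_id_table,
-		.probe		= mydrv_probe,
-		.remove		= mydrv_remove,
-		.driver		= {
-			.pm	= &mydrv_pm_ops,
-		},
-	};
-
-With this, mydrv_suspend() also serves as the freeze() and poweroff() callback
-and mydrv_resume() as the thaw() and restore() callback, which is sufficient
-for drivers that don't need to treat hibernation specially.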
-
-3.1.19. Driver Flags for Power Management
-
-The PM core allows device drivers to set flags that influence the handling of
-power management for the devices by the core itself and by middle layer code
-including the PCI bus type. The flags should be set once at driver probe
-time with the help of the dev_pm_set_driver_flags() function and they should not
-be updated directly afterwards.
-
-The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
-mechanism, which allows device suspend/resume callbacks to be skipped if the
-device is in runtime suspend when the system suspend starts. That also affects
-all of the ancestors of the device, so this flag should only be used if
-absolutely necessary.
-
-The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
-positive value from pci_pm_prepare() if the ->prepare callback provided by the
-driver of the device returns a positive value. That allows the driver to opt
-out from using the direct-complete mechanism dynamically.
-
-The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
-perspective the device can be safely left in runtime suspend during system
-suspend. That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
-to skip resuming the device from runtime suspend unless there are PCI-specific
-reasons for doing that. Also, it causes pci_pm_suspend_late/noirq(),
-pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
-if the device remains in runtime suspend in the beginning of the "late" phase
-of the system-wide transition under way. Moreover, if the device is in
-runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
-power management status will be changed to "active" (as it is going to be put
-into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
-the function will set the power.direct_complete flag for it (to make the PM core
-skip the subsequent "thaw" callbacks for it) and return.
-
-Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
-device to be left in suspend after system-wide transitions to the working state.
-This flag is checked by the PM core, but the PCI bus type informs the PM core
-which devices may be left in suspend from its perspective (that happens during
-the "noirq" phase of system-wide suspend and analogous transitions). Next, it
-uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
-pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
-callbacks for the device during the transition under way and will set its
-runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
-it.
-
-3.2. Device Runtime Power Management
-------------------------------------
-In addition to providing device power management callbacks, PCI device drivers
-are responsible for controlling the runtime power management (runtime PM) of
-their devices.
-
-The PCI device runtime PM is optional, but it is recommended that PCI device
-drivers implement it at least in the cases where there is a reliable way of
-verifying that the device is not used (like when the network cable is detached
-from an Ethernet adapter or there are no devices attached to a USB controller).
-
-To support PCI runtime PM the driver first needs to implement the
-runtime_suspend() and runtime_resume() callbacks. It also may need to implement
-the runtime_idle() callback to prevent the device from being suspended again
-immediately after the runtime_resume() callback has returned
-(alternatively, the runtime_suspend() callback will have to check if the
-device should really be suspended and return -EAGAIN if that is not the case).
-
-The runtime PM of PCI devices is enabled by default by the PCI core. PCI
-device drivers do not need to enable it and should not attempt to do so.
-However, it is blocked by pci_pm_init(), which runs the pm_runtime_forbid()
-helper function. In addition to that, the runtime PM usage counter of
-each PCI device is incremented by local_pci_probe() before executing the
-probe callback provided by the device's driver.
-
-If a PCI driver implements the runtime PM callbacks and intends to use the
-runtime PM framework provided by the PM core and the PCI subsystem, it needs
-to decrement the device's runtime PM usage counter in its probe callback
-function. If it doesn't do that, the counter will always be different from
-zero for the device and it will never be runtime-suspended. The simplest
-way to do that is by calling pm_runtime_put_noidle(), but if the driver
-wants to schedule an autosuspend right away, for example, it may call
-pm_runtime_put_autosuspend() instead for this purpose. Generally, it
-just needs to call a function that decrements the device's usage counter
-from its probe routine to make runtime PM work for the device.
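-
-For instance, a probe/remove pair honoring these rules might look like the
-sketch below (hypothetical mydrv_* names; device setup and teardown details
-omitted):
-
-	static int mydrv_probe(struct pci_dev *pdev,
-			       const struct pci_device_id *id)
-	{
-		int err;
-
-		err = pci_enable_device(pdev);
-		if (err)
-			return err;
-
-		/* ... map BARs, request the IRQ, initialize the device ... */
-
-		/* Balance the increment done by local_pci_probe() so that
-		 * runtime PM can work once user space allows it. */
-		pm_runtime_put_noidle(&pdev->dev);
-		return 0;
-	}
-
-	static void mydrv_remove(struct pci_dev *pdev)
-	{
-		/* Balance the decrement made in mydrv_probe(). */
-		pm_runtime_get_noresume(&pdev->dev);
-
-		/* ... tear down the device ... */
-		pci_disable_device(pdev);
-	}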
-
-It is important to remember that the driver's runtime_suspend() callback
-may be executed right after the usage counter has been decremented, because
-user space may already have run the pm_runtime_allow() helper function,
-which unblocks the runtime PM of the device, via sysfs, so the driver must
-be prepared to cope with that.
-
-The driver itself should not call pm_runtime_allow(), though. Instead, it
-should let user space or some platform-specific code do that (user space can
-do it via sysfs as stated above), but it must be prepared to handle the
-runtime PM of the device correctly as soon as pm_runtime_allow() is called
-(which may happen at any time, even before the driver is loaded).
-
-When the driver's remove callback runs, it has to balance the decrement of
-the device's runtime PM usage counter made at probe time. For this reason,
-if it has decremented the counter in its probe callback, it must run
-pm_runtime_get_noresume() in its remove callback. [Since the core carries
-out a runtime resume of the device and bumps up the device's usage counter
-before running the driver's remove callback, the runtime PM of the device
-is effectively disabled for the duration of the remove execution and all
-runtime PM helper functions incrementing the device's usage counter are
-then effectively equivalent to pm_runtime_get_noresume().]
-
-The runtime PM framework works by processing requests to suspend or resume
-devices, or to check if they are idle (in which cases it is reasonable to
-subsequently request that they be suspended). These requests are represented
-by work items put into the power management workqueue, pm_wq. Although there
-are a few situations in which power management requests are automatically
-queued by the PM core (for example, after processing a request to resume a
-device the PM core automatically queues a request to check if the device is
-idle), device drivers are generally responsible for queuing power management
-requests for their devices. For this purpose they should use the runtime PM
-helper functions provided by the PM core, discussed in
-Documentation/power/runtime_pm.txt.
-
-Devices can also be suspended and resumed synchronously, without placing a
-request into pm_wq. In the majority of cases this is also done by their
-drivers, using helper functions provided by the PM core for this purpose.
-
-For more information on the runtime PM of devices refer to
-Documentation/power/runtime_pm.txt.
-
-
-4. Resources
-============
-
-PCI Local Bus Specification, Rev. 3.0
-PCI Bus Power Management Interface Specification, Rev. 1.2
-Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
-PCI Express Base Specification, Rev. 2.0
-Documentation/driver-api/pm/devices.rst
-Documentation/power/runtime_pm.txt
diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst
new file mode 100644
index 000000000000..945fc6d760c9
--- /dev/null
+++ b/Documentation/power/pm_qos_interface.rst
@@ -0,0 +1,225 @@
+===============================
+PM Quality Of Service Interface
+===============================
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Two different PM QoS frameworks are available:
+1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
+memory_bandwidth.
+2. 
the per-device PM QoS framework, which provides the API to manage per-device
+latency constraints and PM QoS flags.
+
+Each parameter has defined units:
+
+ * latency: usec
+ * timeout: usec
+ * throughput: kbs (kilo bit / sec)
+ * memory bandwidth: mbs (mega bit / sec)
+
+
+1. PM QoS framework
+===================
+
+The infrastructure exposes multiple misc device nodes, one per implemented
+parameter. The set of parameters implemented is defined by pm_qos_power_init()
+and pm_qos_params.h. This is done because having the available parameters
+runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requests is maintained along with
+an aggregated target value. The aggregated target value is updated with
+changes to the request list or elements of the list. Typically the
+aggregated target value is simply the max or min of the request values held
+in the parameter list elements.
+Note: the aggregated target value is implemented as an atomic variable so that
+reading the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is simple:
+
+void pm_qos_add_request(handle, param_class, target_value):
+  Will insert an element into the list for that identified PM QoS class with the
+  target value. Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of pm_qos need to save the returned handle for future use in other
+  pm_qos API functions.
+
+void pm_qos_update_request(handle, new_target_value):
+  Will update the list element pointed to by the handle with the new target value
+  and recompute the new aggregated target, calling the notification tree if the
+  target is changed.
+
+void pm_qos_remove_request(handle):
+  Will remove the element. After removal it will update the aggregate target and
+  call the notification tree if the target was changed as a result of removing
+  the request.
+
+int pm_qos_request(param_class):
+  Returns the aggregated value for a given PM QoS class.
+
+int pm_qos_request_active(handle):
+  Returns whether the request is still active, i.e. it has not been removed from
+  a PM QoS class constraints list.
+
+int pm_qos_add_notifier(param_class, notifier):
+  Adds a notification callback function to the PM QoS class. The callback is
+  called when the aggregated value for the PM QoS class is changed.
+
+int pm_qos_remove_notifier(int param_class, notifier):
+  Removes the notification callback function for the PM QoS class.
+
+
+From user mode:
+
+Only processes can register a pm_qos request. To provide for automatic
+cleanup of a process, the interface requires the process to register its
+parameter requests in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+request on the parameter.
+
+To change the requested target value the process needs to write an s32 value to
+the open device node. Alternatively the user mode program could write a hex
+string for the value using the 10 character long format, e.g. "0x12345678".
+This translates to a pm_qos_update_request call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
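+
+For example, kernel code that wants to keep CPU/DMA latency low while its
+device is actively streaming might use the class interface described above as
+in the following sketch (the mydrv_* names are hypothetical; values are in
+microseconds)::
+
+  static struct pm_qos_request mydrv_qos_req;
+
+  static void mydrv_start_streaming(void)
+  {
+          /* keep CPU/DMA latency at or below 20 usec while streaming */
+          pm_qos_add_request(&mydrv_qos_req, PM_QOS_CPU_DMA_LATENCY, 20);
+  }
+
+  static void mydrv_stop_streaming(void)
+  {
+          /* drop the request when it is no longer needed */
+          pm_qos_remove_request(&mydrv_qos_req);
+  }
+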
+2. PM QoS per-device latency and flags framework
+================================================
+
+For each device, there are three lists of PM QoS requests. Two of them are
+maintained along with the aggregated targets of resume latency and active
+state latency tolerance (in microseconds) and the third one is for PM QoS flags.
+Values are updated in response to changes of the request list.
+
+The target values of resume latency and active state latency tolerance are
+simply the minimum of the request values held in the parameter list elements.
+The PM QoS flags aggregate value is the bitwise OR of all list elements'
+values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
+
+Note: The aggregated target values are implemented in such a way that reading
+the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is the following:
+
+int dev_pm_qos_add_request(device, handle, type, value):
+  Will insert an element into the list for that identified device with the
+  target value. Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of dev_pm_qos need to save the handle for future use in other
+  dev_pm_qos API functions.
+
+int dev_pm_qos_update_request(handle, new_value):
+  Will update the list element pointed to by the handle with the new target
+  value and recompute the new aggregated target, calling the notification
+  trees if the target is changed.
+
+int dev_pm_qos_remove_request(handle):
+  Will remove the element. After removal it will update the aggregate target
+  and call the notification trees if the target was changed as a result of
+  removing the request.
+
+s32 dev_pm_qos_read_value(device):
+  Returns the aggregated value for a given device's constraints list.
+
+enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
+  Check PM QoS flags of the given device against the given mask of flags.
+  The meaning of the return values is as follows:
+
+	PM_QOS_FLAGS_ALL:
+		All flags from the mask are set
+	PM_QOS_FLAGS_SOME:
+		Some flags from the mask are set
+	PM_QOS_FLAGS_NONE:
+		No flags from the mask are set
+	PM_QOS_FLAGS_UNDEFINED:
+		The device's PM QoS structure has not been initialized
+		or the list of requests is empty.
+
+int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
+  Add a PM QoS request for the first direct ancestor of the given device whose
+  power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
+  or whose power.set_latency_tolerance callback pointer is not NULL (for
+  DEV_PM_QOS_LATENCY_TOLERANCE requests).
+
+int dev_pm_qos_expose_latency_limit(device, value)
+  Add a request to the device's PM QoS list of resume latency constraints and
+  create a sysfs attribute pm_qos_resume_latency_us under the device's power
+  directory allowing user space to manipulate that request.
+
+void dev_pm_qos_hide_latency_limit(device)
+  Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
+  PM QoS list of resume latency constraints and remove sysfs attribute
+  pm_qos_resume_latency_us from the device's power directory.
+
+int dev_pm_qos_expose_flags(device, value)
+  Add a request to the device's PM QoS list of flags and create sysfs attribute
+  pm_qos_no_power_off under the device's power directory allowing user space to
+  change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
+
+void dev_pm_qos_hide_flags(device)
+  Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS
+  list of flags and remove sysfs attribute pm_qos_no_power_off from the
+  device's power directory.
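+
+For instance, code managing a device might cap that device's resume latency
+with the calls above, along the lines of the following sketch (the req
+variable is hypothetical and error handling is omitted)::
+
+  static struct dev_pm_qos_request req;
+
+  /* do not let resume of this device take more than 100 usec */
+  dev_pm_qos_add_request(dev, &req, DEV_PM_QOS_RESUME_LATENCY, 100);
+
+  /* ... later, relax the constraint, and finally drop it ... */
+  dev_pm_qos_update_request(&req, 500);
+  dev_pm_qos_remove_request(&req);
+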
+
+Notification mechanisms:
+
+The per-device PM QoS framework has a per-device notification tree.
+
+int dev_pm_qos_add_notifier(device, notifier):
+  Adds a notification callback function for the device.
+  The callback is called when the aggregated value of the device constraints list
+  is changed (for resume latency device PM QoS only).
+
+int dev_pm_qos_remove_notifier(device, notifier):
+  Removes the notification callback function for the device.
+
+
+Active state latency tolerance
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This device PM QoS type is used to support systems in which hardware may switch
+to energy-saving operation modes on the fly. In those systems, if the operation
+mode chosen by the hardware attempts to save energy in an overly aggressive way,
+it may cause excess latencies to be visible to software, causing it to miss
+certain protocol requirements or target frame or sample rates, etc.
+
+If there is a latency tolerance control mechanism for a given device available
+to software, the .set_latency_tolerance callback in that device's dev_pm_info
+structure should be populated. The routine pointed to by it should implement
+whatever is necessary to transfer the effective requirement value to the
+hardware.
+
+Whenever the effective latency tolerance changes for the device, its
+.set_latency_tolerance() callback will be executed and the effective value will
+be passed to it. If that value is negative, which means that the list of
+latency tolerance requirements for the device is empty, the callback is expected
+to switch the underlying hardware latency tolerance control mechanism to an
+autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and
+the hardware supports a special "no requirement" setting, the callback is
+expected to use it. That allows software to prevent the hardware from
+automatically updating the device's latency tolerance in response to its power
+state changes (e.g. during transitions from D3cold to D0), which generally may
+be done in the autonomous latency tolerance control mode.
+
+If .set_latency_tolerance() is present for the device, sysfs attribute
+pm_qos_latency_tolerance_us will be present in the device's power directory.
+Then, user space can use that attribute to specify its latency tolerance
+requirement for the device, if any. Writing "any" to it means "no requirement,
+but do not let the hardware control latency tolerance" and writing "auto" to it
+allows the hardware to be switched to the autonomous mode if there are no other
+requirements from the kernel side in the device's list.
+
+Kernel code can use the functions described above along with the
+DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
+latency tolerance requirements for devices.
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
deleted file mode 100644
index 19c5f7b1a7ba..000000000000
--- a/Documentation/power/pm_qos_interface.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-PM Quality Of Service Interface.
-
-This interface provides a kernel and user mode interface for registering
-performance expectations by drivers, subsystems and user space applications on
-one of the parameters.
-
-Two different PM QoS frameworks are available:
-1. 
PM QoS classes for cpu_dma_latency, network_latency, network_throughput, -memory_bandwidth. -2. the per-device PM QoS framework provides the API to manage the per-device latency -constraints and PM QoS flags. - -Each parameters have defined units: - * latency: usec - * timeout: usec - * throughput: kbs (kilo bit / sec) - * memory bandwidth: mbs (mega bit / sec) - - -1. PM QoS framework - -The infrastructure exposes multiple misc device nodes one per implemented -parameter. The set of parameters implement is defined by pm_qos_power_init() -and pm_qos_params.h. This is done because having the available parameters -being runtime configurable or changeable from a driver was seen as too easy to -abuse. - -For each parameter a list of performance requests is maintained along with -an aggregated target value. The aggregated target value is updated with -changes to the request list or elements of the list. Typically the -aggregated target value is simply the max or min of the request values held -in the parameter list elements. -Note: the aggregated target value is implemented as an atomic variable so that -reading the aggregated value does not require any locking mechanism. - - -From kernel mode the use of this interface is simple: - -void pm_qos_add_request(handle, param_class, target_value): -Will insert an element into the list for that identified PM QoS class with the -target value. Upon change to this list the new target is recomputed and any -registered notifiers are called only if the target value is now different. -Clients of pm_qos need to save the returned handle for future use in other -pm_qos API functions. - -void pm_qos_update_request(handle, new_target_value): -Will update the list element pointed to by the handle with the new target value -and recompute the new aggregated target, calling the notification tree if the -target is changed. - -void pm_qos_remove_request(handle): -Will remove the element. After removal it will update the aggregate target and -call the notification tree if the target was changed as a result of removing -the request. - -int pm_qos_request(param_class): -Returns the aggregated value for a given PM QoS class. - -int pm_qos_request_active(handle): -Returns if the request is still active, i.e. it has not been removed from a -PM QoS class constraints list. - -int pm_qos_add_notifier(param_class, notifier): -Adds a notification callback function to the PM QoS class. The callback is -called when the aggregated value for the PM QoS class is changed. - -int pm_qos_remove_notifier(int param_class, notifier): -Removes the notification callback function for the PM QoS class. - - -From user mode: -Only processes can register a pm_qos request. To provide for automatic -cleanup of a process, the interface requires the process to register its -parameter requests in the following way: - -To register the default pm_qos target for the specific parameter, the process -must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] - -As long as the device node is held open that process has a registered -request on the parameter. - -To change the requested target value the process needs to write an s32 value to -the open device node. Alternatively the user mode program could write a hex -string for the value using 10 char long format e.g. "0x12345678". This -translates to a pm_qos_update_request call. - -To remove the user mode request for a target value simply close the device -node. - - -2. 
PM QoS per-device latency and flags framework - -For each device, there are three lists of PM QoS requests. Two of them are -maintained along with the aggregated targets of resume latency and active -state latency tolerance (in microseconds) and the third one is for PM QoS flags. -Values are updated in response to changes of the request list. - -The target values of resume latency and active state latency tolerance are -simply the minimum of the request values held in the parameter list elements. -The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements' -values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF. - -Note: The aggregated target values are implemented in such a way that reading -the aggregated value does not require any locking mechanism. - - -From kernel mode the use of this interface is the following: - -int dev_pm_qos_add_request(device, handle, type, value): -Will insert an element into the list for that identified device with the -target value. Upon change to this list the new target is recomputed and any -registered notifiers are called only if the target value is now different. -Clients of dev_pm_qos need to save the handle for future use in other -dev_pm_qos API functions. - -int dev_pm_qos_update_request(handle, new_value): -Will update the list element pointed to by the handle with the new target value -and recompute the new aggregated target, calling the notification trees if the -target is changed. - -int dev_pm_qos_remove_request(handle): -Will remove the element. After removal it will update the aggregate target and -call the notification trees if the target was changed as a result of removing -the request. - -s32 dev_pm_qos_read_value(device): -Returns the aggregated value for a given device's constraints list. - -enum pm_qos_flags_status dev_pm_qos_flags(device, mask) -Check PM QoS flags of the given device against the given mask of flags. -The meaning of the return values is as follows: - PM_QOS_FLAGS_ALL: All flags from the mask are set - PM_QOS_FLAGS_SOME: Some flags from the mask are set - PM_QOS_FLAGS_NONE: No flags from the mask are set - PM_QOS_FLAGS_UNDEFINED: The device's PM QoS structure has not been - initialized or the list of requests is empty. - -int dev_pm_qos_add_ancestor_request(dev, handle, type, value) -Add a PM QoS request for the first direct ancestor of the given device whose -power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests) -or whose power.set_latency_tolerance callback pointer is not NULL (for -DEV_PM_QOS_LATENCY_TOLERANCE requests). - -int dev_pm_qos_expose_latency_limit(device, value) -Add a request to the device's PM QoS list of resume latency constraints and -create a sysfs attribute pm_qos_resume_latency_us under the device's power -directory allowing user space to manipulate that request. - -void dev_pm_qos_hide_latency_limit(device) -Drop the request added by dev_pm_qos_expose_latency_limit() from the device's -PM QoS list of resume latency constraints and remove sysfs attribute -pm_qos_resume_latency_us from the device's power directory. - -int dev_pm_qos_expose_flags(device, value) -Add a request to the device's PM QoS list of flags and create sysfs attribute -pm_qos_no_power_off under the device's power directory allowing user space to -change the value of the PM_QOS_FLAG_NO_POWER_OFF flag. 
-
-void dev_pm_qos_hide_flags(device)
-Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
-of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
-directory.
-
-Notification mechanisms:
-The per-device PM QoS framework has a per-device notification tree.
-
-int dev_pm_qos_add_notifier(device, notifier):
-Adds a notification callback function for the device.
-The callback is called when the aggregated value of the device constraints list
-is changed (for resume latency device PM QoS only).
-
-int dev_pm_qos_remove_notifier(device, notifier):
-Removes the notification callback function for the device.
-
-
-Active state latency tolerance
-
-This device PM QoS type is used to support systems in which hardware may switch
-to energy-saving operation modes on the fly. In those systems, if the operation
-mode chosen by the hardware attempts to save energy in an overly aggressive way,
-it may cause excess latencies to be visible to software, causing it to miss
-certain protocol requirements or target frame or sample rates etc.
-
-If there is a latency tolerance control mechanism for a given device available
-to software, the .set_latency_tolerance callback in that device's dev_pm_info
-structure should be populated. The routine pointed to by it is should implement
-whatever is necessary to transfer the effective requirement value to the
-hardware.
-
-Whenever the effective latency tolerance changes for the device, its
-.set_latency_tolerance() callback will be executed and the effective value will
-be passed to it. If that value is negative, which means that the list of
-latency tolerance requirements for the device is empty, the callback is expected
-to switch the underlying hardware latency tolerance control mechanism to an
-autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and
-the hardware supports a special "no requirement" setting, the callback is
-expected to use it. That allows software to prevent the hardware from
-automatically updating the device's latency tolerance in response to its power
-state changes (e.g. during transitions from D3cold to D0), which generally may
-be done in the autonomous latency tolerance control mode.
-
-If .set_latency_tolerance() is present for the device, sysfs attribute
-pm_qos_latency_tolerance_us will be present in the devivce's power directory.
-Then, user space can use that attribute to specify its latency tolerance
-requirement for the device, if any. Writing "any" to it means "no requirement,
-but do not let the hardware control latency tolerance" and writing "auto" to it
-allows the hardware to be switched to the autonomous mode if there are no other
-requirements from the kernel side in the device's list.
-
-Kernel code can use the functions described above along with the
-DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
-latency tolerance requirements for devices.
diff --git a/Documentation/power/power_supply_class.rst b/Documentation/power/power_supply_class.rst
new file mode 100644
index 000000000000..3f2c3fe38a61
--- /dev/null
+++ b/Documentation/power/power_supply_class.rst
@@ -0,0 +1,282 @@
+========================
+Linux power supply class
+========================
+
+Synopsis
+~~~~~~~~
+The power supply class is used to represent battery, UPS, AC or DC power
+supply properties to user-space.
+
+It defines a core set of attributes, which should be applicable to (almost)
+every power supply out there.
Attributes are available via sysfs and uevent
+interfaces.
+
+Each attribute has a well defined meaning, down to the unit of measure
+used. While the attributes provided are believed to be universally
+applicable to any power supply, specific monitoring hardware may not be
+able to provide them all, so any of them may be skipped.
+
+The power supply class is extensible and allows drivers to define their own
+attributes. The core attribute set is subject to the standard Linux
+evolution (i.e. if some attribute turns out to be applicable to many power
+supply types or their drivers, it can be added to the core set).
+
+It also integrates with the LED framework, for the purpose of providing
+typically expected feedback of battery charging/fully charged status and
+AC/USB power supply online status. (Note that specific details of the
+indication (including whether to use it at all) are fully controllable by
+the user and/or specific machine defaults, per the design principles of the
+LED framework).
+
+
+Attributes/properties
+~~~~~~~~~~~~~~~~~~~~~
+The power supply class has a predefined set of attributes; this eliminates
+code duplication across drivers. The class insists on reusing its
+predefined attributes *and* their units.
+
+So, userspace gets a predictable set of attributes and their units for any
+kind of power supply, and can process/present them to a user in a
+consistent manner. Results for different power supplies and machines are
+also directly comparable.
+
+See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
+for examples of how to declare and handle attributes.
+
+
+Units
+~~~~~
+Quoting include/linux/power_supply.h:
+
+  All voltages, currents, charges, energies, time and temperatures in µV,
+  µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
+  stated. It's driver's job to convert its raw values to units in which
+  this class operates.
+
+
+Attributes/properties detailed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++--------------------------------------------------------------------------+
+| **Charge/Energy/Capacity - how to not confuse**                          |
++--------------------------------------------------------------------------+
+| **Because both "charge" (µAh) and "energy" (µWh) represent the "capacity"|
+| of a battery, this class distinguishes these terms. Don't mix them!**    |
+|                                                                          |
+| - `CHARGE_*`                                                             |
+|   attributes represent capacity in µAh only.                             |
+| - `ENERGY_*`                                                             |
+|   attributes represent capacity in µWh only.                             |
+| - `CAPACITY`                                                             |
+|   attribute represents capacity in *percent*, from 0 to 100.             |
++--------------------------------------------------------------------------+
+
+Postfixes:
+
+_AVG
+  *hardware* averaged value, use it if your hardware is really able to
+  report averaged values.
+_NOW
+  momentary/instantaneous values.
+
+STATUS
+  this attribute represents the operating status (charging, full,
+  discharging (i.e. powering a load), etc.). This corresponds to
+  `BATTERY_STATUS_*` values, as defined in battery.h.
+
+CHARGE_TYPE
+  batteries can typically charge at different rates.
+  This defines trickle and fast charges. For batteries that
+  are already charged or discharging, 'n/a' can be displayed (or
+  'unknown', if the status is not known).
+
+AUTHENTIC
+  indicates whether the power supply (battery or charger) connected
+  to the platform is authentic (1) or non-authentic (0).
+
+HEALTH
+  represents the health of the battery; values correspond to
+  POWER_SUPPLY_HEALTH_*, defined in battery.h.
+
+VOLTAGE_OCV
+  open circuit voltage of the battery.
+
+VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN
+  design values for maximal and minimal power supply voltages.
+  Maximal/minimal means the voltage values when the battery is considered
+  "full"/"empty" under normal conditions. Yes, there is no direct relation
+  between voltage and battery capacity, but some dumb batteries use voltage
+  for a very approximate calculation of capacity. A battery driver can also
+  use this attribute just to inform userspace about the maximal and minimal
+  voltage thresholds of a given battery.
+
+VOLTAGE_MAX, VOLTAGE_MIN
+  same as _DESIGN voltage values except that these ones should be used
+  if the hardware can only guess (measure and retain) the thresholds of a
+  given power supply.
+
+VOLTAGE_BOOT
+  Reports the voltage measured during boot.
+
+CURRENT_BOOT
+  Reports the current measured during boot.
+
+CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN
+  design charge values, when the battery is considered full/empty.
+
+ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN
+  same as above but for energy.
+
+CHARGE_FULL, CHARGE_EMPTY
+  These attributes mean "last remembered value of charge when the battery
+  became full/empty". It also could mean "value of charge when the battery
+  is considered full/empty under given conditions (temperature, age)".
+  I.e. these attributes represent real thresholds, not design values.
+
+ENERGY_FULL, ENERGY_EMPTY
+  same as above but for energy.
+
+CHARGE_COUNTER
+  the current charge counter (in µAh). This could easily
+  be negative; there is no empty or full value. It is only useful for
+  relative, time-based measurements.
+
+PRECHARGE_CURRENT
+  the maximum charge current during the precharge phase of the charge
+  cycle (typically 20% of battery capacity).
+
+CHARGE_TERM_CURRENT
+  Charge termination current. The charge cycle terminates when the battery
+  voltage is above the recharge threshold, and the charge current is below
+  this setting (typically 10% of battery capacity).
+
+CONSTANT_CHARGE_CURRENT
+  constant charge current programmed by the charger.
+CONSTANT_CHARGE_CURRENT_MAX
+  maximum charge current supported by the power supply object.
+
+CONSTANT_CHARGE_VOLTAGE
+  constant charge voltage programmed by the charger.
+CONSTANT_CHARGE_VOLTAGE_MAX
+  maximum charge voltage supported by the power supply object.
+
+INPUT_CURRENT_LIMIT
+  input current limit programmed by the charger. Indicates
+  the current drawn from a charging source.
+
+CHARGE_CONTROL_LIMIT
+  current charge control limit setting.
+CHARGE_CONTROL_LIMIT_MAX
+  maximum charge control limit setting.
+
+CALIBRATE
+  battery or coulomb counter calibration status.
+
+CAPACITY
+  capacity in percent.
+CAPACITY_ALERT_MIN
+  minimum capacity alert value in percent.
+CAPACITY_ALERT_MAX
+  maximum capacity alert value in percent.
+CAPACITY_LEVEL
+  capacity level. This corresponds to POWER_SUPPLY_CAPACITY_LEVEL_*.
+
+TEMP
+  temperature of the power supply.
+TEMP_ALERT_MIN
+  minimum battery temperature alert.
+TEMP_ALERT_MAX
+  maximum battery temperature alert.
+TEMP_AMBIENT
+  ambient temperature.
+TEMP_AMBIENT_ALERT_MIN
+  minimum ambient temperature alert.
+TEMP_AMBIENT_ALERT_MAX
+  maximum ambient temperature alert.
+TEMP_MIN
+  minimum operable temperature.
+TEMP_MAX
+  maximum operable temperature.
+
+TIME_TO_EMPTY
+  seconds left for battery to be considered empty
+  (i.e. while battery powers a load)
+TIME_TO_FULL
+  seconds left for battery to be considered full
+  (i.e.
while battery is charging)
+
+
+Battery <-> external power supply interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Power supplies often act as supplies and supplicants at the same time.
+Batteries are a good example. So, batteries usually care if they're
+externally powered or not.
+
+For that case, the power supply class implements a notification mechanism
+for batteries.
+
+An external power supply (AC) lists its supplicants' (batteries') names in
+the "supplied_to" struct member, and each power_supply_changed() call
+issued by the external power supply will notify supplicants via the
+external_power_changed callback.
+
+
+Devicetree battery characteristics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Drivers should call power_supply_get_battery_info() to obtain battery
+characteristics from a devicetree battery node, defined in
+Documentation/devicetree/bindings/power/supply/battery.txt. This is
+implemented in drivers/power/supply/bq27xxx_battery.c.
+
+Properties in struct power_supply_battery_info and their counterparts in the
+battery node have names corresponding to elements in enum power_supply_property,
+for naming consistency between sysfs attributes and battery node properties.
+
+
+QA
+~~
+
+Q:
+  Where is the POWER_SUPPLY_PROP_XYZ attribute?
+A:
+  If you cannot find an attribute suitable for your driver needs, feel free
+  to add it and send a patch along with your driver.
+
+  The attributes available currently are the ones currently provided by the
+  drivers written.
+
+  Good candidates to add in future: model/part#, cycle_time, manufacturer,
+  etc.
+
+
+Q:
+  I have some very specific attribute (e.g. battery color), should I add
+  this attribute to the standard ones?
+A:
+  Most likely, no. Such an attribute can be placed in the driver itself, if
+  it is useful. Of course, if the attribute in question is applicable to a
+  large set of batteries, provided by many drivers, and/or comes from
+  some general battery specification/standard, it may be a candidate to
+  be added to the core attribute set.
+
+
+Q:
+  Suppose my battery monitoring chip/firmware does not provide capacity
+  in percent, but provides charge_{now,full,empty}. Should I calculate
+  percentage capacity manually, inside the driver, and register a CAPACITY
+  attribute? The same question applies to time_to_empty/time_to_full.
+A:
+  Most likely, no. This class is designed to export properties which are
+  directly measurable by the specific hardware available.
+
+  Inferring unavailable properties using some heuristics or a mathematical
+  model is not a job for a battery driver. Such functionality
+  should be factored out, and in fact, apm_power, the driver to serve
+  the legacy APM API on top of the power supply class, uses a simple heuristic
+  of approximating remaining battery capacity based on its charge, current,
+  voltage and so on. But a full-fledged battery model is likely not a subject
+  for the kernel at all, as it would require floating point calculations to
+  deal with things like differential equations and Kalman filters. This is
+  better handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
deleted file mode 100644
index 300d37896e51..000000000000
--- a/Documentation/power/power_supply_class.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Linux power supply class
-========================
-
-Synopsis
-~~~~~~~~
-Power supply class used to represent battery, UPS, AC or DC power supply
-properties to user-space.
- -It defines core set of attributes, which should be applicable to (almost) -every power supply out there. Attributes are available via sysfs and uevent -interfaces. - -Each attribute has well defined meaning, up to unit of measure used. While -the attributes provided are believed to be universally applicable to any -power supply, specific monitoring hardware may not be able to provide them -all, so any of them may be skipped. - -Power supply class is extensible, and allows to define drivers own attributes. -The core attribute set is subject to the standard Linux evolution (i.e. -if it will be found that some attribute is applicable to many power supply -types or their drivers, it can be added to the core set). - -It also integrates with LED framework, for the purpose of providing -typically expected feedback of battery charging/fully charged status and -AC/USB power supply online status. (Note that specific details of the -indication (including whether to use it at all) are fully controllable by -user and/or specific machine defaults, per design principles of LED -framework). - - -Attributes/properties -~~~~~~~~~~~~~~~~~~~~~ -Power supply class has predefined set of attributes, this eliminates code -duplication across drivers. Power supply class insist on reusing its -predefined attributes *and* their units. - -So, userspace gets predictable set of attributes and their units for any -kind of power supply, and can process/present them to a user in consistent -manner. Results for different power supplies and machines are also directly -comparable. - -See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c -for the example how to declare and handle attributes. - - -Units -~~~~~ -Quoting include/linux/power_supply.h: - - All voltages, currents, charges, energies, time and temperatures in µV, - µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise - stated. It's driver's job to convert its raw values to units in which - this class operates. - - -Attributes/properties detailed -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -~ ~ ~ ~ ~ ~ ~ Charge/Energy/Capacity - how to not confuse ~ ~ ~ ~ ~ ~ ~ -~ ~ -~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity" ~ -~ of battery, this class distinguish these terms. Don't mix them! ~ -~ ~ -~ CHARGE_* attributes represents capacity in µAh only. ~ -~ ENERGY_* attributes represents capacity in µWh only. ~ -~ CAPACITY attribute represents capacity in *percents*, from 0 to 100. ~ -~ ~ -~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ - -Postfixes: -_AVG - *hardware* averaged value, use it if your hardware is really able to -report averaged values. -_NOW - momentary/instantaneous values. - -STATUS - this attribute represents operating status (charging, full, -discharging (i.e. powering a load), etc.). This corresponds to -BATTERY_STATUS_* values, as defined in battery.h. - -CHARGE_TYPE - batteries can typically charge at different rates. -This defines trickle and fast charges. For batteries that -are already charged or discharging, 'n/a' can be displayed (or -'unknown', if the status is not known). - -AUTHENTIC - indicates the power supply (battery or charger) connected -to the platform is authentic(1) or non authentic(0). - -HEALTH - represents health of the battery, values corresponds to -POWER_SUPPLY_HEALTH_*, defined in battery.h. - -VOLTAGE_OCV - open circuit voltage of the battery. - -VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and -minimal power supply voltages. 
Maximal/minimal means values of voltages -when battery considered "full"/"empty" at normal conditions. Yes, there is -no direct relation between voltage and battery capacity, but some dumb -batteries use voltage for very approximated calculation of capacity. -Battery driver also can use this attribute just to inform userspace -about maximal and minimal voltage thresholds of a given battery. - -VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that -these ones should be used if hardware could only guess (measure and -retain) the thresholds of a given power supply. - -VOLTAGE_BOOT - Reports the voltage measured during boot - -CURRENT_BOOT - Reports the current measured during boot - -CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when -battery considered full/empty. - -ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy. - -CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value -of charge when battery became full/empty". It also could mean "value of -charge when battery considered full/empty at given conditions (temperature, -age)". I.e. these attributes represents real thresholds, not design values. - -ENERGY_FULL, ENERGY_EMPTY - same as above but for energy. - -CHARGE_COUNTER - the current charge counter (in µAh). This could easily -be negative; there is no empty or full value. It is only useful for -relative, time-based measurements. - -PRECHARGE_CURRENT - the maximum charge current during precharge phase -of charge cycle (typically 20% of battery capacity). -CHARGE_TERM_CURRENT - Charge termination current. The charge cycle -terminates when battery voltage is above recharge threshold, and charge -current is below this setting (typically 10% of battery capacity). - -CONSTANT_CHARGE_CURRENT - constant charge current programmed by charger. -CONSTANT_CHARGE_CURRENT_MAX - maximum charge current supported by the -power supply object. - -CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger. -CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the -power supply object. - -INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates -the current drawn from a charging source. - -CHARGE_CONTROL_LIMIT - current charge control limit setting -CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting - -CALIBRATE - battery or coulomb counter calibration status - -CAPACITY - capacity in percents. -CAPACITY_ALERT_MIN - minimum capacity alert value in percents. -CAPACITY_ALERT_MAX - maximum capacity alert value in percents. -CAPACITY_LEVEL - capacity level. This corresponds to -POWER_SUPPLY_CAPACITY_LEVEL_*. - -TEMP - temperature of the power supply. -TEMP_ALERT_MIN - minimum battery temperature alert. -TEMP_ALERT_MAX - maximum battery temperature alert. -TEMP_AMBIENT - ambient temperature. -TEMP_AMBIENT_ALERT_MIN - minimum ambient temperature alert. -TEMP_AMBIENT_ALERT_MAX - maximum ambient temperature alert. -TEMP_MIN - minimum operatable temperature -TEMP_MAX - maximum operatable temperature - -TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e. -while battery powers a load) -TIME_TO_FULL - seconds left for battery to be considered full (i.e. -while battery is charging) - - -Battery <-> external power supply interaction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Often power supplies are acting as supplies and supplicants at the same -time. Batteries are good example. So, batteries usually care if they're -externally powered or not. 
- -For that case, power supply class implements notification mechanism for -batteries. - -External power supply (AC) lists supplicants (batteries) names in -"supplied_to" struct member, and each power_supply_changed() call -issued by external power supply will notify supplicants via -external_power_changed callback. - - -Devicetree battery characteristics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Drivers should call power_supply_get_battery_info() to obtain battery -characteristics from a devicetree battery node, defined in -Documentation/devicetree/bindings/power/supply/battery.txt. This is -implemented in drivers/power/supply/bq27xxx_battery.c. - -Properties in struct power_supply_battery_info and their counterparts in the -battery node have names corresponding to elements in enum power_supply_property, -for naming consistency between sysfs attributes and battery node properties. - - -QA -~~ -Q: Where is POWER_SUPPLY_PROP_XYZ attribute? -A: If you cannot find attribute suitable for your driver needs, feel free - to add it and send patch along with your driver. - - The attributes available currently are the ones currently provided by the - drivers written. - - Good candidates to add in future: model/part#, cycle_time, manufacturer, - etc. - - -Q: I have some very specific attribute (e.g. battery color), should I add - this attribute to standard ones? -A: Most likely, no. Such attribute can be placed in the driver itself, if - it is useful. Of course, if the attribute in question applicable to - large set of batteries, provided by many drivers, and/or comes from - some general battery specification/standard, it may be a candidate to - be added to the core attribute set. - - -Q: Suppose, my battery monitoring chip/firmware does not provides capacity - in percents, but provides charge_{now,full,empty}. Should I calculate - percentage capacity manually, inside the driver, and register CAPACITY - attribute? The same question about time_to_empty/time_to_full. -A: Most likely, no. This class is designed to export properties which are - directly measurable by the specific hardware available. - - Inferring not available properties using some heuristics or mathematical - model is not subject of work for a battery driver. Such functionality - should be factored out, and in fact, apm_power, the driver to serve - legacy APM API on top of power supply class, uses a simple heuristic of - approximating remaining battery capacity based on its charge, current, - voltage and so on. But full-fledged battery model is likely not subject - for kernel at all, as it would require floating point calculation to deal - with things like differential equations and Kalman filters. This is - better be handled by batteryd/libbattery, yet to be written. diff --git a/Documentation/power/powercap/powercap.rst b/Documentation/power/powercap/powercap.rst new file mode 100644 index 000000000000..7ae3b44c7624 --- /dev/null +++ b/Documentation/power/powercap/powercap.rst @@ -0,0 +1,257 @@ +======================= +Power Capping Framework +======================= + +The power capping framework provides a consistent interface between the kernel +and the user space that allows power capping drivers to expose the settings to +user space in a uniform way. + +Terminology +=========== + +The framework exposes power capping devices to user space via sysfs in the +form of a tree of objects. The objects at the root level of the tree represent +'control types', which correspond to different methods of power capping. 
For +example, the intel-rapl control type represents the Intel "Running Average +Power Limit" (RAPL) technology, whereas the 'idle-injection' control type +corresponds to the use of idle injection for controlling power. + +Power zones represent different parts of the system, which can be controlled and +monitored using the power capping method determined by the control type the +given zone belongs to. They each contain attributes for monitoring power, as +well as controls represented in the form of power constraints. If the parts of +the system represented by different power zones are hierarchical (that is, one +bigger part consists of multiple smaller parts that each have their own power +controls), those power zones may also be organized in a hierarchy with one +parent power zone containing multiple subzones and so on to reflect the power +control topology of the system. In that case, it is possible to apply power +capping to a set of devices together using the parent power zone and if more +fine grained control is required, it can be applied through the subzones. + + +Example sysfs interface tree:: + + /sys/devices/virtual/powercap + └──intel-rapl + ├──intel-rapl:0 + │   ├──constraint_0_name + │   ├──constraint_0_power_limit_uw + │   ├──constraint_0_time_window_us + │   ├──constraint_1_name + │   ├──constraint_1_power_limit_uw + │   ├──constraint_1_time_window_us + │   ├──device -> ../../intel-rapl + │   ├──energy_uj + │   ├──intel-rapl:0:0 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:0 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──intel-rapl:0:1 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:0 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──max_energy_range_uj + │   ├──max_power_range_uw + │   ├──name + │   ├──enabled + │   ├──power + │   │   ├──async + │   │   [] + │   ├──subsystem -> ../../../../../class/power_cap + │   ├──enabled + │   ├──uevent + ├──intel-rapl:1 + │   ├──constraint_0_name + │   ├──constraint_0_power_limit_uw + │   ├──constraint_0_time_window_us + │   ├──constraint_1_name + │   ├──constraint_1_power_limit_uw + │   ├──constraint_1_time_window_us + │   ├──device -> ../../intel-rapl + │   ├──energy_uj + │   ├──intel-rapl:1:0 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:1 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──intel-rapl:1:1 + │   │   
├──constraint_0_name
+ │   │   ├──constraint_0_power_limit_uw
+ │   │   ├──constraint_0_time_window_us
+ │   │   ├──constraint_1_name
+ │   │   ├──constraint_1_power_limit_uw
+ │   │   ├──constraint_1_time_window_us
+ │   │   ├──device -> ../../intel-rapl:1
+ │   │   ├──energy_uj
+ │   │   ├──max_energy_range_uj
+ │   │   ├──name
+ │   │   ├──enabled
+ │   │   ├──power
+ │   │   │   ├──async
+ │   │   │   []
+ │   │   ├──subsystem -> ../../../../../../class/power_cap
+ │   │   └──uevent
+ │   ├──max_energy_range_uj
+ │   ├──max_power_range_uw
+ │   ├──name
+ │   ├──enabled
+ │   ├──power
+ │   │   ├──async
+ │   │   []
+ │   ├──subsystem -> ../../../../../class/power_cap
+ │   ├──uevent
+ ├──power
+ │   ├──async
+ │   []
+ ├──subsystem -> ../../../../class/power_cap
+ ├──enabled
+ └──uevent
+
+The above example illustrates a case in which the Intel RAPL technology,
+available in Intel® 64 and IA-32 Processor Architectures, is used. There is one
+control type called intel-rapl which contains two power zones, intel-rapl:0 and
+intel-rapl:1, representing CPU packages. Each of these power zones contains
+two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
+"core" and the "uncore" parts of the given CPU package, respectively. All of
+the zones and subzones contain energy monitoring attributes (energy_uj,
+max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
+to be applied (the constraints in the 'package' power zones apply to the whole
+CPU packages and the subzone constraints only apply to the respective parts of
+the given package individually). Since Intel RAPL doesn't provide instantaneous
+power value, there is no power_uw attribute.
+
+In addition to that, each power zone contains a name attribute, allowing the
+part of the system represented by that zone to be identified.
+For example::
+
+  cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
+  package-0
+
+The Intel RAPL technology allows two constraints, short term and long term,
+with two different time windows to be applied to each power zone. Thus for
+each zone there are 2 attributes representing the constraint names, 2 power
+limits and 2 attributes representing the sizes of the time windows. The
+constraint_j_* attributes correspond to the j-th constraint (j = 0, 1).
+
+For example::
+
+  constraint_0_name
+  constraint_0_power_limit_uw
+  constraint_0_time_window_us
+  constraint_1_name
+  constraint_1_power_limit_uw
+  constraint_1_time_window_us
+
+Power Zone Attributes
+=====================
+
+Monitoring attributes
+---------------------
+
+energy_uj (rw)
+  Current energy counter in micro joules. Write "0" to reset.
+  If the counter cannot be reset, then this attribute is read only.
+
+max_energy_range_uj (ro)
+  Range of the above energy counter in micro-joules.
+
+power_uw (ro)
+  Current power in micro watts.
+
+max_power_range_uw (ro)
+  Range of the above power value in micro-watts.
+
+name (ro)
+  Name of this power zone.
+
+It is possible that some domains have both power ranges and energy counter ranges;
+however, only one is mandatory.
+
+Constraints
+-----------
+
+constraint_X_power_limit_uw (rw)
+  Power limit in micro watts, which should be applicable for the
+  time window specified by "constraint_X_time_window_us".
+
+constraint_X_time_window_us (rw)
+  Time window in micro seconds.
+
+constraint_X_name (ro)
+  An optional name of the constraint.
+
+constraint_X_max_power_uw (ro)
+  Maximum allowed power in micro watts.
+
+constraint_X_min_power_uw (ro)
+  Minimum allowed power in micro watts.
+
+constraint_X_max_time_window_us (ro)
+  Maximum allowed time window in micro seconds.
+
+constraint_X_min_time_window_us (ro)
+  Minimum allowed time window in micro seconds.
+
+Except for power_limit_uw and time_window_us, the other fields are optional.
+
+Common zone and control type attributes
+---------------------------------------
+
+enabled (rw): Enable/Disable controls at zone level or for all zones using
+a control type.
+
+Power Cap Client Driver Interface
+=================================
+
+The API summary:
+
+Call powercap_register_control_type() to register a control type object.
+Call powercap_register_zone() to register a power zone (under a given
+control type), either as a top-level power zone or as a subzone of another
+power zone registered earlier.
+The number of constraints in a power zone and the corresponding callbacks have
+to be defined prior to calling powercap_register_zone() to register that zone.
+
+To free a power zone, call powercap_unregister_zone().
+To free a control type object, call powercap_unregister_control_type().
+The detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/powercap/powercap.txt b/Documentation/power/powercap/powercap.txt
deleted file mode 100644
index 1e6ef164e07a..000000000000
--- a/Documentation/power/powercap/powercap.txt
+++ /dev/null
@@ -1,236 +0,0 @@
-Power Capping Framework
-==================================
-
-The power capping framework provides a consistent interface between the kernel
-and the user space that allows power capping drivers to expose the settings to
-user space in a uniform way.
-
-Terminology
-=========================
-The framework exposes power capping devices to user space via sysfs in the
-form of a tree of objects. The objects at the root level of the tree represent
-'control types', which correspond to different methods of power capping. For
-example, the intel-rapl control type represents the Intel "Running Average
-Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
-corresponds to the use of idle injection for controlling power.
-
-Power zones represent different parts of the system, which can be controlled and
-monitored using the power capping method determined by the control type the
-given zone belongs to. They each contain attributes for monitoring power, as
-well as controls represented in the form of power constraints. If the parts of
-the system represented by different power zones are hierarchical (that is, one
-bigger part consists of multiple smaller parts that each have their own power
-controls), those power zones may also be organized in a hierarchy with one
-parent power zone containing multiple subzones and so on to reflect the power
-control topology of the system. In that case, it is possible to apply power
-capping to a set of devices together using the parent power zone and if more
-fine grained control is required, it can be applied through the subzones.
-
-
-Example sysfs interface tree:
-
-/sys/devices/virtual/powercap
-??? intel-rapl
- ??? intel-rapl:0
- ?   ??? constraint_0_name
- ?   ??? constraint_0_power_limit_uw
- ?   ??? constraint_0_time_window_us
- ?   ??? constraint_1_name
- ?   ??? constraint_1_power_limit_uw
- ?   ??? constraint_1_time_window_us
- ?   ??? device -> ../../intel-rapl
- ?   ??? energy_uj
- ?   ??? intel-rapl:0:0
- ?   ?   ??? constraint_0_name
- ?   ?   ??? constraint_0_power_limit_uw
- ?   ?   ??? 
constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:0 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? intel-rapl:0:1 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:0 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? max_energy_range_uj - ?   ??? max_power_range_uw - ?   ??? name - ?   ??? enabled - ?   ??? power - ?   ?   ??? async - ?   ?   [] - ?   ??? subsystem -> ../../../../../class/power_cap - ?   ??? enabled - ?   ??? uevent - ??? intel-rapl:1 - ?   ??? constraint_0_name - ?   ??? constraint_0_power_limit_uw - ?   ??? constraint_0_time_window_us - ?   ??? constraint_1_name - ?   ??? constraint_1_power_limit_uw - ?   ??? constraint_1_time_window_us - ?   ??? device -> ../../intel-rapl - ?   ??? energy_uj - ?   ??? intel-rapl:1:0 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:1 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? intel-rapl:1:1 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:1 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? max_energy_range_uj - ?   ??? max_power_range_uw - ?   ??? name - ?   ??? enabled - ?   ??? power - ?   ?   ??? async - ?   ?   [] - ?   ??? subsystem -> ../../../../../class/power_cap - ?   ??? uevent - ??? power - ?   ??? async - ?   [] - ??? subsystem -> ../../../../class/power_cap - ??? enabled - ??? uevent - -The above example illustrates a case in which the Intel RAPL technology, -available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one -control type called intel-rapl which contains two power zones, intel-rapl:0 and -intel-rapl:1, representing CPU packages. Each of these power zones contains -two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the -"core" and the "uncore" parts of the given CPU package, respectively. 
All of -the zones and subzones contain energy monitoring attributes (energy_uj, -max_energy_range_uj) and constraint attributes (constraint_*) allowing controls -to be applied (the constraints in the 'package' power zones apply to the whole -CPU packages and the subzone constraints only apply to the respective parts of -the given package individually). Since Intel RAPL doesn't provide instantaneous -power value, there is no power_uw attribute. - -In addition to that, each power zone contains a name attribute, allowing the -part of the system represented by that zone to be identified. -For example: - -cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name -package-0 - -The Intel RAPL technology allows two constraints, short term and long term, -with two different time windows to be applied to each power zone. Thus for -each zone there are 2 attributes representing the constraint names, 2 power -limits and 2 attributes representing the sizes of the time windows. Such that, -constraint_j_* attributes correspond to the jth constraint (j = 0,1). - -For example: - constraint_0_name - constraint_0_power_limit_uw - constraint_0_time_window_us - constraint_1_name - constraint_1_power_limit_uw - constraint_1_time_window_us - -Power Zone Attributes -================================= -Monitoring attributes ----------------------- - -energy_uj (rw): Current energy counter in micro joules. Write "0" to reset. -If the counter can not be reset, then this attribute is read only. - -max_energy_range_uj (ro): Range of the above energy counter in micro-joules. - -power_uw (ro): Current power in micro watts. - -max_power_range_uw (ro): Range of the above power value in micro-watts. - -name (ro): Name of this power zone. - -It is possible that some domains have both power ranges and energy counter ranges; -however, only one is mandatory. - -Constraints ----------------- -constraint_X_power_limit_uw (rw): Power limit in micro watts, which should be -applicable for the time window specified by "constraint_X_time_window_us". - -constraint_X_time_window_us (rw): Time window in micro seconds. - -constraint_X_name (ro): An optional name of the constraint - -constraint_X_max_power_uw(ro): Maximum allowed power in micro watts. - -constraint_X_min_power_uw(ro): Minimum allowed power in micro watts. - -constraint_X_max_time_window_us(ro): Maximum allowed time window in micro seconds. - -constraint_X_min_time_window_us(ro): Minimum allowed time window in micro seconds. - -Except power_limit_uw and time_window_us other fields are optional. - -Common zone and control type attributes ----------------------------------------- -enabled (rw): Enable/Disable controls at zone level or for all zones using -a control type. - -Power Cap Client Driver Interface -================================== -The API summary: - -Call powercap_register_control_type() to register control type object. -Call powercap_register_zone() to register a power zone (under a given -control type), either as a top-level power zone or as a subzone of another -power zone registered earlier. -The number of constraints in a power zone and the corresponding callbacks have -to be defined prior to calling powercap_register_zone() to register that zone. - -To Free a power zone call powercap_unregister_zone(). -To free a control type object call powercap_unregister_control_type(). -Detailed API can be generated using kernel-doc on include/linux/powercap.h. 
diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst
new file mode 100644
index 000000000000..0cd8cc1275a7
--- /dev/null
+++ b/Documentation/power/regulator/consumer.rst
@@ -0,0 +1,229 @@
+===================================
+Regulator Consumer Driver Interface
+===================================
+
+This text describes the regulator interface for consumer device drivers.
+Please see overview.txt for a description of the terms used in this text.
+
+
+1. Consumer Regulator Access (static & dynamic drivers)
+=======================================================
+
+A consumer driver can get access to its supply regulator by calling ::
+
+    regulator = regulator_get(dev, "Vcc");
+
+The consumer passes in its struct device pointer and power supply ID. The core
+then finds the correct regulator by consulting a machine specific lookup table.
+If the lookup is successful then this call will return a pointer to the struct
+regulator that supplies this consumer.
+
+To release the regulator the consumer driver should call ::
+
+    regulator_put(regulator);
+
+Consumers can be supplied by more than one regulator, e.g. a codec consumer
+with analog and digital supplies ::
+
+    digital = regulator_get(dev, "Vcc"); /* digital core */
+    analog = regulator_get(dev, "Avdd"); /* analog */
+
+The regulator access functions regulator_get() and regulator_put() will
+usually be called in your device driver's probe() and remove() respectively.
+
+
+2. Regulator Output Enable & Disable (static & dynamic drivers)
+===============================================================
+
+
+A consumer can enable its power supply by calling::
+
+    int regulator_enable(regulator);
+
+NOTE:
+  The supply may already be enabled before regulator_enable() is called.
+  This may happen if the consumer shares the regulator or the regulator has been
+  previously enabled by bootloader or kernel board initialization code.
+
+A consumer can determine if a regulator is enabled by calling::
+
+    int regulator_is_enabled(regulator);
+
+This will return > zero when the regulator is enabled.
+
+
+A consumer can disable its supply when no longer needed by calling::
+
+    int regulator_disable(regulator);
+
+NOTE:
+  This may not disable the supply if it's shared with other consumers. The
+  regulator will only be disabled when the enabled reference count is zero.
+
+Finally, a regulator can be forcefully disabled in the case of an emergency::
+
+    int regulator_force_disable(regulator);
+
+NOTE:
+  this will immediately and forcefully shut down the regulator output. All
+  consumers will be powered off.
+
+
+3. Regulator Voltage Control & Status (dynamic drivers)
+=======================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+voltage to match system operating points. e.g. CPUfreq drivers can scale
+voltage along with frequency to save power, SD drivers may need to select the
+correct card voltage, etc.
+
+Consumers can control their supply voltage by calling::
+
+    int regulator_set_voltage(regulator, min_uV, max_uV);
+
+Where min_uV and max_uV are the minimum and maximum acceptable voltages in
+microvolts.
+
+NOTE: this can be called when the regulator is enabled or disabled. If called
+when enabled, then the voltage changes instantly, otherwise the voltage
+configuration changes and the voltage is physically set when the regulator is
+next enabled.
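+
+As a rough sketch of how these calls combine in practice (the supply name
+"Vcc" and the voltage band below are illustrative assumptions, not taken
+from any particular board, and error handling is abbreviated)::
+
+    struct regulator *supply;
+    int ret;
+
+    supply = regulator_get(dev, "Vcc");
+    if (IS_ERR(supply))
+        return PTR_ERR(supply);
+
+    /* accept any supply voltage within a 1.2V +/- 5% band */
+    ret = regulator_set_voltage(supply, 1140000, 1260000);
+    if (ret == 0)
+        ret = regulator_enable(supply);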
+
+The regulator's configured voltage output can be found by calling::
+
+    int regulator_get_voltage(regulator);
+
+NOTE:
+  get_voltage() will return the configured output voltage whether the
+  regulator is enabled or disabled and should NOT be used to determine regulator
+  output state. However, this can be used in conjunction with is_enabled() to
+  determine the regulator physical output voltage.
+
+
+4. Regulator Current Limit Control & Status (dynamic drivers)
+=============================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+current limit to match system operating points. e.g. an LCD backlight driver
+can change the current limit to vary the backlight brightness, USB drivers may
+want to set the limit to 500mA when supplying power.
+
+Consumers can control their supply current limit by calling::
+
+    int regulator_set_current_limit(regulator, min_uA, max_uA);
+
+Where min_uA and max_uA are the minimum and maximum acceptable current limit in
+microamps.
+
+NOTE:
+  this can be called when the regulator is enabled or disabled. If called
+  when enabled, then the current limit changes instantly, otherwise the current
+  limit configuration changes and the current limit is physically set when the
+  regulator is next enabled.
+
+A regulator's current limit can be found by calling::
+
+    int regulator_get_current_limit(regulator);
+
+NOTE:
+  get_current_limit() will return the current limit whether the regulator
+  is enabled or disabled and should not be used to determine regulator current
+  load.
+
+
+5. Regulator Operating Mode Control & Status (dynamic drivers)
+==============================================================
+
+Some consumers can further save system power by changing the operating mode of
+their supply regulator to be more efficient when the consumer's operating state
+changes, e.g. the consumer driver is idle and subsequently draws less current.
+
+Regulator operating mode can be changed indirectly or directly.
+
+Indirect operating mode control.
+--------------------------------
+Consumer drivers can request a change in their supply regulator operating mode
+by calling::
+
+    int regulator_set_load(struct regulator *regulator, int load_uA);
+
+This will cause the core to recalculate the total load on the regulator (based
+on all its consumers) and change operating mode (if necessary and permitted)
+to best match the current operating load.
+
+The load_uA value can be determined from the consumer's datasheet. e.g. most
+datasheets have tables showing the maximum current consumed in certain
+situations.
+
+Most consumers will use indirect operating mode control since they have no
+knowledge of the regulator or whether the regulator is shared with other
+consumers.
+
+Direct operating mode control.
+------------------------------
+
+Bespoke or tightly coupled drivers may want to directly control regulator
+operating mode depending on their operating point. This can be achieved by
+calling::
+
+    int regulator_set_mode(struct regulator *regulator, unsigned int mode);
+    unsigned int regulator_get_mode(struct regulator *regulator);
+
+Direct mode will only be used by consumers that *know* about the regulator and
+are not sharing the regulator with other consumers.
+
+
+6. Regulator Events
+===================
+
+Regulators can notify consumers of external events. Events could be received by
+consumers under regulator stress or failure conditions.
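+
+As one illustrative sketch (the event checked and the message printed are
+assumptions made for the example, not behaviour required by the framework),
+a consumer might handle such events with a notifier callback like the one
+below, registered with the call shown next::
+
+    static int my_supply_event(struct notifier_block *nb,
+                               unsigned long event, void *data)
+    {
+        if (event & REGULATOR_EVENT_UNDER_VOLTAGE)
+            pr_warn("supply is dropping out of regulation\n");
+
+        return NOTIFY_OK;
+    }
+
+    static struct notifier_block my_supply_nb = {
+        .notifier_call = my_supply_event,
+    };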
+
+Consumers can register interest in regulator events by calling::
+
+    int regulator_register_notifier(struct regulator *regulator,
+                                    struct notifier_block *nb);
+
+Consumers can unregister interest by calling::
+
+    int regulator_unregister_notifier(struct regulator *regulator,
+                                      struct notifier_block *nb);
+
+Regulators use the kernel notifier framework to send events to their interested
+consumers.
+
+7. Regulator Direct Register Access
+===================================
+
+Some kinds of power management hardware or firmware are designed such that
+they need to do low-level hardware access to regulators, with no involvement
+from the kernel. Examples of such devices are:
+
+- clocksource with a voltage-controlled oscillator and control logic to change
+  the supply voltage over I2C to achieve a desired output clock rate
+- thermal management firmware that can issue an arbitrary I2C transaction to
+  perform system poweroff during overtemperature conditions
+
+To set up such a device/firmware, various parameters like the I2C address of
+the regulator, addresses of various regulator registers etc. need to be
+configured to it. The regulator framework provides the following helpers for
+querying these details.
+
+Bus-specific details, like I2C addresses or transfer rates, are handled by the
+regmap framework. To get the regulator's regmap (if supported), use::
+
+    struct regmap *regulator_get_regmap(struct regulator *regulator);
+
+To obtain the hardware register offset and bitmask for the regulator's voltage
+selector register, use::
+
+    int regulator_get_hardware_vsel_register(struct regulator *regulator,
+                                             unsigned *vsel_reg,
+                                             unsigned *vsel_mask);
+
+To convert a regulator framework voltage selector code (used by
+regulator_list_voltage) to a hardware-specific voltage selector that can be
+directly written to the voltage selector register, use::
+
+    int regulator_list_hardware_vsel(struct regulator *regulator,
+                                     unsigned selector);
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
deleted file mode 100644
index e51564c1a140..000000000000
--- a/Documentation/power/regulator/consumer.txt
+++ /dev/null
@@ -1,218 +0,0 @@
-Regulator Consumer Driver Interface
-===================================
-
-This text describes the regulator interface for consumer device drivers.
-Please see overview.txt for a description of the terms used in this text.
-
-
-1. Consumer Regulator Access (static & dynamic drivers)
-=======================================================
-
-A consumer driver can get access to its supply regulator by calling :-
-
-regulator = regulator_get(dev, "Vcc");
-
-The consumer passes in its struct device pointer and power supply ID. The core
-then finds the correct regulator by consulting a machine specific lookup table.
-If the lookup is successful then this call will return a pointer to the struct
-regulator that supplies this consumer.
-
-To release the regulator the consumer driver should call :-
-
-regulator_put(regulator);
-
-Consumers can be supplied by more than one regulator e.g. codec consumer with
-analog and digital supplies :-
-
-digital = regulator_get(dev, "Vcc"); /* digital core */
-analog = regulator_get(dev, "Avdd"); /* analog */
-
-The regulator access functions regulator_get() and regulator_put() will
-usually be called in your device drivers probe() and remove() respectively.
-
-
-2.
Regulator Output Enable & Disable (static & dynamic drivers) -==================================================================== - -A consumer can enable its power supply by calling:- - -int regulator_enable(regulator); - -NOTE: The supply may already be enabled before regulator_enabled() is called. -This may happen if the consumer shares the regulator or the regulator has been -previously enabled by bootloader or kernel board initialization code. - -A consumer can determine if a regulator is enabled by calling :- - -int regulator_is_enabled(regulator); - -This will return > zero when the regulator is enabled. - - -A consumer can disable its supply when no longer needed by calling :- - -int regulator_disable(regulator); - -NOTE: This may not disable the supply if it's shared with other consumers. The -regulator will only be disabled when the enabled reference count is zero. - -Finally, a regulator can be forcefully disabled in the case of an emergency :- - -int regulator_force_disable(regulator); - -NOTE: this will immediately and forcefully shutdown the regulator output. All -consumers will be powered off. - - -3. Regulator Voltage Control & Status (dynamic drivers) -====================================================== - -Some consumer drivers need to be able to dynamically change their supply -voltage to match system operating points. e.g. CPUfreq drivers can scale -voltage along with frequency to save power, SD drivers may need to select the -correct card voltage, etc. - -Consumers can control their supply voltage by calling :- - -int regulator_set_voltage(regulator, min_uV, max_uV); - -Where min_uV and max_uV are the minimum and maximum acceptable voltages in -microvolts. - -NOTE: this can be called when the regulator is enabled or disabled. If called -when enabled, then the voltage changes instantly, otherwise the voltage -configuration changes and the voltage is physically set when the regulator is -next enabled. - -The regulators configured voltage output can be found by calling :- - -int regulator_get_voltage(regulator); - -NOTE: get_voltage() will return the configured output voltage whether the -regulator is enabled or disabled and should NOT be used to determine regulator -output state. However this can be used in conjunction with is_enabled() to -determine the regulator physical output voltage. - - -4. Regulator Current Limit Control & Status (dynamic drivers) -=========================================================== - -Some consumer drivers need to be able to dynamically change their supply -current limit to match system operating points. e.g. LCD backlight driver can -change the current limit to vary the backlight brightness, USB drivers may want -to set the limit to 500mA when supplying power. - -Consumers can control their supply current limit by calling :- - -int regulator_set_current_limit(regulator, min_uA, max_uA); - -Where min_uA and max_uA are the minimum and maximum acceptable current limit in -microamps. - -NOTE: this can be called when the regulator is enabled or disabled. If called -when enabled, then the current limit changes instantly, otherwise the current -limit configuration changes and the current limit is physically set when the -regulator is next enabled. - -A regulators current limit can be found by calling :- - -int regulator_get_current_limit(regulator); - -NOTE: get_current_limit() will return the current limit whether the regulator -is enabled or disabled and should not be used to determine regulator current -load. - - -5. 
Regulator Operating Mode Control & Status (dynamic drivers) -============================================================= - -Some consumers can further save system power by changing the operating mode of -their supply regulator to be more efficient when the consumers operating state -changes. e.g. consumer driver is idle and subsequently draws less current - -Regulator operating mode can be changed indirectly or directly. - -Indirect operating mode control. --------------------------------- -Consumer drivers can request a change in their supply regulator operating mode -by calling :- - -int regulator_set_load(struct regulator *regulator, int load_uA); - -This will cause the core to recalculate the total load on the regulator (based -on all its consumers) and change operating mode (if necessary and permitted) -to best match the current operating load. - -The load_uA value can be determined from the consumer's datasheet. e.g. most -datasheets have tables showing the maximum current consumed in certain -situations. - -Most consumers will use indirect operating mode control since they have no -knowledge of the regulator or whether the regulator is shared with other -consumers. - -Direct operating mode control. ------------------------------- -Bespoke or tightly coupled drivers may want to directly control regulator -operating mode depending on their operating point. This can be achieved by -calling :- - -int regulator_set_mode(struct regulator *regulator, unsigned int mode); -unsigned int regulator_get_mode(struct regulator *regulator); - -Direct mode will only be used by consumers that *know* about the regulator and -are not sharing the regulator with other consumers. - - -6. Regulator Events -=================== -Regulators can notify consumers of external events. Events could be received by -consumers under regulator stress or failure conditions. - -Consumers can register interest in regulator events by calling :- - -int regulator_register_notifier(struct regulator *regulator, - struct notifier_block *nb); - -Consumers can unregister interest by calling :- - -int regulator_unregister_notifier(struct regulator *regulator, - struct notifier_block *nb); - -Regulators use the kernel notifier framework to send event to their interested -consumers. - -7. Regulator Direct Register Access -=================================== -Some kinds of power management hardware or firmware are designed such that -they need to do low-level hardware access to regulators, with no involvement -from the kernel. Examples of such devices are: - -- clocksource with a voltage-controlled oscillator and control logic to change - the supply voltage over I2C to achieve a desired output clock rate -- thermal management firmware that can issue an arbitrary I2C transaction to - perform system poweroff during overtemperature conditions - -To set up such a device/firmware, various parameters like I2C address of the -regulator, addresses of various regulator registers etc. need to be configured -to it. The regulator framework provides the following helpers for querying -these details. - -Bus-specific details, like I2C addresses or transfer rates are handled by the -regmap framework. 
To get the regulator's regmap (if supported), use :- - -struct regmap *regulator_get_regmap(struct regulator *regulator); - -To obtain the hardware register offset and bitmask for the regulator's voltage -selector register, use :- - -int regulator_get_hardware_vsel_register(struct regulator *regulator, - unsigned *vsel_reg, - unsigned *vsel_mask); - -To convert a regulator framework voltage selector code (used by -regulator_list_voltage) to a hardware-specific voltage selector that can be -directly written to the voltage selector register, use :- - -int regulator_list_hardware_vsel(struct regulator *regulator, - unsigned selector); diff --git a/Documentation/power/regulator/design.rst b/Documentation/power/regulator/design.rst new file mode 100644 index 000000000000..3b09c6841dc4 --- /dev/null +++ b/Documentation/power/regulator/design.rst @@ -0,0 +1,38 @@ +========================== +Regulator API design notes +========================== + +This document provides a brief, partially structured, overview of some +of the design considerations which impact the regulator API design. + +Safety +------ + + - Errors in regulator configuration can have very serious consequences + for the system, potentially including lasting hardware damage. + - It is not possible to automatically determine the power configuration + of the system - software-equivalent variants of the same chip may + have different power requirements, and not all components with power + requirements are visible to software. + +.. note:: + + The API should make no changes to the hardware state unless it has + specific knowledge that these changes are safe to perform on this + particular system. + +Consumer use cases +------------------ + + - The overwhelming majority of devices in a system will have no + requirement to do any runtime configuration of their power beyond + being able to turn it on or off. + + - Many of the power supplies in the system will be shared between many + different consumers. + +.. note:: + + The consumer API should be structured so that these use cases are + very easy to handle and so that consumers will work with shared + supplies without any additional effort. diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt deleted file mode 100644 index fdd919b96830..000000000000 --- a/Documentation/power/regulator/design.txt +++ /dev/null @@ -1,33 +0,0 @@ -Regulator API design notes -========================== - -This document provides a brief, partially structured, overview of some -of the design considerations which impact the regulator API design. - -Safety ------- - - - Errors in regulator configuration can have very serious consequences - for the system, potentially including lasting hardware damage. - - It is not possible to automatically determine the power configuration - of the system - software-equivalent variants of the same chip may - have different power requirements, and not all components with power - requirements are visible to software. - - => The API should make no changes to the hardware state unless it has - specific knowledge that these changes are safe to perform on this - particular system. - -Consumer use cases ------------------- - - - The overwhelming majority of devices in a system will have no - requirement to do any runtime configuration of their power beyond - being able to turn it on or off. - - - Many of the power supplies in the system will be shared between many - different consumers. 
- - => The consumer API should be structured so that these use cases are - very easy to handle and so that consumers will work with shared - supplies without any additional effort. diff --git a/Documentation/power/regulator/machine.rst b/Documentation/power/regulator/machine.rst new file mode 100644 index 000000000000..22fffefaa3ad --- /dev/null +++ b/Documentation/power/regulator/machine.rst @@ -0,0 +1,97 @@ +================================== +Regulator Machine Driver Interface +================================== + +The regulator machine driver interface is intended for board/machine specific +initialisation code to configure the regulator subsystem. + +Consider the following machine:: + + Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V] + | + +-> [Consumer B @ 3.3V] + +The drivers for consumers A & B must be mapped to the correct regulator in +order to control their power supplies. This mapping can be achieved in machine +initialisation code by creating a struct regulator_consumer_supply for +each regulator:: + + struct regulator_consumer_supply { + const char *dev_name; /* consumer dev_name() */ + const char *supply; /* consumer supply - e.g. "vcc" */ + }; + +e.g. for the machine above:: + + static struct regulator_consumer_supply regulator1_consumers[] = { + REGULATOR_SUPPLY("Vcc", "consumer B"), + }; + + static struct regulator_consumer_supply regulator2_consumers[] = { + REGULATOR_SUPPLY("Vcc", "consumer A"), + }; + +This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2 +to the 'Vcc' supply for Consumer A. + +Constraints can now be registered by defining a struct regulator_init_data +for each regulator power domain. This structure also maps the consumers +to their supply regulators:: + + static struct regulator_init_data regulator1_data = { + .constraints = { + .name = "Regulator-1", + .min_uV = 3300000, + .max_uV = 3300000, + .valid_modes_mask = REGULATOR_MODE_NORMAL, + }, + .num_consumer_supplies = ARRAY_SIZE(regulator1_consumers), + .consumer_supplies = regulator1_consumers, + }; + +The name field should be set to something that is usefully descriptive +for the board for configuration of supplies for other regulators and +for use in logging and other diagnostic output. Normally the name +used for the supply rail in the schematic is a good choice. If no +name is provided then the subsystem will choose one. + +Regulator-1 supplies power to Regulator-2. This relationship must be registered +with the core so that Regulator-1 is also enabled when Consumer A enables its +supply (Regulator-2). 
The supply regulator is set by the supply_regulator
+field below::
+
+	static struct regulator_init_data regulator2_data = {
+		.supply_regulator = "Regulator-1",
+		.constraints = {
+			.min_uV = 1800000,
+			.max_uV = 2000000,
+			.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
+			.valid_modes_mask = REGULATOR_MODE_NORMAL,
+		},
+		.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
+		.consumer_supplies = regulator2_consumers,
+	};
+
+Finally the regulator devices must be registered in the usual manner::
+
+	static struct platform_device regulator_devices[] = {
+		{
+			.name = "regulator",
+			.id = DCDC_1,
+			.dev = {
+				.platform_data = &regulator1_data,
+			},
+		},
+		{
+			.name = "regulator",
+			.id = DCDC_2,
+			.dev = {
+				.platform_data = &regulator2_data,
+			},
+		},
+	};
+	/* register regulator 1 device */
+	platform_device_register(&regulator_devices[0]);
+
+	/* register regulator 2 device */
+	platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
deleted file mode 100644
index eff4dcaaa252..000000000000
--- a/Documentation/power/regulator/machine.txt
+++ /dev/null
@@ -1,96 +0,0 @@
-Regulator Machine Driver Interface
-===================================
-
-The regulator machine driver interface is intended for board/machine specific
-initialisation code to configure the regulator subsystem.
-
-Consider the following machine :-
-
-  Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
-               |
-               +-> [Consumer B @ 3.3V]
-
-The drivers for consumers A & B must be mapped to the correct regulator in
-order to control their power supplies. This mapping can be achieved in machine
-initialisation code by creating a struct regulator_consumer_supply for
-each regulator.
-
-struct regulator_consumer_supply {
-	const char *dev_name;	/* consumer dev_name() */
-	const char *supply;	/* consumer supply - e.g. "vcc" */
-};
-
-e.g. for the machine above
-
-static struct regulator_consumer_supply regulator1_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer B"),
-};
-
-static struct regulator_consumer_supply regulator2_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer A"),
-};
-
-This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
-to the 'Vcc' supply for Consumer A.
-
-Constraints can now be registered by defining a struct regulator_init_data
-for each regulator power domain. This structure also maps the consumers
-to their supply regulators :-
-
-static struct regulator_init_data regulator1_data = {
-	.constraints = {
-		.name = "Regulator-1",
-		.min_uV = 3300000,
-		.max_uV = 3300000,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
-	.consumer_supplies = regulator1_consumers,
-};
-
-The name field should be set to something that is usefully descriptive
-for the board for configuration of supplies for other regulators and
-for use in logging and other diagnostic output. Normally the name
-used for the supply rail in the schematic is a good choice. If no
-name is provided then the subsystem will choose one.
-
-Regulator-1 supplies power to Regulator-2. This relationship must be registered
-with the core so that Regulator-1 is also enabled when Consumer A enables its
-supply (Regulator-2). The supply regulator is set by the supply_regulator
-field below and co:-
-
-static struct regulator_init_data regulator2_data = {
-	.supply_regulator = "Regulator-1",
-	.constraints = {
-		.min_uV = 1800000,
-		.max_uV = 2000000,
-		.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
-	.consumer_supplies = regulator2_consumers,
-};
-
-Finally the regulator devices must be registered in the usual manner.
-
-static struct platform_device regulator_devices[] = {
-	{
-		.name = "regulator",
-		.id = DCDC_1,
-		.dev = {
-			.platform_data = &regulator1_data,
-		},
-	},
-	{
-		.name = "regulator",
-		.id = DCDC_2,
-		.dev = {
-			.platform_data = &regulator2_data,
-		},
-	},
-};
-/* register regulator 1 device */
-platform_device_register(&regulator_devices[0]);
-
-/* register regulator 2 device */
-platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/overview.rst b/Documentation/power/regulator/overview.rst
new file mode 100644
index 000000000000..ee494c70a7c4
--- /dev/null
+++ b/Documentation/power/regulator/overview.rst
@@ -0,0 +1,178 @@
+=============================================
+Linux voltage and current regulator framework
+=============================================
+
+About
+=====
+
+This framework is designed to provide a standard kernel interface to control
+voltage and current regulators.
+
+The intention is to allow systems to dynamically control regulator power output
+in order to save power and prolong battery life. This applies to both voltage
+regulators (where voltage output is controllable) and current sinks (where
+current limit is controllable).
+
+(C) 2008 Wolfson Microelectronics PLC.
+
+Author: Liam Girdwood
+
+
+Nomenclature
+============
+
+Some terms used in this document:
+
+  - Regulator
+     - Electronic device that supplies power to other devices.
+       Most regulators can enable and disable their output while
+       some can control their output voltage and/or current.
+
+       Input Voltage -> Regulator -> Output Voltage
+
+
+  - PMIC
+     - Power Management IC. An IC that contains numerous
+       regulators and often contains other subsystems.
+
+
+  - Consumer
+     - Electronic device that is supplied power by a regulator.
+       Consumers can be classified into two types:
+
+       Static: consumer does not change its supply voltage or
+       current limit. It only needs to enable or disable its
+       power supply. Its supply voltage is set by the hardware,
+       bootloader, firmware or kernel board initialisation code.
+
+       Dynamic: consumer needs to change its supply voltage or
+       current limit to meet operation demands.
+
+
+  - Power Domain
+     - Electronic circuit that is supplied its input power by the
+       output power of a regulator, switch or by another power
+       domain.
+
+       The supply regulator may be behind one or more switches,
+       i.e.::
+
+         Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
+                    |             |
+                    |             +-> [Consumer B], [Consumer C]
+                    |
+                    +-> [Consumer D], [Consumer E]
+
+       That is one regulator and three power domains:
+
+       - Domain 1: Switch-1, Consumers D & E.
+       - Domain 2: Switch-2, Consumers B & C.
+       - Domain 3: Consumer A.
+
+       and this represents a "supplies" relationship:
+
+       Domain-1 --> Domain-2 --> Domain-3.
+
+       A power domain may have regulators that are supplied power
+       by other regulators. i.e.::
+
+         Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
+                      |
+                      +-> [Consumer B]
+
+       This gives us two regulators and two power domains:
+
+       - Domain 1: Regulator-2, Consumer B.
+       - Domain 2: Consumer A.
+ + and a "supplies" relationship: + + Domain-1 --> Domain-2 + + + - Constraints + - Constraints are used to define power levels for performance + and hardware protection. Constraints exist at three levels: + + Regulator Level: This is defined by the regulator hardware + operating parameters and is specified in the regulator + datasheet. i.e. + + - voltage output is in the range 800mV -> 3500mV. + - regulator current output limit is 20mA @ 5V but is + 10mA @ 10V. + + Power Domain Level: This is defined in software by kernel + level board initialisation code. It is used to constrain a + power domain to a particular power range. i.e. + + - Domain-1 voltage is 3300mV + - Domain-2 voltage is 1400mV -> 1600mV + - Domain-3 current limit is 0mA -> 20mA. + + Consumer Level: This is defined by consumer drivers + dynamically setting voltage or current limit levels. + + e.g. a consumer backlight driver asks for a current increase + from 5mA to 10mA to increase LCD illumination. This passes + to through the levels as follows :- + + Consumer: need to increase LCD brightness. Lookup and + request next current mA value in brightness table (the + consumer driver could be used on several different + personalities based upon the same reference device). + + Power Domain: is the new current limit within the domain + operating limits for this domain and system state (e.g. + battery power, USB power) + + Regulator Domains: is the new current limit within the + regulator operating parameters for input/output voltage. + + If the regulator request passes all the constraint tests + then the new regulator value is applied. + + +Design +====== + +The framework is designed and targeted at SoC based devices but may also be +relevant to non SoC devices and is split into the following four interfaces:- + + + 1. Consumer driver interface. + + This uses a similar API to the kernel clock interface in that consumer + drivers can get and put a regulator (like they can with clocks atm) and + get/set voltage, current limit, mode, enable and disable. This should + allow consumers complete control over their supply voltage and current + limit. This also compiles out if not in use so drivers can be reused in + systems with no regulator based power control. + + See Documentation/power/regulator/consumer.rst + + 2. Regulator driver interface. + + This allows regulator drivers to register their regulators and provide + operations to the core. It also has a notifier call chain for propagating + regulator events to clients. + + See Documentation/power/regulator/regulator.rst + + 3. Machine interface. + + This interface is for machine specific code and allows the creation of + voltage/current domains (with constraints) for each regulator. It can + provide regulator constraints that will prevent device damage through + overvoltage or overcurrent caused by buggy client drivers. It also + allows the creation of a regulator tree whereby some regulators are + supplied by others (similar to a clock tree). + + See Documentation/power/regulator/machine.rst + + 4. Userspace ABI. + + The framework also exports a lot of useful voltage/current/opmode data to + userspace via sysfs. This could be used to help monitor device power + consumption and status. 
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
deleted file mode 100644
index 721b4739ec32..000000000000
--- a/Documentation/power/regulator/overview.txt
+++ /dev/null
@@ -1,171 +0,0 @@
-Linux voltage and current regulator framework
-=============================================
-
-About
-=====
-
-This framework is designed to provide a standard kernel interface to control
-voltage and current regulators.
-
-The intention is to allow systems to dynamically control regulator power output
-in order to save power and prolong battery life. This applies to both voltage
-regulators (where voltage output is controllable) and current sinks (where
-current limit is controllable).
-
-(C) 2008 Wolfson Microelectronics PLC.
-Author: Liam Girdwood
-
-
-Nomenclature
-============
-
-Some terms used in this document:-
-
-  o Regulator - Electronic device that supplies power to other devices.
-                Most regulators can enable and disable their output while
-                some can control their output voltage and or current.
-
-                Input Voltage -> Regulator -> Output Voltage
-
-
-  o PMIC - Power Management IC. An IC that contains numerous regulators
-           and often contains other subsystems.
-
-
-  o Consumer - Electronic device that is supplied power by a regulator.
-               Consumers can be classified into two types:-
-
-               Static: consumer does not change its supply voltage or
-               current limit. It only needs to enable or disable its
-               power supply. Its supply voltage is set by the hardware,
-               bootloader, firmware or kernel board initialisation code.
-
-               Dynamic: consumer needs to change its supply voltage or
-               current limit to meet operation demands.
-
-
-  o Power Domain - Electronic circuit that is supplied its input power by the
-                   output power of a regulator, switch or by another power
-                   domain.
-
-                   The supply regulator may be behind a switch(s). i.e.
-
-                   Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
-                              |             |
-                              |             +-> [Consumer B], [Consumer C]
-                              |
-                              +-> [Consumer D], [Consumer E]
-
-                   That is one regulator and three power domains:
-
-                   Domain 1: Switch-1, Consumers D & E.
-                   Domain 2: Switch-2, Consumers B & C.
-                   Domain 3: Consumer A.
-
-                   and this represents a "supplies" relationship:
-
-                   Domain-1 --> Domain-2 --> Domain-3.
-
-                   A power domain may have regulators that are supplied power
-                   by other regulators. i.e.
-
-                   Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
-                                |
-                                +-> [Consumer B]
-
-                   This gives us two regulators and two power domains:
-
-                   Domain 1: Regulator-2, Consumer B.
-                   Domain 2: Consumer A.
-
-                   and a "supplies" relationship:
-
-                   Domain-1 --> Domain-2
-
-
-  o Constraints - Constraints are used to define power levels for performance
-                  and hardware protection. Constraints exist at three levels:
-
-                  Regulator Level: This is defined by the regulator hardware
-                  operating parameters and is specified in the regulator
-                  datasheet. i.e.
-
-                    - voltage output is in the range 800mV -> 3500mV.
-                    - regulator current output limit is 20mA @ 5V but is
-                      10mA @ 10V.
-
-                  Power Domain Level: This is defined in software by kernel
-                  level board initialisation code. It is used to constrain a
-                  power domain to a particular power range. i.e.
-
-                    - Domain-1 voltage is 3300mV
-                    - Domain-2 voltage is 1400mV -> 1600mV
-                    - Domain-3 current limit is 0mA -> 20mA.
-
-                  Consumer Level: This is defined by consumer drivers
-                  dynamically setting voltage or current limit levels.
-
-                  e.g.
a consumer backlight driver asks for a current increase - from 5mA to 10mA to increase LCD illumination. This passes - to through the levels as follows :- - - Consumer: need to increase LCD brightness. Lookup and - request next current mA value in brightness table (the - consumer driver could be used on several different - personalities based upon the same reference device). - - Power Domain: is the new current limit within the domain - operating limits for this domain and system state (e.g. - battery power, USB power) - - Regulator Domains: is the new current limit within the - regulator operating parameters for input/output voltage. - - If the regulator request passes all the constraint tests - then the new regulator value is applied. - - -Design -====== - -The framework is designed and targeted at SoC based devices but may also be -relevant to non SoC devices and is split into the following four interfaces:- - - - 1. Consumer driver interface. - - This uses a similar API to the kernel clock interface in that consumer - drivers can get and put a regulator (like they can with clocks atm) and - get/set voltage, current limit, mode, enable and disable. This should - allow consumers complete control over their supply voltage and current - limit. This also compiles out if not in use so drivers can be reused in - systems with no regulator based power control. - - See Documentation/power/regulator/consumer.txt - - 2. Regulator driver interface. - - This allows regulator drivers to register their regulators and provide - operations to the core. It also has a notifier call chain for propagating - regulator events to clients. - - See Documentation/power/regulator/regulator.txt - - 3. Machine interface. - - This interface is for machine specific code and allows the creation of - voltage/current domains (with constraints) for each regulator. It can - provide regulator constraints that will prevent device damage through - overvoltage or overcurrent caused by buggy client drivers. It also - allows the creation of a regulator tree whereby some regulators are - supplied by others (similar to a clock tree). - - See Documentation/power/regulator/machine.txt - - 4. Userspace ABI. - - The framework also exports a lot of useful voltage/current/opmode data to - userspace via sysfs. This could be used to help monitor device power - consumption and status. - - See Documentation/ABI/testing/sysfs-class-regulator diff --git a/Documentation/power/regulator/regulator.rst b/Documentation/power/regulator/regulator.rst new file mode 100644 index 000000000000..794b3256fbb9 --- /dev/null +++ b/Documentation/power/regulator/regulator.rst @@ -0,0 +1,32 @@ +========================== +Regulator Driver Interface +========================== + +The regulator driver interface is relatively simple and designed to allow +regulator drivers to register their services with the core framework. + + +Registration +============ + +Drivers can register a regulator by calling:: + + struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, + const struct regulator_config *config); + +This will register the regulator's capabilities and operations to the regulator +core. + +Regulators can be unregistered by calling:: + + void regulator_unregister(struct regulator_dev *rdev); + + +Regulator Events +================ + +Regulators can send events (e.g. 
overtemperature, undervoltage, etc) to +consumer drivers by calling:: + + int regulator_notifier_call_chain(struct regulator_dev *rdev, + unsigned long event, void *data); diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt deleted file mode 100644 index b17e5833ce21..000000000000 --- a/Documentation/power/regulator/regulator.txt +++ /dev/null @@ -1,30 +0,0 @@ -Regulator Driver Interface -========================== - -The regulator driver interface is relatively simple and designed to allow -regulator drivers to register their services with the core framework. - - -Registration -============ - -Drivers can register a regulator by calling :- - -struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - const struct regulator_config *config); - -This will register the regulator's capabilities and operations to the regulator -core. - -Regulators can be unregistered by calling :- - -void regulator_unregister(struct regulator_dev *rdev); - - -Regulator Events -================ -Regulators can send events (e.g. overtemperature, undervoltage, etc) to -consumer drivers by calling :- - -int regulator_notifier_call_chain(struct regulator_dev *rdev, - unsigned long event, void *data); diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst new file mode 100644 index 000000000000..2c2ec99b5088 --- /dev/null +++ b/Documentation/power/runtime_pm.rst @@ -0,0 +1,940 @@ +================================================== +Runtime Power Management Framework for I/O Devices +================================================== + +(C) 2009-2011 Rafael J. Wysocki , Novell Inc. + +(C) 2010 Alan Stern + +(C) 2014 Intel Corp., Rafael J. Wysocki + +1. Introduction +=============== + +Support for runtime power management (runtime PM) of I/O devices is provided +at the power management core (PM core) level by means of: + +* The power management workqueue pm_wq in which bus types and device drivers can + put their PM-related work items. It is strongly recommended that pm_wq be + used for queuing all work items related to runtime PM, because this allows + them to be synchronized with system-wide power transitions (suspend to RAM, + hibernation and resume from system sleep states). pm_wq is declared in + include/linux/pm_runtime.h and defined in kernel/power/main.c. + +* A number of runtime PM fields in the 'power' member of 'struct device' (which + is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can + be used for synchronizing runtime PM operations with one another. + +* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in + include/linux/pm.h). + +* A set of helper functions defined in drivers/base/power/runtime.c that can be + used for carrying out runtime PM operations in such a way that the + synchronization between them is taken care of by the PM core. Bus types and + device drivers are encouraged to use these functions. + +The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM +fields of 'struct dev_pm_info' and the core helper functions provided for +runtime PM are described below. + +2. Device Runtime PM Callbacks +============================== + +There are three device runtime PM callbacks defined in 'struct dev_pm_ops':: + + struct dev_pm_ops { + ... + int (*runtime_suspend)(struct device *dev); + int (*runtime_resume)(struct device *dev); + int (*runtime_idle)(struct device *dev); + ... 
+	};
+
+The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
+are executed by the PM core for the device's subsystem, which may be any of
+the following:
+
+  1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
+     is present.
+
+  2. Device type of the device, if both dev->type and dev->type->pm are present.
+
+  3. Device class of the device, if both dev->class and dev->class->pm are
+     present.
+
+  4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
+
+If the subsystem chosen by applying the above rules doesn't provide the relevant
+callback, the PM core will invoke the corresponding driver callback stored in
+dev->driver->pm directly (if present).
+
+The PM core always checks which callback to use in the order given above, so the
+priority order of callbacks from high to low is: PM domain, device type, class
+and bus type. Moreover, the high-priority one will always take precedence over
+a low-priority one. The PM domain, bus type, device type and class callbacks
+are referred to as subsystem-level callbacks in what follows.
+
+By default, the callbacks are always invoked in process context with interrupts
+enabled. However, the pm_runtime_irq_safe() helper function can be used to tell
+the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
+and ->runtime_idle() callbacks for the given device in atomic context with
+interrupts disabled. This implies that the callback routines in question must
+not block or sleep, but it also means that the synchronous helper functions
+listed at the end of Section 4 may be used for that device within an interrupt
+handler or generally in an atomic context.
+
+The subsystem-level suspend callback, if present, is **entirely responsible**
+for handling the suspend of the device as appropriate, which may, but need not,
+include executing the device driver's own ->runtime_suspend() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_suspend()
+callback in a device driver as long as the subsystem-level suspend callback
+knows what to do to handle the device).
+
+  * Once the subsystem-level suspend callback (or the driver suspend callback,
+    if invoked directly) has completed successfully for the given device, the PM
+    core regards the device as suspended, which need not mean that it has been
+    put into a low power state. It is supposed to mean, however, that the
+    device will not process data and will not communicate with the CPU(s) and
+    RAM until the appropriate resume callback is executed for it. The runtime
+    PM status of a device after successful execution of the suspend callback is
+    'suspended'.
+
+  * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
+    status remains 'active', which means that the device _must_ be fully
+    operational afterwards.
+
+  * If the suspend callback returns an error code different from -EBUSY and
+    -EAGAIN, the PM core regards this as a fatal error and will refuse to run
+    the helper functions described in Section 4 for the device until its status
+    is directly set to either 'active', or 'suspended' (the PM core provides
+    special helper functions for this purpose).
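+
+As an illustration of these return-value semantics, a driver's suspend
+callback might look like the following sketch (the foo_* names are
+hypothetical and error handling is reduced to the bare minimum)::
+
+	static int foo_runtime_suspend(struct device *dev)
+	{
+		struct foo_chip *chip = dev_get_drvdata(dev);
+
+		/* Still transferring data: refuse, status stays 'active'. */
+		if (foo_chip_busy(chip))
+			return -EBUSY;
+
+		/* Quiesce the hardware; it must not touch RAM from now on. */
+		foo_chip_power_down(chip);
+
+		return 0;
+	}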
+
+In particular, if the driver requires remote wakeup capability (i.e. hardware
+mechanism allowing the device to request a change of its power state, such as
+PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
+device, then ->runtime_suspend() should return -EBUSY. On the other hand, if
+device_can_wakeup() returns 'true' for the device and the device is put into a
+low-power state during the execution of the suspend callback, it is expected
+that remote wakeup will be enabled for the device. Generally, remote wakeup
+should be enabled for all input devices put into low-power states at run time.
+
+The subsystem-level resume callback, if present, is **entirely responsible** for
+handling the resume of the device as appropriate, which may, but need not,
+include executing the device driver's own ->runtime_resume() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_resume()
+callback in a device driver as long as the subsystem-level resume callback knows
+what to do to handle the device).
+
+  * Once the subsystem-level resume callback (or the driver resume callback, if
+    invoked directly) has completed successfully, the PM core regards the device
+    as fully operational, which means that the device _must_ be able to complete
+    I/O operations as needed. The runtime PM status of the device is then
+    'active'.
+
+  * If the resume callback returns an error code, the PM core regards this as a
+    fatal error and will refuse to run the helper functions described in Section
+    4 for the device, until its status is directly set to either 'active', or
+    'suspended' (by means of special helper functions provided by the PM core
+    for this purpose).
+
+The idle callback (a subsystem-level one, if present, or the driver one) is
+executed by the PM core whenever the device appears to be idle, which is
+indicated to the PM core by two counters, the device's usage counter and the
+counter of 'active' children of the device.
+
+  * If either of these counters is decreased using a helper function provided by
+    the PM core and it turns out to be equal to zero, the other counter is
+    checked. If that counter also is equal to zero, the PM core executes the
+    idle callback with the device as its argument.
+
+The action performed by the idle callback is totally dependent on the subsystem
+(or driver) in question, but the expected and recommended action is to check
+if the device can be suspended (i.e. if all of the conditions necessary for
+suspending the device are satisfied) and to queue up a suspend request for the
+device in that case. If there is no idle callback, or if the callback returns
+0, then the PM core will attempt to carry out a runtime suspend of the device,
+also respecting devices configured for autosuspend. In essence this means a
+call to pm_runtime_autosuspend() (do note that drivers need to update the
+device's last busy mark, pm_runtime_mark_last_busy(), to control the delay under
+this circumstance). To prevent this (for example, if the callback routine has
+started a delayed suspend), the routine must return a non-zero value. Negative
+error return codes are ignored by the PM core.
+
+The helper functions provided by the PM core, described in Section 4, guarantee
+that the following constraints are met with respect to runtime PM callbacks for
+one device:
+
+(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
+    ->runtime_suspend() in parallel with ->runtime_resume() or with another
+    instance of ->runtime_suspend() for the same device) with the exception that
+    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
+    ->runtime_idle() (although ->runtime_idle() will not be started while any
+    of the other callbacks is being executed for the same device).
+ +(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' + devices (i.e. the PM core will only execute ->runtime_idle() or + ->runtime_suspend() for the devices the runtime PM status of which is + 'active'). + +(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device + the usage counter of which is equal to zero _and_ either the counter of + 'active' children of which is equal to zero, or the 'power.ignore_children' + flag of which is set. + +(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the + PM core will only execute ->runtime_resume() for the devices the runtime + PM status of which is 'suspended'). + +Additionally, the helper functions provided by the PM core obey the following +rules: + + * If ->runtime_suspend() is about to be executed or there's a pending request + to execute it, ->runtime_idle() will not be executed for the same device. + + * A request to execute or to schedule the execution of ->runtime_suspend() + will cancel any pending requests to execute ->runtime_idle() for the same + device. + + * If ->runtime_resume() is about to be executed or there's a pending request + to execute it, the other callbacks will not be executed for the same device. + + * A request to execute ->runtime_resume() will cancel any pending or + scheduled requests to execute the other callbacks for the same device, + except for scheduled autosuspends. + +3. Runtime PM Device Fields +=========================== + +The following device runtime PM fields are present in 'struct dev_pm_info', as +defined in include/linux/pm.h: + + `struct timer_list suspend_timer;` + - timer used for scheduling (delayed) suspend and autosuspend requests + + `unsigned long timer_expires;` + - timer expiration time, in jiffies (if this is different from zero, the + timer is running and will expire at that time, otherwise the timer is not + running) + + `struct work_struct work;` + - work structure used for queuing up requests (i.e. work items in pm_wq) + + `wait_queue_head_t wait_queue;` + - wait queue used if any of the helper functions needs to wait for another + one to complete + + `spinlock_t lock;` + - lock used for synchronization + + `atomic_t usage_count;` + - the usage counter of the device + + `atomic_t child_count;` + - the count of 'active' children of the device + + `unsigned int ignore_children;` + - if set, the value of child_count is ignored (but still updated) + + `unsigned int disable_depth;` + - used for disabling the helper functions (they work normally if this is + equal to zero); the initial value of it is 1 (i.e. runtime PM is + initially disabled for all devices) + + `int runtime_error;` + - if set, there was a fatal error (one of the callbacks returned error code + as described in Section 2), so the helper functions will not work until + this flag is cleared; this is the error code returned by the failing + callback + + `unsigned int idle_notification;` + - if set, ->runtime_idle() is being executed + + `unsigned int request_pending;` + - if set, there's a pending request (i.e. 
a work item queued up into pm_wq)
+
+  `enum rpm_request request;`
+    - type of request that's pending (valid if request_pending is set)
+
+  `unsigned int deferred_resume;`
+    - set if ->runtime_resume() is about to be run while ->runtime_suspend()
+      is being executed for that device and it is not practical to wait for
+      the suspend to complete; means "start a resume as soon as you've
+      suspended"
+
+  `enum rpm_status runtime_status;`
+    - the runtime PM status of the device; this field's initial value is
+      RPM_SUSPENDED, which means that each device is initially regarded by the
+      PM core as 'suspended', regardless of its real hardware status
+
+  `unsigned int runtime_auto;`
+    - if set, indicates that the user space has allowed the device driver to
+      power manage the device at run time via the
+      /sys/devices/.../power/control interface; it may only be modified with
+      the help of the pm_runtime_allow() and pm_runtime_forbid() helper
+      functions
+
+  `unsigned int no_callbacks;`
+    - indicates that the device does not use the runtime PM callbacks (see
+      Section 8); it may be modified only by the pm_runtime_no_callbacks()
+      helper function
+
+  `unsigned int irq_safe;`
+    - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
+      will be invoked with the spinlock held and interrupts disabled
+
+  `unsigned int use_autosuspend;`
+    - indicates that the device's driver supports delayed autosuspend (see
+      Section 9); it may be modified only by the
+      pm_runtime{_dont}_use_autosuspend() helper functions
+
+  `unsigned int timer_autosuspends;`
+    - indicates that the PM core should attempt to carry out an autosuspend
+      when the timer expires rather than a normal suspend
+
+  `int autosuspend_delay;`
+    - the delay time (in milliseconds) to be used for autosuspend
+
+  `unsigned long last_busy;`
+    - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
+      function was last called for this device; used in calculating inactivity
+      periods for autosuspend
+
+All of the above fields are members of the 'power' member of 'struct device'.
+
+4. Runtime PM Device Helper Functions
+=====================================
+
+The following runtime PM helper functions are defined in
+drivers/base/power/runtime.c and include/linux/pm_runtime.h:
+
+  `void pm_runtime_init(struct device *dev);`
+    - initialize the device runtime PM fields in 'struct dev_pm_info'
+
+  `void pm_runtime_remove(struct device *dev);`
+    - make sure that the runtime PM of the device will be disabled after
+      removing the device from device hierarchy
+
+  `int pm_runtime_idle(struct device *dev);`
+    - execute the subsystem-level idle callback for the device; returns an
+      error code on failure, where -EINPROGRESS means that ->runtime_idle() is
+      already being executed; if there is no callback or the callback returns 0
+      then run pm_runtime_autosuspend(dev) and return its result
+
+  `int pm_runtime_suspend(struct device *dev);`
+    - execute the subsystem-level suspend callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'suspended', or
+      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
+      to suspend the device again in future and -EACCES means that
+      'power.disable_depth' is different from 0
+
+  `int pm_runtime_autosuspend(struct device *dev);`
+    - same as pm_runtime_suspend() except that the autosuspend delay is taken
+      into account; if pm_runtime_autosuspend_expiration() says the delay has
+      not yet expired then an autosuspend is scheduled for the appropriate time
+      and 0 is returned
+
+  `int pm_runtime_resume(struct device *dev);`
+    - execute the subsystem-level resume callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'active' or
+      error code on failure, where -EAGAIN means it may be safe to attempt to
+      resume the device again in future, but 'power.runtime_error' should be
+      checked additionally, and -EACCES means that 'power.disable_depth' is
+      different from 0
+
+  `int pm_request_idle(struct device *dev);`
+    - submit a request to execute the subsystem-level idle callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success or error code if the request has not been queued up
+
+  `int pm_request_autosuspend(struct device *dev);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device when the autosuspend delay has expired; if the delay has already
+      expired then the work item is queued up immediately
+
+  `int pm_schedule_suspend(struct device *dev, unsigned int delay);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device in future, where 'delay' is the time to wait before queuing up a
+      suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
+      item is queued up immediately); returns 0 on success, 1 if the device's PM
+      runtime status was already 'suspended', or error code if the request
+      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
+      ->runtime_suspend() is already scheduled and not yet expired, the new
+      value of 'delay' will be used as the time to wait
+
+  `int pm_request_resume(struct device *dev);`
+    - submit a request to execute the subsystem-level resume callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success, 1 if the device's runtime PM status was already 'active', or
+      error code if the request hasn't been queued up
+
+  `void pm_runtime_get_noresume(struct device *dev);`
+    - increment the device's usage counter
+
+  `int pm_runtime_get(struct device
*dev);` + - increment the device's usage counter, run pm_request_resume(dev) and + return its result + + `int pm_runtime_get_sync(struct device *dev);` + - increment the device's usage counter, run pm_runtime_resume(dev) and + return its result + + `int pm_runtime_get_if_in_use(struct device *dev);` + - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the + runtime PM status is RPM_ACTIVE and the runtime PM usage counter is + nonzero, increment the counter and return 1; otherwise return 0 without + changing the counter + + `void pm_runtime_put_noidle(struct device *dev);` + - decrement the device's usage counter + + `int pm_runtime_put(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_request_idle(dev) and return its result + + `int pm_runtime_put_autosuspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_request_autosuspend(dev) and return its result + + `int pm_runtime_put_sync(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_idle(dev) and return its result + + `int pm_runtime_put_sync_suspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_suspend(dev) and return its result + + `int pm_runtime_put_sync_autosuspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_autosuspend(dev) and return its result + + `void pm_runtime_enable(struct device *dev);` + - decrement the device's 'power.disable_depth' field; if that field is equal + to zero, the runtime PM helper functions can execute subsystem-level + callbacks described in Section 2 for the device + + `int pm_runtime_disable(struct device *dev);` + - increment the device's 'power.disable_depth' field (if the value of that + field was previously zero, this prevents subsystem-level runtime PM + callbacks from being run for the device), make sure that all of the + pending runtime PM operations on the device are either completed or + canceled; returns 1 if there was a resume request pending and it was + necessary to execute the subsystem-level resume callback for the device + to satisfy that request, otherwise 0 is returned + + `int pm_runtime_barrier(struct device *dev);` + - check if there's a resume request pending for the device and resume it + (synchronously) in that case, cancel any other pending runtime PM requests + regarding it and wait for all runtime PM operations on it in progress to + complete; returns 1 if there was a resume request pending and it was + necessary to execute the subsystem-level resume callback for the device to + satisfy that request, otherwise 0 is returned + + `void pm_suspend_ignore_children(struct device *dev, bool enable);` + - set/unset the power.ignore_children flag of the device + + `int pm_runtime_set_active(struct device *dev);` + - clear the device's 'power.runtime_error' flag, set the device's runtime + PM status to 'active' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero); it will fail and return error code if the device has a parent + which is not active and the 'power.ignore_children' flag of which is unset + + `void pm_runtime_set_suspended(struct device *dev);` + - clear the device's 'power.runtime_error' flag, set the device's runtime + PM status to 'suspended' and update its 
parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero) + + `bool pm_runtime_active(struct device *dev);` + - return true if the device's runtime PM status is 'active' or its + 'power.disable_depth' field is not equal to zero, or false otherwise + + `bool pm_runtime_suspended(struct device *dev);` + - return true if the device's runtime PM status is 'suspended' and its + 'power.disable_depth' field is equal to zero, or false otherwise + + `bool pm_runtime_status_suspended(struct device *dev);` + - return true if the device's runtime PM status is 'suspended' + + `void pm_runtime_allow(struct device *dev);` + - set the power.runtime_auto flag for the device and decrease its usage + counter (used by the /sys/devices/.../power/control interface to + effectively allow the device to be power managed at run time) + + `void pm_runtime_forbid(struct device *dev);` + - unset the power.runtime_auto flag for the device and increase its usage + counter (used by the /sys/devices/.../power/control interface to + effectively prevent the device from being power managed at run time) + + `void pm_runtime_no_callbacks(struct device *dev);` + - set the power.no_callbacks flag for the device and remove the runtime + PM attributes from /sys/devices/.../power (or prevent them from being + added when the device is registered) + + `void pm_runtime_irq_safe(struct device *dev);` + - set the power.irq_safe flag for the device, causing the runtime-PM + callbacks to be invoked with interrupts off + + `bool pm_runtime_is_irq_safe(struct device *dev);` + - return true if power.irq_safe flag was set for the device, causing + the runtime-PM callbacks to be invoked with interrupts off + + `void pm_runtime_mark_last_busy(struct device *dev);` + - set the power.last_busy field to the current time + + `void pm_runtime_use_autosuspend(struct device *dev);` + - set the power.use_autosuspend flag, enabling autosuspend delays; call + pm_runtime_get_sync if the flag was previously cleared and + power.autosuspend_delay is negative + + `void pm_runtime_dont_use_autosuspend(struct device *dev);` + - clear the power.use_autosuspend flag, disabling autosuspend delays; + decrement the device's usage counter if the flag was previously set and + power.autosuspend_delay is negative; call pm_runtime_idle + + `void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);` + - set the power.autosuspend_delay value to 'delay' (expressed in + milliseconds); if 'delay' is negative then runtime suspends are + prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be + called or the device's usage counter may be decremented and + pm_runtime_idle called depending on if power.autosuspend_delay is + changed to or from a negative value; if power.use_autosuspend is clear, + pm_runtime_idle is called + + `unsigned long pm_runtime_autosuspend_expiration(struct device *dev);` + - calculate the time when the current autosuspend delay period will expire, + based on power.last_busy and power.autosuspend_delay; if the delay time + is 1000 ms or larger then the expiration time is rounded up to the + nearest second; returns 0 if the delay period has already expired or + power.use_autosuspend isn't set, otherwise returns the expiration time + in jiffies + +It is safe to execute the following helper functions from interrupt context: + +- pm_request_idle() +- pm_request_autosuspend() +- pm_schedule_suspend() +- 
pm_request_resume()
+- pm_runtime_get_noresume()
+- pm_runtime_get()
+- pm_runtime_put_noidle()
+- pm_runtime_put()
+- pm_runtime_put_autosuspend()
+- pm_runtime_enable()
+- pm_suspend_ignore_children()
+- pm_runtime_set_active()
+- pm_runtime_set_suspended()
+- pm_runtime_suspended()
+- pm_runtime_mark_last_busy()
+- pm_runtime_autosuspend_expiration()
+
+If pm_runtime_irq_safe() has been called for a device then the following helper
+functions may also be used in interrupt context:
+
+- pm_runtime_idle()
+- pm_runtime_suspend()
+- pm_runtime_autosuspend()
+- pm_runtime_resume()
+- pm_runtime_get_sync()
+- pm_runtime_put_sync()
+- pm_runtime_put_sync_suspend()
+- pm_runtime_put_sync_autosuspend()
+
+5. Runtime PM Initialization, Device Probing and Removal
+========================================================
+
+Initially, the runtime PM is disabled for all devices, which means that the
+majority of the runtime PM helper functions described in Section 4 will return
+-EAGAIN until pm_runtime_enable() is called for the device.
+
+In addition to that, the initial runtime PM status of all devices is
+'suspended', but it need not reflect the actual physical state of the device.
+Thus, if the device is initially active (i.e. it is able to process I/O), its
+runtime PM status must be changed to 'active', with the help of
+pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
+
+However, if the device has a parent and the parent's runtime PM is enabled,
+calling pm_runtime_set_active() for the device will affect the parent, unless
+the parent's 'power.ignore_children' flag is set. Namely, in that case the
+parent won't be able to suspend at run time, using the PM core's helper
+functions, as long as the child's status is 'active', even if the child's
+runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
+the child yet or pm_runtime_disable() has been called for it). For this reason,
+once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
+should be called for it too as soon as reasonably possible or its runtime PM
+status should be changed back to 'suspended' with the help of
+pm_runtime_set_suspended().
+
+If the default initial runtime PM status of the device (i.e. 'suspended')
+reflects the actual state of the device, its bus type's or its driver's
+->probe() callback will likely need to wake it up using one of the PM core's
+helper functions described in Section 4. In that case, pm_runtime_resume()
+should be used. Of course, for this purpose the device's runtime PM has to be
+enabled earlier by calling pm_runtime_enable().
+
+Note that if the device may execute pm_runtime calls during the probe (such as
+if it registers with a subsystem that may call back in), then the
+pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
+appropriate to ensure that the device is not put back to sleep during the
+probe. This can happen with systems such as the network device layer.
+
+It may be desirable to suspend the device once ->probe() has finished.
+Therefore the driver core uses the asynchronous pm_request_idle() to submit a
+request to execute the subsystem-level idle callback for the device at that
+time. A driver that makes use of the runtime autosuspend feature may want to
+update the last busy mark before returning from ->probe().
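+
+A minimal ->probe() sketch following this pattern might look like the one
+below (the foo_* names are hypothetical, error handling is omitted for
+brevity, and the pm_runtime_* helpers come from include/linux/pm_runtime.h)::
+
+	static int foo_probe(struct platform_device *pdev)
+	{
+		struct device *dev = &pdev->dev;
+
+		/* The hardware is already powered up at this point. */
+		pm_runtime_set_active(dev);
+		pm_runtime_enable(dev);
+
+		/* Keep the device awake while the rest of probe runs. */
+		pm_runtime_get_sync(dev);
+
+		/* ... register with subsystems that may call back in ... */
+
+		/* Using autosuspend: refresh the last busy mark first. */
+		pm_runtime_mark_last_busy(dev);
+		pm_runtime_put(dev);
+
+		return 0;
+	}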
+
+Moreover, the driver core prevents runtime PM callbacks from racing with the bus
+notifier callback in __device_release_driver(), which is necessary, because the
+notifier is used by some subsystems to carry out operations affecting the
+runtime PM functionality. It does so by calling pm_runtime_get_sync() before
+driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This
+resumes the device if it's in the suspended state and prevents it from
+being suspended again while those routines are being executed.
+
+To allow bus types and drivers to put devices into the suspended state by
+calling pm_runtime_suspend() from their ->remove() routines, the driver core
+executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
+notifications in __device_release_driver(). This requires bus types and
+drivers to make their ->remove() callbacks avoid races with runtime PM directly,
+but it also allows for more flexibility in the handling of devices during the
+removal of their drivers.
+
+In their ->remove() callbacks, drivers should undo the runtime PM changes done
+in ->probe(). Usually this means calling pm_runtime_disable(),
+pm_runtime_dont_use_autosuspend() etc.
+
+User space can effectively prevent the driver of the device from power managing
+it at run time by changing the value of its /sys/devices/.../power/control
+attribute to "on", which causes pm_runtime_forbid() to be called. In principle,
+this mechanism may also be used by the driver to effectively turn off the
+runtime power management of the device until the user space turns it on.
+Namely, during the initialization the driver can make sure that the runtime PM
+status of the device is 'active' and call pm_runtime_forbid(). It should be
+noted, however, that if the user space has already intentionally changed the
+value of /sys/devices/.../power/control to "auto" to allow the driver to power
+manage the device at run time, the driver may confuse it by using
+pm_runtime_forbid() this way.
+
+6. Runtime PM and System Sleep
+==============================
+
+Runtime PM and system sleep (i.e., system suspend and hibernation, also known
+as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
+ways. If a device is active when a system sleep starts, everything is
+straightforward. But what should happen if the device is already suspended?
+
+The device may have different wake-up settings for runtime PM and system sleep.
+For example, remote wake-up may be enabled for runtime suspend but disallowed
+for system sleep (device_may_wakeup(dev) returns 'false'). When this happens,
+the subsystem-level system suspend callback is responsible for changing the
+device's wake-up setting (it may leave that to the device driver's system
+suspend routine). It may be necessary to resume the device and suspend it again
+in order to do so. The same is true if the driver uses different power levels
+or other settings for runtime suspend and system sleep.
+
+During system resume, the simplest approach is to bring all devices back to full
+power, even if they had been suspended before the system suspend began. There
+are several reasons for this, including:
+
+  * The device might need to switch power levels, wake-up settings, etc.
+
+  * Remote wake-up events might have been lost by the firmware.
+
+  * The device's children may need the device to be at full power in order
+    to resume themselves.
+
+  * The driver's idea of the device state may not agree with the device's
This can happen during resume from hibernation. + + * The device might need to be reset. + + * Even though the device was suspended, if its usage counter was > 0 then most + likely it would need a runtime resume in the near future anyway. + +If the device had been suspended before the system suspend began and it's +brought back to full power during resume, then its runtime PM status will have +to be updated to reflect the actual post-system sleep status. The way to do +this is: + + - pm_runtime_disable(dev); + - pm_runtime_set_active(dev); + - pm_runtime_enable(dev); + +The PM core always increments the runtime usage counter before calling the +->suspend() callback and decrements it after calling the ->resume() callback. +Hence disabling runtime PM temporarily like this will not cause any runtime +suspend attempts to be permanently lost. If the usage count goes to zero +following the return of the ->resume() callback, the ->runtime_idle() callback +will be invoked as usual. + +On some systems, however, system sleep is not entered through a global firmware +or hardware operation. Instead, all hardware components are put into low-power +states directly by the kernel in a coordinated way. Then, the system sleep +state effectively follows from the states the hardware components end up in +and the system is woken up from that state by a hardware interrupt or a similar +mechanism entirely under the kernel's control. As a result, the kernel never +gives control away and the states of all devices during resume are precisely +known to it. If that is the case and none of the situations listed above takes +place (in particular, if the system is not waking up from hibernation), it may +be more efficient to leave the devices that had been suspended before the system +suspend began in the suspended state. + +To this end, the PM core provides a mechanism allowing some coordination between +different levels of device hierarchy. Namely, if a system suspend .prepare() +callback returns a positive number for a device, that indicates to the PM core +that the device appears to be runtime-suspended and its state is fine, so it +may be left in runtime suspend provided that all of its descendants are also +left in runtime suspend. If that happens, the PM core will not execute any +system suspend and resume callbacks for all of those devices, except for the +complete callback, which is then entirely responsible for handling the device +as appropriate. This only applies to system suspend transitions that are not +related to hibernation (see Documentation/driver-api/pm/devices.rst for more +information). + +The PM core does its best to reduce the probability of race conditions between +the runtime PM and system suspend/resume (and hibernation) callbacks by carrying +out the following operations: + + * During system suspend pm_runtime_get_noresume() is called for every device + right before executing the subsystem-level .prepare() callback for it and + pm_runtime_barrier() is called for every device right before executing the + subsystem-level .suspend() callback for it. In addition to that the PM core + calls __pm_runtime_disable() with 'false' as the second argument for every + device right before executing the subsystem-level .suspend_late() callback + for it. + + * During system resume pm_runtime_enable() and pm_runtime_put() are called for + every device right after executing the subsystem-level .resume_early() + callback and right after executing the subsystem-level .complete() callback + for it, respectively. 
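+
+As an illustration of the .prepare() convention just described (a sketch
+only; the foo_prepare() name is hypothetical, and a real driver would first
+verify that its runtime-suspend and system-sleep settings actually match),
+a driver could report a runtime-suspended device to the PM core like this::
+
+	static int foo_prepare(struct device *dev)
+	{
+		/*
+		 * A positive return value tells the PM core that the
+		 * device may be left in runtime suspend, provided that
+		 * all of its descendants are left in runtime suspend too.
+		 */
+		return pm_runtime_suspended(dev);
+	}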
+ +7. Generic subsystem callbacks +============================== + +Subsystems may wish to conserve code space by using the set of generic power management callbacks provided by the PM core, defined in drivers/base/power/generic_ops.c: + + `int pm_generic_runtime_suspend(struct device *dev);` + - invoke the ->runtime_suspend() callback provided by the driver of this + device and return its result, or return 0 if not defined + + `int pm_generic_runtime_resume(struct device *dev);` + - invoke the ->runtime_resume() callback provided by the driver of this + device and return its result, or return 0 if not defined + + `int pm_generic_suspend(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->suspend() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_suspend_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_resume(struct device *dev);` + - invoke the ->resume() callback provided by the driver of this device and, + if successful, change the device's runtime PM status to 'active' + + `int pm_generic_resume_noirq(struct device *dev);` + - invoke the ->resume_noirq() callback provided by the driver of this device + + `int pm_generic_freeze(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->freeze() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_freeze_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_thaw(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->thaw() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_thaw_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_poweroff(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->poweroff() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_poweroff_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_restore(struct device *dev);` + - invoke the ->restore() callback provided by the driver of this device and, + if successful, change the device's runtime PM status to 'active' + + `int pm_generic_restore_noirq(struct device *dev);` + - invoke the ->restore_noirq() callback provided by the device's driver + +These functions are the defaults used by the PM core if a subsystem doesn't provide its own callbacks for ->runtime_idle(), ->runtime_suspend(), ->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(), ->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(), ->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the subsystem-level dev_pm_ops structure.
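+
+For instance, a subsystem that simply wants the default behavior could fill
+its dev_pm_ops with the generic helpers explicitly (a sketch; the foo_subsys
+name is hypothetical, and since these are the PM core's defaults anyway,
+leaving the callbacks unset has the same effect)::
+
+	static const struct dev_pm_ops foo_subsys_pm_ops = {
+		.runtime_suspend = pm_generic_runtime_suspend,
+		.runtime_resume = pm_generic_runtime_resume,
+		.suspend = pm_generic_suspend,
+		.resume = pm_generic_resume,
+		.freeze = pm_generic_freeze,
+		.thaw = pm_generic_thaw,
+		.poweroff = pm_generic_poweroff,
+		.restore = pm_generic_restore,
+	};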
+ +Device drivers that wish to use the same function as a system suspend, freeze, +poweroff and runtime suspend callback, and similarly for system resume, thaw, +restore, and runtime resume, can achieve this with the help of the +UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its +last argument to NULL). + +8. "No-Callback" Devices +======================== + +Some "devices" are only logical sub-devices of their parent and cannot be +power-managed on their own. (The prototype example is a USB interface. Entire +USB devices can go into low-power mode or send wake-up requests, but neither is +possible for individual interfaces.) The drivers for these devices have no +need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend() +and ->runtime_resume() would always return 0 without doing anything else and +->runtime_idle() would always call pm_runtime_suspend(). + +Subsystems can tell the PM core about these devices by calling +pm_runtime_no_callbacks(). This should be done after the device structure is +initialized and before it is registered (although after device registration is +also okay). The routine will set the device's power.no_callbacks flag and +prevent the non-debugging runtime PM sysfs attributes from being created. + +When power.no_callbacks is set, the PM core will not invoke the +->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks. +Instead it will assume that suspends and resumes always succeed and that idle +devices should be suspended. + +As a consequence, the PM core will never directly inform the device's subsystem +or driver about runtime power changes. Instead, the driver for the device's +parent must take responsibility for telling the device's driver when the +parent's power state changes. + +9. Autosuspend, or automatically-delayed suspends +================================================= + +Changing a device's power state isn't free; it requires both time and energy. +A device should be put in a low-power state only when there's some reason to +think it will remain in that state for a substantial time. A common heuristic +says that a device which hasn't been used for a while is liable to remain +unused; following this advice, drivers should not allow devices to be suspended +at runtime until they have been inactive for some minimum period. Even when +the heuristic ends up being non-optimal, it will still prevent devices from +"bouncing" too rapidly between low-power and full-power states. + +The term "autosuspend" is an historical remnant. It doesn't mean that the +device is automatically suspended (the subsystem or driver still has to call +the appropriate PM routines); rather it means that runtime suspends will +automatically be delayed until the desired period of inactivity has elapsed. + +Inactivity is determined based on the power.last_busy field. Drivers should +call pm_runtime_mark_last_busy() to update this field after carrying out I/O, +typically just before calling pm_runtime_put_autosuspend(). The desired length +of the inactivity period is a matter of policy. Subsystems can set this length +initially by calling pm_runtime_set_autosuspend_delay(), but after device +registration the length should be controlled by user space, using the +/sys/devices/.../power/autosuspend_delay_ms attribute. 
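+
+Putting the pieces together (a sketch only; the 5 second delay is an
+arbitrary example, and pm_runtime_use_autosuspend() is described in the
+next paragraph), a driver would typically set up autosuspend at probe time
+and mark activity when I/O completes::
+
+	/* At probe time: */
+	pm_runtime_set_autosuspend_delay(dev, 5000);	/* 5 s of inactivity */
+	pm_runtime_use_autosuspend(dev);
+
+	/* After carrying out I/O: */
+	pm_runtime_mark_last_busy(dev);
+	pm_runtime_put_autosuspend(dev);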
+ +In order to use autosuspend, subsystems or drivers must call pm_runtime_use_autosuspend() (preferably before registering the device), and thereafter they should use the various `*_autosuspend()` helper functions instead of the non-autosuspend counterparts:: + + Instead of: pm_runtime_suspend use: pm_runtime_autosuspend; + Instead of: pm_schedule_suspend use: pm_request_autosuspend; + Instead of: pm_runtime_put use: pm_runtime_put_autosuspend; + Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend. + +Drivers may also continue to use the non-autosuspend helper functions; they will behave normally, which means sometimes taking the autosuspend delay into account (see pm_runtime_idle). + +Under some circumstances a driver or subsystem may want to prevent a device from autosuspending immediately, even though the usage counter is zero and the autosuspend delay time has expired. If the ->runtime_suspend() callback returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is in the future (as it normally would be if the callback invoked pm_runtime_mark_last_busy()), the PM core will automatically reschedule the autosuspend. The ->runtime_suspend() callback can't do this rescheduling itself because no suspend requests of any kind are accepted while the device is suspending (i.e., while the callback is running). + +The implementation is well suited for asynchronous use in interrupt contexts. However, such use inevitably involves races, because the PM core can't synchronize ->runtime_suspend() callbacks with the arrival of I/O requests. This synchronization must be handled by the driver, using its private lock. Here is a schematic pseudo-code example:: + + foo_read_or_write(struct foo_priv *foo, void *data) + { + lock(&foo->private_lock); + add_request_to_io_queue(foo, data); + if (foo->num_pending_requests++ == 0) + pm_runtime_get(&foo->dev); + if (!foo->is_suspended) + foo_process_next_request(foo); + unlock(&foo->private_lock); + } + + foo_io_completion(struct foo_priv *foo, void *req) + { + lock(&foo->private_lock); + if (--foo->num_pending_requests == 0) { + pm_runtime_mark_last_busy(&foo->dev); + pm_runtime_put_autosuspend(&foo->dev); + } else { + foo_process_next_request(foo); + } + unlock(&foo->private_lock); + /* Send req result back to the user ... */ + } + + int foo_runtime_suspend(struct device *dev) + { + struct foo_priv *foo = container_of(dev, ...); + int ret = 0; + + lock(&foo->private_lock); + if (foo->num_pending_requests > 0) { + ret = -EBUSY; + } else { + /* ... suspend the device ... */ + foo->is_suspended = 1; + } + unlock(&foo->private_lock); + return ret; + } + + int foo_runtime_resume(struct device *dev) + { + struct foo_priv *foo = container_of(dev, ...); + + lock(&foo->private_lock); + /* ... resume the device ... */ + foo->is_suspended = 0; + pm_runtime_mark_last_busy(&foo->dev); + if (foo->num_pending_requests > 0) + foo_process_next_request(foo); + unlock(&foo->private_lock); + return 0; + } + +The important point is that after foo_io_completion() asks for an autosuspend, the foo_runtime_suspend() callback may race with foo_read_or_write(). Therefore foo_runtime_suspend() has to check whether there are any pending I/O requests (while holding the private lock) before allowing the suspend to proceed. + +In addition, the power.autosuspend_delay field can be changed by user space at any time.
If a driver cares about this, it can call +pm_runtime_autosuspend_expiration() from within the ->runtime_suspend() +callback while holding its private lock. If the function returns a nonzero +value then the delay has not yet expired and the callback should return +-EAGAIN. diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt deleted file mode 100644 index 937e33c46211..000000000000 --- a/Documentation/power/runtime_pm.txt +++ /dev/null @@ -1,928 +0,0 @@ -Runtime Power Management Framework for I/O Devices - -(C) 2009-2011 Rafael J. Wysocki , Novell Inc. -(C) 2010 Alan Stern -(C) 2014 Intel Corp., Rafael J. Wysocki - -1. Introduction - -Support for runtime power management (runtime PM) of I/O devices is provided -at the power management core (PM core) level by means of: - -* The power management workqueue pm_wq in which bus types and device drivers can - put their PM-related work items. It is strongly recommended that pm_wq be - used for queuing all work items related to runtime PM, because this allows - them to be synchronized with system-wide power transitions (suspend to RAM, - hibernation and resume from system sleep states). pm_wq is declared in - include/linux/pm_runtime.h and defined in kernel/power/main.c. - -* A number of runtime PM fields in the 'power' member of 'struct device' (which - is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can - be used for synchronizing runtime PM operations with one another. - -* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in - include/linux/pm.h). - -* A set of helper functions defined in drivers/base/power/runtime.c that can be - used for carrying out runtime PM operations in such a way that the - synchronization between them is taken care of by the PM core. Bus types and - device drivers are encouraged to use these functions. - -The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM -fields of 'struct dev_pm_info' and the core helper functions provided for -runtime PM are described below. - -2. Device Runtime PM Callbacks - -There are three device runtime PM callbacks defined in 'struct dev_pm_ops': - -struct dev_pm_ops { - ... - int (*runtime_suspend)(struct device *dev); - int (*runtime_resume)(struct device *dev); - int (*runtime_idle)(struct device *dev); - ... -}; - -The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks -are executed by the PM core for the device's subsystem that may be either of -the following: - - 1. PM domain of the device, if the device's PM domain object, dev->pm_domain, - is present. - - 2. Device type of the device, if both dev->type and dev->type->pm are present. - - 3. Device class of the device, if both dev->class and dev->class->pm are - present. - - 4. Bus type of the device, if both dev->bus and dev->bus->pm are present. - -If the subsystem chosen by applying the above rules doesn't provide the relevant -callback, the PM core will invoke the corresponding driver callback stored in -dev->driver->pm directly (if present). - -The PM core always checks which callback to use in the order given above, so the -priority order of callbacks from high to low is: PM domain, device type, class -and bus type. Moreover, the high-priority one will always take precedence over -a low-priority one. The PM domain, bus type, device type and class callbacks -are referred to as subsystem-level callbacks in what follows. - -By default, the callbacks are always invoked in process context with interrupts -enabled. 
However, the pm_runtime_irq_safe() helper function can be used to tell -the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume() -and ->runtime_idle() callbacks for the given device in atomic context with -interrupts disabled. This implies that the callback routines in question must -not block or sleep, but it also means that the synchronous helper functions -listed at the end of Section 4 may be used for that device within an interrupt -handler or generally in an atomic context. - -The subsystem-level suspend callback, if present, is _entirely_ _responsible_ -for handling the suspend of the device as appropriate, which may, but need not -include executing the device driver's own ->runtime_suspend() callback (from the -PM core's point of view it is not necessary to implement a ->runtime_suspend() -callback in a device driver as long as the subsystem-level suspend callback -knows what to do to handle the device). - - * Once the subsystem-level suspend callback (or the driver suspend callback, - if invoked directly) has completed successfully for the given device, the PM - core regards the device as suspended, which need not mean that it has been - put into a low power state. It is supposed to mean, however, that the - device will not process data and will not communicate with the CPU(s) and - RAM until the appropriate resume callback is executed for it. The runtime - PM status of a device after successful execution of the suspend callback is - 'suspended'. - - * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM - status remains 'active', which means that the device _must_ be fully - operational afterwards. - - * If the suspend callback returns an error code different from -EBUSY and - -EAGAIN, the PM core regards this as a fatal error and will refuse to run - the helper functions described in Section 4 for the device until its status - is directly set to either 'active', or 'suspended' (the PM core provides - special helper functions for this purpose). - -In particular, if the driver requires remote wakeup capability (i.e. hardware -mechanism allowing the device to request a change of its power state, such as -PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the -device, then ->runtime_suspend() should return -EBUSY. On the other hand, if -device_can_wakeup() returns 'true' for the device and the device is put into a -low-power state during the execution of the suspend callback, it is expected -that remote wakeup will be enabled for the device. Generally, remote wakeup -should be enabled for all input devices put into low-power states at run time. - -The subsystem-level resume callback, if present, is _entirely_ _responsible_ for -handling the resume of the device as appropriate, which may, but need not -include executing the device driver's own ->runtime_resume() callback (from the -PM core's point of view it is not necessary to implement a ->runtime_resume() -callback in a device driver as long as the subsystem-level resume callback knows -what to do to handle the device). - - * Once the subsystem-level resume callback (or the driver resume callback, if - invoked directly) has completed successfully, the PM core regards the device - as fully operational, which means that the device _must_ be able to complete - I/O operations as needed. The runtime PM status of the device is then - 'active'. 
- - * If the resume callback returns an error code, the PM core regards this as a - fatal error and will refuse to run the helper functions described in Section - 4 for the device, until its status is directly set to either 'active', or - 'suspended' (by means of special helper functions provided by the PM core - for this purpose). - -The idle callback (a subsystem-level one, if present, or the driver one) is -executed by the PM core whenever the device appears to be idle, which is -indicated to the PM core by two counters, the device's usage counter and the -counter of 'active' children of the device. - - * If any of these counters is decreased using a helper function provided by - the PM core and it turns out to be equal to zero, the other counter is - checked. If that counter also is equal to zero, the PM core executes the - idle callback with the device as its argument. - -The action performed by the idle callback is totally dependent on the subsystem -(or driver) in question, but the expected and recommended action is to check -if the device can be suspended (i.e. if all of the conditions necessary for -suspending the device are satisfied) and to queue up a suspend request for the -device in that case. If there is no idle callback, or if the callback returns -0, then the PM core will attempt to carry out a runtime suspend of the device, -also respecting devices configured for autosuspend. In essence this means a -call to pm_runtime_autosuspend() (do note that drivers needs to update the -device last busy mark, pm_runtime_mark_last_busy(), to control the delay under -this circumstance). To prevent this (for example, if the callback routine has -started a delayed suspend), the routine must return a non-zero value. Negative -error return codes are ignored by the PM core. - -The helper functions provided by the PM core, described in Section 4, guarantee -that the following constraints are met with respect to runtime PM callbacks for -one device: - -(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute - ->runtime_suspend() in parallel with ->runtime_resume() or with another - instance of ->runtime_suspend() for the same device) with the exception that - ->runtime_suspend() or ->runtime_resume() can be executed in parallel with - ->runtime_idle() (although ->runtime_idle() will not be started while any - of the other callbacks is being executed for the same device). - -(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' - devices (i.e. the PM core will only execute ->runtime_idle() or - ->runtime_suspend() for the devices the runtime PM status of which is - 'active'). - -(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device - the usage counter of which is equal to zero _and_ either the counter of - 'active' children of which is equal to zero, or the 'power.ignore_children' - flag of which is set. - -(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the - PM core will only execute ->runtime_resume() for the devices the runtime - PM status of which is 'suspended'). - -Additionally, the helper functions provided by the PM core obey the following -rules: - - * If ->runtime_suspend() is about to be executed or there's a pending request - to execute it, ->runtime_idle() will not be executed for the same device. - - * A request to execute or to schedule the execution of ->runtime_suspend() - will cancel any pending requests to execute ->runtime_idle() for the same - device. 
- - * If ->runtime_resume() is about to be executed or there's a pending request - to execute it, the other callbacks will not be executed for the same device. - - * A request to execute ->runtime_resume() will cancel any pending or - scheduled requests to execute the other callbacks for the same device, - except for scheduled autosuspends. - -3. Runtime PM Device Fields - -The following device runtime PM fields are present in 'struct dev_pm_info', as -defined in include/linux/pm.h: - - struct timer_list suspend_timer; - - timer used for scheduling (delayed) suspend and autosuspend requests - - unsigned long timer_expires; - - timer expiration time, in jiffies (if this is different from zero, the - timer is running and will expire at that time, otherwise the timer is not - running) - - struct work_struct work; - - work structure used for queuing up requests (i.e. work items in pm_wq) - - wait_queue_head_t wait_queue; - - wait queue used if any of the helper functions needs to wait for another - one to complete - - spinlock_t lock; - - lock used for synchronization - - atomic_t usage_count; - - the usage counter of the device - - atomic_t child_count; - - the count of 'active' children of the device - - unsigned int ignore_children; - - if set, the value of child_count is ignored (but still updated) - - unsigned int disable_depth; - - used for disabling the helper functions (they work normally if this is - equal to zero); the initial value of it is 1 (i.e. runtime PM is - initially disabled for all devices) - - int runtime_error; - - if set, there was a fatal error (one of the callbacks returned error code - as described in Section 2), so the helper functions will not work until - this flag is cleared; this is the error code returned by the failing - callback - - unsigned int idle_notification; - - if set, ->runtime_idle() is being executed - - unsigned int request_pending; - - if set, there's a pending request (i.e. 
a work item queued up into pm_wq) - - enum rpm_request request; - - type of request that's pending (valid if request_pending is set) - - unsigned int deferred_resume; - - set if ->runtime_resume() is about to be run while ->runtime_suspend() is - being executed for that device and it is not practical to wait for the - suspend to complete; means "start a resume as soon as you've suspended" - - enum rpm_status runtime_status; - - the runtime PM status of the device; this field's initial value is - RPM_SUSPENDED, which means that each device is initially regarded by the - PM core as 'suspended', regardless of its real hardware status - - unsigned int runtime_auto; - - if set, indicates that the user space has allowed the device driver to - power manage the device at run time via the /sys/devices/.../power/control - interface; it may only be modified with the help of the pm_runtime_allow() - and pm_runtime_forbid() helper functions - - unsigned int no_callbacks; - - indicates that the device does not use the runtime PM callbacks (see - Section 8); it may be modified only by the pm_runtime_no_callbacks() - helper function - - unsigned int irq_safe; - - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks - will be invoked with the spinlock held and interrupts disabled - - unsigned int use_autosuspend; - - indicates that the device's driver supports delayed autosuspend (see - Section 9); it may be modified only by the - pm_runtime{_dont}_use_autosuspend() helper functions - - unsigned int timer_autosuspends; - - indicates that the PM core should attempt to carry out an autosuspend - when the timer expires rather than a normal suspend - - int autosuspend_delay; - - the delay time (in milliseconds) to be used for autosuspend - - unsigned long last_busy; - - the time (in jiffies) when the pm_runtime_mark_last_busy() helper - function was last called for this device; used in calculating inactivity - periods for autosuspend - -All of the above fields are members of the 'power' member of 'struct device'. - -4. 
Runtime PM Device Helper Functions - -The following runtime PM helper functions are defined in -drivers/base/power/runtime.c and include/linux/pm_runtime.h: - - void pm_runtime_init(struct device *dev); - - initialize the device runtime PM fields in 'struct dev_pm_info' - - void pm_runtime_remove(struct device *dev); - - make sure that the runtime PM of the device will be disabled after - removing the device from device hierarchy - - int pm_runtime_idle(struct device *dev); - - execute the subsystem-level idle callback for the device; returns an - error code on failure, where -EINPROGRESS means that ->runtime_idle() is - already being executed; if there is no callback or the callback returns 0 - then run pm_runtime_autosuspend(dev) and return its result - - int pm_runtime_suspend(struct device *dev); - - execute the subsystem-level suspend callback for the device; returns 0 on - success, 1 if the device's runtime PM status was already 'suspended', or - error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt - to suspend the device again in future and -EACCES means that - 'power.disable_depth' is different from 0 - - int pm_runtime_autosuspend(struct device *dev); - - same as pm_runtime_suspend() except that the autosuspend delay is taken - into account; if pm_runtime_autosuspend_expiration() says the delay has - not yet expired then an autosuspend is scheduled for the appropriate time - and 0 is returned - - int pm_runtime_resume(struct device *dev); - - execute the subsystem-level resume callback for the device; returns 0 on - success, 1 if the device's runtime PM status was already 'active' or - error code on failure, where -EAGAIN means it may be safe to attempt to - resume the device again in future, but 'power.runtime_error' should be - checked additionally, and -EACCES means that 'power.disable_depth' is - different from 0 - - int pm_request_idle(struct device *dev); - - submit a request to execute the subsystem-level idle callback for the - device (the request is represented by a work item in pm_wq); returns 0 on - success or error code if the request has not been queued up - - int pm_request_autosuspend(struct device *dev); - - schedule the execution of the subsystem-level suspend callback for the - device when the autosuspend delay has expired; if the delay has already - expired then the work item is queued up immediately - - int pm_schedule_suspend(struct device *dev, unsigned int delay); - - schedule the execution of the subsystem-level suspend callback for the - device in future, where 'delay' is the time to wait before queuing up a - suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work - item is queued up immediately); returns 0 on success, 1 if the device's PM - runtime status was already 'suspended', or error code if the request - hasn't been scheduled (or queued up if 'delay' is 0); if the execution of - ->runtime_suspend() is already scheduled and not yet expired, the new - value of 'delay' will be used as the time to wait - - int pm_request_resume(struct device *dev); - - submit a request to execute the subsystem-level resume callback for the - device (the request is represented by a work item in pm_wq); returns 0 on - success, 1 if the device's runtime PM status was already 'active', or - error code if the request hasn't been queued up - - void pm_runtime_get_noresume(struct device *dev); - - increment the device's usage counter - - int pm_runtime_get(struct device *dev); - - increment the device's usage counter, run 
pm_request_resume(dev) and - return its result - - int pm_runtime_get_sync(struct device *dev); - - increment the device's usage counter, run pm_runtime_resume(dev) and - return its result - - int pm_runtime_get_if_in_use(struct device *dev); - - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the - runtime PM status is RPM_ACTIVE and the runtime PM usage counter is - nonzero, increment the counter and return 1; otherwise return 0 without - changing the counter - - void pm_runtime_put_noidle(struct device *dev); - - decrement the device's usage counter - - int pm_runtime_put(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_request_idle(dev) and return its result - - int pm_runtime_put_autosuspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_request_autosuspend(dev) and return its result - - int pm_runtime_put_sync(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_idle(dev) and return its result - - int pm_runtime_put_sync_suspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_suspend(dev) and return its result - - int pm_runtime_put_sync_autosuspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_autosuspend(dev) and return its result - - void pm_runtime_enable(struct device *dev); - - decrement the device's 'power.disable_depth' field; if that field is equal - to zero, the runtime PM helper functions can execute subsystem-level - callbacks described in Section 2 for the device - - int pm_runtime_disable(struct device *dev); - - increment the device's 'power.disable_depth' field (if the value of that - field was previously zero, this prevents subsystem-level runtime PM - callbacks from being run for the device), make sure that all of the - pending runtime PM operations on the device are either completed or - canceled; returns 1 if there was a resume request pending and it was - necessary to execute the subsystem-level resume callback for the device - to satisfy that request, otherwise 0 is returned - - int pm_runtime_barrier(struct device *dev); - - check if there's a resume request pending for the device and resume it - (synchronously) in that case, cancel any other pending runtime PM requests - regarding it and wait for all runtime PM operations on it in progress to - complete; returns 1 if there was a resume request pending and it was - necessary to execute the subsystem-level resume callback for the device to - satisfy that request, otherwise 0 is returned - - void pm_suspend_ignore_children(struct device *dev, bool enable); - - set/unset the power.ignore_children flag of the device - - int pm_runtime_set_active(struct device *dev); - - clear the device's 'power.runtime_error' flag, set the device's runtime - PM status to 'active' and update its parent's counter of 'active' - children as appropriate (it is only valid to use this function if - 'power.runtime_error' is set or 'power.disable_depth' is greater than - zero); it will fail and return error code if the device has a parent - which is not active and the 'power.ignore_children' flag of which is unset - - void pm_runtime_set_suspended(struct device *dev); - - clear the device's 'power.runtime_error' flag, set the device's runtime - PM status to 'suspended' and update its parent's counter of 'active' - children as appropriate (it is only valid to use 
this function if - 'power.runtime_error' is set or 'power.disable_depth' is greater than - zero) - - bool pm_runtime_active(struct device *dev); - - return true if the device's runtime PM status is 'active' or its - 'power.disable_depth' field is not equal to zero, or false otherwise - - bool pm_runtime_suspended(struct device *dev); - - return true if the device's runtime PM status is 'suspended' and its - 'power.disable_depth' field is equal to zero, or false otherwise - - bool pm_runtime_status_suspended(struct device *dev); - - return true if the device's runtime PM status is 'suspended' - - void pm_runtime_allow(struct device *dev); - - set the power.runtime_auto flag for the device and decrease its usage - counter (used by the /sys/devices/.../power/control interface to - effectively allow the device to be power managed at run time) - - void pm_runtime_forbid(struct device *dev); - - unset the power.runtime_auto flag for the device and increase its usage - counter (used by the /sys/devices/.../power/control interface to - effectively prevent the device from being power managed at run time) - - void pm_runtime_no_callbacks(struct device *dev); - - set the power.no_callbacks flag for the device and remove the runtime - PM attributes from /sys/devices/.../power (or prevent them from being - added when the device is registered) - - void pm_runtime_irq_safe(struct device *dev); - - set the power.irq_safe flag for the device, causing the runtime-PM - callbacks to be invoked with interrupts off - - bool pm_runtime_is_irq_safe(struct device *dev); - - return true if power.irq_safe flag was set for the device, causing - the runtime-PM callbacks to be invoked with interrupts off - - void pm_runtime_mark_last_busy(struct device *dev); - - set the power.last_busy field to the current time - - void pm_runtime_use_autosuspend(struct device *dev); - - set the power.use_autosuspend flag, enabling autosuspend delays; call - pm_runtime_get_sync if the flag was previously cleared and - power.autosuspend_delay is negative - - void pm_runtime_dont_use_autosuspend(struct device *dev); - - clear the power.use_autosuspend flag, disabling autosuspend delays; - decrement the device's usage counter if the flag was previously set and - power.autosuspend_delay is negative; call pm_runtime_idle - - void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); - - set the power.autosuspend_delay value to 'delay' (expressed in - milliseconds); if 'delay' is negative then runtime suspends are - prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be - called or the device's usage counter may be decremented and - pm_runtime_idle called depending on if power.autosuspend_delay is - changed to or from a negative value; if power.use_autosuspend is clear, - pm_runtime_idle is called - - unsigned long pm_runtime_autosuspend_expiration(struct device *dev); - - calculate the time when the current autosuspend delay period will expire, - based on power.last_busy and power.autosuspend_delay; if the delay time - is 1000 ms or larger then the expiration time is rounded up to the - nearest second; returns 0 if the delay period has already expired or - power.use_autosuspend isn't set, otherwise returns the expiration time - in jiffies - -It is safe to execute the following helper functions from interrupt context: - -pm_request_idle() -pm_request_autosuspend() -pm_schedule_suspend() -pm_request_resume() -pm_runtime_get_noresume() -pm_runtime_get() -pm_runtime_put_noidle() -pm_runtime_put() 
-pm_runtime_put_autosuspend() -pm_runtime_enable() -pm_suspend_ignore_children() -pm_runtime_set_active() -pm_runtime_set_suspended() -pm_runtime_suspended() -pm_runtime_mark_last_busy() -pm_runtime_autosuspend_expiration() - -If pm_runtime_irq_safe() has been called for a device then the following helper -functions may also be used in interrupt context: - -pm_runtime_idle() -pm_runtime_suspend() -pm_runtime_autosuspend() -pm_runtime_resume() -pm_runtime_get_sync() -pm_runtime_put_sync() -pm_runtime_put_sync_suspend() -pm_runtime_put_sync_autosuspend() - -5. Runtime PM Initialization, Device Probing and Removal - -Initially, the runtime PM is disabled for all devices, which means that the -majority of the runtime PM helper functions described in Section 4 will return --EAGAIN until pm_runtime_enable() is called for the device. - -In addition to that, the initial runtime PM status of all devices is -'suspended', but it need not reflect the actual physical state of the device. -Thus, if the device is initially active (i.e. it is able to process I/O), its -runtime PM status must be changed to 'active', with the help of -pm_runtime_set_active(), before pm_runtime_enable() is called for the device. - -However, if the device has a parent and the parent's runtime PM is enabled, -calling pm_runtime_set_active() for the device will affect the parent, unless -the parent's 'power.ignore_children' flag is set. Namely, in that case the -parent won't be able to suspend at run time, using the PM core's helper -functions, as long as the child's status is 'active', even if the child's -runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for -the child yet or pm_runtime_disable() has been called for it). For this reason, -once pm_runtime_set_active() has been called for the device, pm_runtime_enable() -should be called for it too as soon as reasonably possible or its runtime PM -status should be changed back to 'suspended' with the help of -pm_runtime_set_suspended(). - -If the default initial runtime PM status of the device (i.e. 'suspended') -reflects the actual state of the device, its bus type's or its driver's -->probe() callback will likely need to wake it up using one of the PM core's -helper functions described in Section 4. In that case, pm_runtime_resume() -should be used. Of course, for this purpose the device's runtime PM has to be -enabled earlier by calling pm_runtime_enable(). - -Note, if the device may execute pm_runtime calls during the probe (such as -if it is registers with a subsystem that may call back in) then the -pm_runtime_get_sync() call paired with a pm_runtime_put() call will be -appropriate to ensure that the device is not put back to sleep during the -probe. This can happen with systems such as the network device layer. - -It may be desirable to suspend the device once ->probe() has finished. -Therefore the driver core uses the asynchronous pm_request_idle() to submit a -request to execute the subsystem-level idle callback for the device at that -time. A driver that makes use of the runtime autosuspend feature, may want to -update the last busy mark before returning from ->probe(). - -Moreover, the driver core prevents runtime PM callbacks from racing with the bus -notifier callback in __device_release_driver(), which is necessary, because the -notifier is used by some subsystems to carry out operations affecting the -runtime PM functionality. 
It does so by calling pm_runtime_get_sync() before -driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This -resumes the device if it's in the suspended state and prevents it from -being suspended again while those routines are being executed. - -To allow bus types and drivers to put devices into the suspended state by -calling pm_runtime_suspend() from their ->remove() routines, the driver core -executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER -notifications in __device_release_driver(). This requires bus types and -drivers to make their ->remove() callbacks avoid races with runtime PM directly, -but also it allows of more flexibility in the handling of devices during the -removal of their drivers. - -Drivers in ->remove() callback should undo the runtime PM changes done -in ->probe(). Usually this means calling pm_runtime_disable(), -pm_runtime_dont_use_autosuspend() etc. - -The user space can effectively disallow the driver of the device to power manage -it at run time by changing the value of its /sys/devices/.../power/control -attribute to "on", which causes pm_runtime_forbid() to be called. In principle, -this mechanism may also be used by the driver to effectively turn off the -runtime power management of the device until the user space turns it on. -Namely, during the initialization the driver can make sure that the runtime PM -status of the device is 'active' and call pm_runtime_forbid(). It should be -noted, however, that if the user space has already intentionally changed the -value of /sys/devices/.../power/control to "auto" to allow the driver to power -manage the device at run time, the driver may confuse it by using -pm_runtime_forbid() this way. - -6. Runtime PM and System Sleep - -Runtime PM and system sleep (i.e., system suspend and hibernation, also known -as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of -ways. If a device is active when a system sleep starts, everything is -straightforward. But what should happen if the device is already suspended? - -The device may have different wake-up settings for runtime PM and system sleep. -For example, remote wake-up may be enabled for runtime suspend but disallowed -for system sleep (device_may_wakeup(dev) returns 'false'). When this happens, -the subsystem-level system suspend callback is responsible for changing the -device's wake-up setting (it may leave that to the device driver's system -suspend routine). It may be necessary to resume the device and suspend it again -in order to do so. The same is true if the driver uses different power levels -or other settings for runtime suspend and system sleep. - -During system resume, the simplest approach is to bring all devices back to full -power, even if they had been suspended before the system suspend began. There -are several reasons for this, including: - - * The device might need to switch power levels, wake-up settings, etc. - - * Remote wake-up events might have been lost by the firmware. - - * The device's children may need the device to be at full power in order - to resume themselves. - - * The driver's idea of the device state may not agree with the device's - physical state. This can happen during resume from hibernation. - - * The device might need to be reset. - - * Even though the device was suspended, if its usage counter was > 0 then most - likely it would need a runtime resume in the near future anyway. 
- -If the device had been suspended before the system suspend began and it's -brought back to full power during resume, then its runtime PM status will have -to be updated to reflect the actual post-system sleep status. The way to do -this is: - - pm_runtime_disable(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - -The PM core always increments the runtime usage counter before calling the -->suspend() callback and decrements it after calling the ->resume() callback. -Hence disabling runtime PM temporarily like this will not cause any runtime -suspend attempts to be permanently lost. If the usage count goes to zero -following the return of the ->resume() callback, the ->runtime_idle() callback -will be invoked as usual. - -On some systems, however, system sleep is not entered through a global firmware -or hardware operation. Instead, all hardware components are put into low-power -states directly by the kernel in a coordinated way. Then, the system sleep -state effectively follows from the states the hardware components end up in -and the system is woken up from that state by a hardware interrupt or a similar -mechanism entirely under the kernel's control. As a result, the kernel never -gives control away and the states of all devices during resume are precisely -known to it. If that is the case and none of the situations listed above takes -place (in particular, if the system is not waking up from hibernation), it may -be more efficient to leave the devices that had been suspended before the system -suspend began in the suspended state. - -To this end, the PM core provides a mechanism allowing some coordination between -different levels of device hierarchy. Namely, if a system suspend .prepare() -callback returns a positive number for a device, that indicates to the PM core -that the device appears to be runtime-suspended and its state is fine, so it -may be left in runtime suspend provided that all of its descendants are also -left in runtime suspend. If that happens, the PM core will not execute any -system suspend and resume callbacks for all of those devices, except for the -complete callback, which is then entirely responsible for handling the device -as appropriate. This only applies to system suspend transitions that are not -related to hibernation (see Documentation/driver-api/pm/devices.rst for more -information). - -The PM core does its best to reduce the probability of race conditions between -the runtime PM and system suspend/resume (and hibernation) callbacks by carrying -out the following operations: - - * During system suspend pm_runtime_get_noresume() is called for every device - right before executing the subsystem-level .prepare() callback for it and - pm_runtime_barrier() is called for every device right before executing the - subsystem-level .suspend() callback for it. In addition to that the PM core - calls __pm_runtime_disable() with 'false' as the second argument for every - device right before executing the subsystem-level .suspend_late() callback - for it. - - * During system resume pm_runtime_enable() and pm_runtime_put() are called for - every device right after executing the subsystem-level .resume_early() - callback and right after executing the subsystem-level .complete() callback - for it, respectively. - -7. 
Generic subsystem callbacks - -Subsystems may wish to conserve code space by using the set of generic power -management callbacks provided by the PM core, defined in -driver/base/power/generic_ops.c: - - int pm_generic_runtime_suspend(struct device *dev); - - invoke the ->runtime_suspend() callback provided by the driver of this - device and return its result, or return 0 if not defined - - int pm_generic_runtime_resume(struct device *dev); - - invoke the ->runtime_resume() callback provided by the driver of this - device and return its result, or return 0 if not defined - - int pm_generic_suspend(struct device *dev); - - if the device has not been suspended at run time, invoke the ->suspend() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_suspend_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_resume(struct device *dev); - - invoke the ->resume() callback provided by the driver of this device and, - if successful, change the device's runtime PM status to 'active' - - int pm_generic_resume_noirq(struct device *dev); - - invoke the ->resume_noirq() callback provided by the driver of this device - - int pm_generic_freeze(struct device *dev); - - if the device has not been suspended at run time, invoke the ->freeze() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_freeze_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_thaw(struct device *dev); - - if the device has not been suspended at run time, invoke the ->thaw() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_thaw_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_poweroff(struct device *dev); - - if the device has not been suspended at run time, invoke the ->poweroff() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_poweroff_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_restore(struct device *dev); - - invoke the ->restore() callback provided by the driver of this device and, - if successful, change the device's runtime PM status to 'active' - - int pm_generic_restore_noirq(struct device *dev); - - invoke the ->restore_noirq() callback provided by the device's driver - -These functions are the defaults used by the PM core, if a subsystem doesn't -provide its own callbacks for ->runtime_idle(), ->runtime_suspend(), -->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(), -->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(), -->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the -subsystem-level dev_pm_ops structure. 
- -Device drivers that wish to use the same function as a system suspend, freeze, -poweroff and runtime suspend callback, and similarly for system resume, thaw, -restore, and runtime resume, can achieve this with the help of the -UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its -last argument to NULL). - -8. "No-Callback" Devices - -Some "devices" are only logical sub-devices of their parent and cannot be -power-managed on their own. (The prototype example is a USB interface. Entire -USB devices can go into low-power mode or send wake-up requests, but neither is -possible for individual interfaces.) The drivers for these devices have no -need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend() -and ->runtime_resume() would always return 0 without doing anything else and -->runtime_idle() would always call pm_runtime_suspend(). - -Subsystems can tell the PM core about these devices by calling -pm_runtime_no_callbacks(). This should be done after the device structure is -initialized and before it is registered (although after device registration is -also okay). The routine will set the device's power.no_callbacks flag and -prevent the non-debugging runtime PM sysfs attributes from being created. - -When power.no_callbacks is set, the PM core will not invoke the -->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks. -Instead it will assume that suspends and resumes always succeed and that idle -devices should be suspended. - -As a consequence, the PM core will never directly inform the device's subsystem -or driver about runtime power changes. Instead, the driver for the device's -parent must take responsibility for telling the device's driver when the -parent's power state changes. - -9. Autosuspend, or automatically-delayed suspends - -Changing a device's power state isn't free; it requires both time and energy. -A device should be put in a low-power state only when there's some reason to -think it will remain in that state for a substantial time. A common heuristic -says that a device which hasn't been used for a while is liable to remain -unused; following this advice, drivers should not allow devices to be suspended -at runtime until they have been inactive for some minimum period. Even when -the heuristic ends up being non-optimal, it will still prevent devices from -"bouncing" too rapidly between low-power and full-power states. - -The term "autosuspend" is an historical remnant. It doesn't mean that the -device is automatically suspended (the subsystem or driver still has to call -the appropriate PM routines); rather it means that runtime suspends will -automatically be delayed until the desired period of inactivity has elapsed. - -Inactivity is determined based on the power.last_busy field. Drivers should -call pm_runtime_mark_last_busy() to update this field after carrying out I/O, -typically just before calling pm_runtime_put_autosuspend(). The desired length -of the inactivity period is a matter of policy. Subsystems can set this length -initially by calling pm_runtime_set_autosuspend_delay(), but after device -registration the length should be controlled by user space, using the -/sys/devices/.../power/autosuspend_delay_ms attribute. 
- -In order to use autosuspend, subsystems or drivers must call -pm_runtime_use_autosuspend() (preferably before registering the device), and -thereafter they should use the various *_autosuspend() helper functions instead -of the non-autosuspend counterparts: - - Instead of: pm_runtime_suspend use: pm_runtime_autosuspend; - Instead of: pm_schedule_suspend use: pm_request_autosuspend; - Instead of: pm_runtime_put use: pm_runtime_put_autosuspend; - Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend. - -Drivers may also continue to use the non-autosuspend helper functions; they -will behave normally, which means sometimes taking the autosuspend delay into -account (see pm_runtime_idle). - -Under some circumstances a driver or subsystem may want to prevent a device -from autosuspending immediately, even though the usage counter is zero and the -autosuspend delay time has expired. If the ->runtime_suspend() callback -returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is -in the future (as it normally would be if the callback invoked -pm_runtime_mark_last_busy()), the PM core will automatically reschedule the -autosuspend. The ->runtime_suspend() callback can't do this rescheduling -itself because no suspend requests of any kind are accepted while the device is -suspending (i.e., while the callback is running). - -The implementation is well suited for asynchronous use in interrupt contexts. -However such use inevitably involves races, because the PM core can't -synchronize ->runtime_suspend() callbacks with the arrival of I/O requests. -This synchronization must be handled by the driver, using its private lock. -Here is a schematic pseudo-code example: - - foo_read_or_write(struct foo_priv *foo, void *data) - { - lock(&foo->private_lock); - add_request_to_io_queue(foo, data); - if (foo->num_pending_requests++ == 0) - pm_runtime_get(&foo->dev); - if (!foo->is_suspended) - foo_process_next_request(foo); - unlock(&foo->private_lock); - } - - foo_io_completion(struct foo_priv *foo, void *req) - { - lock(&foo->private_lock); - if (--foo->num_pending_requests == 0) { - pm_runtime_mark_last_busy(&foo->dev); - pm_runtime_put_autosuspend(&foo->dev); - } else { - foo_process_next_request(foo); - } - unlock(&foo->private_lock); - /* Send req result back to the user ... */ - } - - int foo_runtime_suspend(struct device *dev) - { - struct foo_priv foo = container_of(dev, ...); - int ret = 0; - - lock(&foo->private_lock); - if (foo->num_pending_requests > 0) { - ret = -EBUSY; - } else { - /* ... suspend the device ... */ - foo->is_suspended = 1; - } - unlock(&foo->private_lock); - return ret; - } - - int foo_runtime_resume(struct device *dev) - { - struct foo_priv foo = container_of(dev, ...); - - lock(&foo->private_lock); - /* ... resume the device ... */ - foo->is_suspended = 0; - pm_runtime_mark_last_busy(&foo->dev); - if (foo->num_pending_requests > 0) - foo_process_next_request(foo); - unlock(&foo->private_lock); - return 0; - } - -The important point is that after foo_io_completion() asks for an autosuspend, -the foo_runtime_suspend() callback may race with foo_read_or_write(). -Therefore foo_runtime_suspend() has to check whether there are any pending I/O -requests (while holding the private lock) before allowing the suspend to -proceed. - -In addition, the power.autosuspend_delay field can be changed by user space at -any time. 
If a driver cares about this, it can call
-pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
-callback while holding its private lock. If the function returns a nonzero
-value then the delay has not yet expired and the callback should return
--EAGAIN.
diff --git a/Documentation/power/s2ram.rst b/Documentation/power/s2ram.rst
new file mode 100644
index 000000000000..d739aa7c742c
--- /dev/null
+++ b/Documentation/power/s2ram.rst
@@ -0,0 +1,87 @@
+========================
+How to get s2ram working
+========================
+
+2006 Linus Torvalds
+2006 Pavel Machek
+
+1) Check suspend.sf.net; the s2ram program there has a long whitelist of
+   "known ok" machines, along with tricks to use on each one.
+
+2) If that does not help, try reading tricks.txt and
+   video.txt. Perhaps the problem is as simple as a broken module, and
+   a simple module unload can fix it.
+
+3) You can use Linus' TRACE_RESUME infrastructure, described below.
+
+Using TRACE_RESUME
+~~~~~~~~~~~~~~~~~~
+
+I've been working on making the machines I have able to STR, and almost
+always it's a driver that is buggy. Thank God for the suspend/resume
+debugging - the thing that Chuck tried to disable. That's often the _only_
+way to debug these things, and it's actually pretty powerful (but
+time-consuming - having to insert TRACE_RESUME() markers into the device
+driver that doesn't resume and recompile and reboot).
+
+Anyway, the way to debug this for people who are interested (have a
+machine that doesn't boot) is:
+
+ - enable PM_DEBUG, and PM_TRACE
+
+ - use a script like this::
+
+	#!/bin/sh
+	sync
+	echo 1 > /sys/power/pm_trace
+	echo mem > /sys/power/state
+
+   to suspend
+
+ - if it doesn't come back up (which is usually the problem), reboot by
+   holding the power button down, and look at the dmesg output for things
+   like::
+
+	Magic number: 4:156:725
+	hash matches drivers/base/power/resume.c:28
+	hash matches device 0000:01:00.0
+
+   which means that the last trace event was just before trying to resume
+   device 0000:01:00.0. Then figure out what driver is controlling that
+   device (lspci and /sys/devices/pci* are your friends), and see if you can
+   fix it, disable it, or trace into its resume function.
+
+   If no device matches the hash (or any matches appear to be false positives),
+   the culprit may be a device from a loadable kernel module that is not loaded
+   until after the hash is checked. You can check the hash against the current
+   devices again after more modules are loaded using sysfs::
+
+	cat /sys/power/pm_trace_dev_match
+
+For example, the above happens to be the VGA device on my EVO, which I
+used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
+that "radeonfb" simply cannot resume that device - it tries to set the
+PLL's, and it just _hangs_. Using the regular VGA console and letting X
+resume it instead works fine.
+
+NOTE
+====
+pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
+The reason for this is that the RTC is the only reliably available piece of
+hardware during resume operations where a value can be set that will
+survive a reboot.
+
+pm_trace is not compatible with asynchronous suspend, so it turns
+asynchronous suspend off (which may work around timing or
+ordering-sensitive bugs).
+
+The consequence is that after a resume (even if it is successful) your system
+clock will have a value corresponding to the magic number instead of the
+correct date/time!
It is therefore advisable to use a program like ntp-date +or rdate to reset the correct date/time from an external time source when +using this trace option. + +As the clock keeps ticking it is also essential that the reboot is done +quickly after the resume failure. The trace option does not use the seconds +or the low order bits of the minutes of the RTC, but a too long delay will +corrupt the magic value. diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt deleted file mode 100644 index 4685aee197fd..000000000000 --- a/Documentation/power/s2ram.txt +++ /dev/null @@ -1,85 +0,0 @@ - How to get s2ram working - ~~~~~~~~~~~~~~~~~~~~~~~~ - 2006 Linus Torvalds - 2006 Pavel Machek - -1) Check suspend.sf.net, program s2ram there has long whitelist of - "known ok" machines, along with tricks to use on each one. - -2) If that does not help, try reading tricks.txt and - video.txt. Perhaps problem is as simple as broken module, and - simple module unload can fix it. - -3) You can use Linus' TRACE_RESUME infrastructure, described below. - - Using TRACE_RESUME - ~~~~~~~~~~~~~~~~~~ - -I've been working at making the machines I have able to STR, and almost -always it's a driver that is buggy. Thank God for the suspend/resume -debugging - the thing that Chuck tried to disable. That's often the _only_ -way to debug these things, and it's actually pretty powerful (but -time-consuming - having to insert TRACE_RESUME() markers into the device -driver that doesn't resume and recompile and reboot). - -Anyway, the way to debug this for people who are interested (have a -machine that doesn't boot) is: - - - enable PM_DEBUG, and PM_TRACE - - - use a script like this: - - #!/bin/sh - sync - echo 1 > /sys/power/pm_trace - echo mem > /sys/power/state - - to suspend - - - if it doesn't come back up (which is usually the problem), reboot by - holding the power button down, and look at the dmesg output for things - like - - Magic number: 4:156:725 - hash matches drivers/base/power/resume.c:28 - hash matches device 0000:01:00.0 - - which means that the last trace event was just before trying to resume - device 0000:01:00.0. Then figure out what driver is controlling that - device (lspci and /sys/devices/pci* is your friend), and see if you can - fix it, disable it, or trace into its resume function. - - If no device matches the hash (or any matches appear to be false positives), - the culprit may be a device from a loadable kernel module that is not loaded - until after the hash is checked. You can check the hash against the current - devices again after more modules are loaded using sysfs: - - cat /sys/power/pm_trace_dev_match - -For example, the above happens to be the VGA device on my EVO, which I -used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out -that "radeonfb" simply cannot resume that device - it tries to set the -PLL's, and it just _hangs_. Using the regular VGA console and letting X -resume it instead works fine. - -NOTE -==== -pm_trace uses the system's Real Time Clock (RTC) to save the magic number. -Reason for this is that the RTC is the only reliably available piece of -hardware during resume operations where a value can be set that will -survive a reboot. - -pm_trace is not compatible with asynchronous suspend, so it turns -asynchronous suspend off (which may work around timing or -ordering-sensitive bugs). 
- -Consequence is that after a resume (even if it is successful) your system -clock will have a value corresponding to the magic number instead of the -correct date/time! It is therefore advisable to use a program like ntp-date -or rdate to reset the correct date/time from an external time source when -using this trace option. - -As the clock keeps ticking it is also essential that the reboot is done -quickly after the resume failure. The trace option does not use the seconds -or the low order bits of the minutes of the RTC, but a too long delay will -corrupt the magic value. diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst new file mode 100644 index 000000000000..7ac8e1f549f4 --- /dev/null +++ b/Documentation/power/suspend-and-cpuhotplug.rst @@ -0,0 +1,286 @@ +==================================================================== +Interaction of Suspend code (S3) with the CPU hotplug infrastructure +==================================================================== + +(C) 2011 - 2014 Srivatsa S. Bhat + + +I. Differences between CPU hotplug and Suspend-to-RAM +====================================================== + +How does the regular CPU hotplug code differ from how the Suspend-to-RAM +infrastructure uses it internally? And where do they share common code? + +Well, a picture is worth a thousand words... So ASCII art follows :-) + +[This depicts the current design in the kernel, and focusses only on the +interactions involving the freezer and CPU hotplug and also tries to explain +the locking involved. It outlines the notifications involved as well. +But please note that here, only the call paths are illustrated, with the aim +of describing where they take different paths and where they share code. +What happens when regular CPU hotplug and Suspend-to-RAM race with each other +is not depicted here.] + +On a high level, the suspend-resume cycle goes like this:: + + |Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw | + |tasks | | cpus | | | | cpus | |tasks| + + +More details follow:: + + Suspend call path + ----------------- + + Write 'mem' to + /sys/power/state + sysfs file + | + v + Acquire system_transition_mutex lock + | + v + Send PM_SUSPEND_PREPARE + notifications + | + v + Freeze tasks + | + | + v + disable_nonboot_cpus() + /* start */ + | + v + Acquire cpu_add_remove_lock + | + v + Iterate over CURRENTLY + online CPUs + | + | + | ---------- + v | L + ======> _cpu_down() | + | [This takes cpuhotplug.lock | + Common | before taking down the CPU | + code | and releases it when done] | O + | While it is at it, notifications | + | are sent when notable events occur, | + ======> by running all registered callbacks. | + | | O + | | + | | + v | + Note down these cpus in | P + frozen_cpus mask ---------- + | + v + Disable regular cpu hotplug + by increasing cpu_hotplug_disabled + | + v + Release cpu_add_remove_lock + | + v + /* disable_nonboot_cpus() complete */ + | + v + Do suspend + + + +Resuming back is likewise, with the counterparts being (in the order of +execution during resume): + +* enable_nonboot_cpus() which involves:: + + | Acquire cpu_add_remove_lock + | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug + | Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop] + | Release cpu_add_remove_lock + v + +* thaw tasks +* send PM_POST_SUSPEND notifications +* Release system_transition_mutex lock. 
+
+
+It is to be noted here that the system_transition_mutex lock is acquired at the very
+beginning, when we are just starting out to suspend, and then released only
+after the entire cycle is complete (i.e., suspend + resume).
+
+::
+
+
+
+          Regular CPU hotplug call path
+          -----------------------------
+
+                                Write 0 (or 1) to
+                       /sys/devices/system/cpu/cpu*/online
+                                   sysfs file
+                                        |
+                                        |
+                                        v
+                                    cpu_down()
+                                        |
+                                        v
+                           Acquire cpu_add_remove_lock
+                                        |
+                                        v
+                          If cpu_hotplug_disabled > 0
+                                return gracefully
+                                        |
+                                        |
+                                        v
+                             ======>   _cpu_down()
+                            |          [This takes cpuhotplug.lock
+                 Common     |           before taking down the CPU
+                  code      |           and releases it when done]
+                            |          While it is at it, notifications
+                            |          are sent when notable events occur,
+                             ======>   by running all registered callbacks.
+                                        |
+                                        |
+                                        v
+                           Release cpu_add_remove_lock
+                                [That's it!, for
+                               regular CPU hotplug]
+
+
+
+So, as can be seen from the two diagrams (the parts marked as "Common code"),
+regular CPU hotplug and the suspend code path converge at the _cpu_down() and
+_cpu_up() functions. They differ in the arguments passed to these functions,
+in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
+argument. But during suspend, since the tasks are already frozen by the time
+the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
+with the 'tasks_frozen' argument set to 1.
+[See below for some known issues regarding this.]
+
+
+Important files and functions/entry points:
+-------------------------------------------
+
+- kernel/power/process.c : freeze_processes(), thaw_processes()
+- kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
+- kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
+
+
+
+II. What are the issues involved in CPU hotplug?
+------------------------------------------------
+
+There are some interesting situations involving CPU hotplug and microcode
+update on the CPUs, as discussed below:
+
+[Please bear in mind that the kernel requests the microcode images from
+userspace, using the request_firmware() function defined in
+drivers/base/firmware_loader/main.c]
+
+
+a. When all the CPUs are identical:
+
+   This is the most common situation and it is quite straightforward: we want
+   to apply the same microcode revision to each of the CPUs.
+   To give an example of x86, the collect_cpu_info() function defined in
+   arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
+   and thereby in applying the correct microcode revision to it.
+   But note that the kernel does not maintain a common microcode image for
+   all CPUs, in order to handle case 'b' described below.
+
+
+b. When some of the CPUs are different from the rest:
+
+   In this case since we probably need to apply different microcode revisions
+   to different CPUs, the kernel maintains a copy of the correct microcode
+   image for each CPU (after appropriate CPU type/model discovery using
+   functions such as collect_cpu_info()).
+
+
+c. When a CPU is physically hot-unplugged and a new (and possibly different
+   type of) CPU is hot-plugged into the system:
+
+   In the current design of the kernel, whenever a CPU is taken offline during
+   a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
+   (which is sent by the CPU hotplug code), the microcode update driver's
+   callback for that event reacts by freeing the kernel's copy of the
+   microcode image for that CPU.
+
+   Hence, when a new CPU is brought online, since the kernel finds that it
+   doesn't have the microcode image, it does the CPU type/model discovery
+   afresh and then requests userspace for the appropriate microcode image
+   for that CPU, which is subsequently applied.
+
+   For example, in x86, the mc_cpu_callback() function (which is the microcode
+   update driver's callback registered for CPU hotplug events) calls
+   microcode_update_cpu() which would call microcode_init_cpu() in this case,
+   instead of microcode_resume_cpu() when it finds that the kernel doesn't
+   have a valid microcode image. This ensures that the CPU type/model
+   discovery is performed and the right microcode is applied to the CPU after
+   getting it from userspace.
+
+
+d. Handling microcode update during suspend/hibernate:
+
+   Strictly speaking, during a CPU hotplug operation which does not involve
+   physically removing or inserting CPUs, the CPUs are not actually powered
+   off during a CPU offline. They are just put to the lowest C-states possible.
+   Hence, in such a case, it is not really necessary to re-apply microcode
+   when the CPUs are brought back online, since they wouldn't have lost the
+   image during the CPU offline operation.
+
+   This is the usual scenario encountered during a resume after a suspend.
+   However, in the case of hibernation, since all the CPUs are completely
+   powered off, during restore it becomes necessary to apply the microcode
+   images to all the CPUs.
+
+   [Note that we don't expect someone to physically pull out nodes and insert
+   nodes with a different type of CPUs in-between a suspend-resume or a
+   hibernate/restore cycle.]
+
+   In the current design of the kernel however, during a CPU offline operation
+   as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
+   the existing copy of the microcode image in the kernel is not freed up.
+   And during the CPU online operations (during resume/restore), since the
+   kernel finds that it already has copies of the microcode images for all the
+   CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
+   type/model and the need for validating whether the microcode revisions are
+   right for the CPUs or not (due to the above assumption that physical CPU
+   hotplug will not be done in-between suspend/resume or hibernate/restore
+   cycles).
+
+
+III. Known problems
+===================
+
+Are there any known problems when regular CPU hotplug and suspend race
+with each other?
+
+Yes, they are listed below:
+
+1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
+   the _cpu_down() and _cpu_up() functions is *always* 0.
+   This might not reflect the true current state of the system, since the
+   tasks could have been frozen by an out-of-band event such as a suspend
+   operation in progress. Hence, the cpuhp_tasks_frozen variable will not
+   reflect the frozen state and the CPU hotplug callbacks which evaluate
+   that variable might execute the wrong code path.
+
+2. If a regular CPU hotplug stress test happens to race with the freezer due
+   to a suspend operation in progress at the same time, then we could hit the
+   situation described below:
+
+    * A regular cpu online operation continues its journey from userspace
+      into the kernel, since the freezing has not yet begun.
+    * Then the freezer gets to work and freezes userspace.
+ * If cpu online has not yet completed the microcode update stuff by now, + it will now start waiting on the frozen userspace in the + TASK_UNINTERRUPTIBLE state, in order to get the microcode image. + * Now the freezer continues and tries to freeze the remaining tasks. But + due to this wait mentioned above, the freezer won't be able to freeze + the cpu online hotplug task and hence freezing of tasks fails. + + As a result of this task freezing failure, the suspend operation gets + aborted. diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt deleted file mode 100644 index a8751b8df10e..000000000000 --- a/Documentation/power/suspend-and-cpuhotplug.txt +++ /dev/null @@ -1,274 +0,0 @@ -Interaction of Suspend code (S3) with the CPU hotplug infrastructure - - (C) 2011 - 2014 Srivatsa S. Bhat - - -I. How does the regular CPU hotplug code differ from how the Suspend-to-RAM - infrastructure uses it internally? And where do they share common code? - -Well, a picture is worth a thousand words... So ASCII art follows :-) - -[This depicts the current design in the kernel, and focusses only on the -interactions involving the freezer and CPU hotplug and also tries to explain -the locking involved. It outlines the notifications involved as well. -But please note that here, only the call paths are illustrated, with the aim -of describing where they take different paths and where they share code. -What happens when regular CPU hotplug and Suspend-to-RAM race with each other -is not depicted here.] - -On a high level, the suspend-resume cycle goes like this: - -|Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw | -|tasks | | cpus | | | | cpus | |tasks| - - -More details follow: - - Suspend call path - ----------------- - - Write 'mem' to - /sys/power/state - sysfs file - | - v - Acquire system_transition_mutex lock - | - v - Send PM_SUSPEND_PREPARE - notifications - | - v - Freeze tasks - | - | - v - disable_nonboot_cpus() - /* start */ - | - v - Acquire cpu_add_remove_lock - | - v - Iterate over CURRENTLY - online CPUs - | - | - | ---------- - v | L - ======> _cpu_down() | - | [This takes cpuhotplug.lock | - Common | before taking down the CPU | - code | and releases it when done] | O - | While it is at it, notifications | - | are sent when notable events occur, | - ======> by running all registered callbacks. | - | | O - | | - | | - v | - Note down these cpus in | P - frozen_cpus mask ---------- - | - v - Disable regular cpu hotplug - by increasing cpu_hotplug_disabled - | - v - Release cpu_add_remove_lock - | - v - /* disable_nonboot_cpus() complete */ - | - v - Do suspend - - - -Resuming back is likewise, with the counterparts being (in the order of -execution during resume): -* enable_nonboot_cpus() which involves: - | Acquire cpu_add_remove_lock - | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug - | Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop] - | Release cpu_add_remove_lock - v - -* thaw tasks -* send PM_POST_SUSPEND notifications -* Release system_transition_mutex lock. - - -It is to be noted here that the system_transition_mutex lock is acquired at the very -beginning, when we are just starting out to suspend, and then released only -after the entire cycle is complete (i.e., suspend + resume). 
- - - - Regular CPU hotplug call path - ----------------------------- - - Write 0 (or 1) to - /sys/devices/system/cpu/cpu*/online - sysfs file - | - | - v - cpu_down() - | - v - Acquire cpu_add_remove_lock - | - v - If cpu_hotplug_disabled > 0 - return gracefully - | - | - v - ======> _cpu_down() - | [This takes cpuhotplug.lock - Common | before taking down the CPU - code | and releases it when done] - | While it is at it, notifications - | are sent when notable events occur, - ======> by running all registered callbacks. - | - | - v - Release cpu_add_remove_lock - [That's it!, for - regular CPU hotplug] - - - -So, as can be seen from the two diagrams (the parts marked as "Common code"), -regular CPU hotplug and the suspend code path converge at the _cpu_down() and -_cpu_up() functions. They differ in the arguments passed to these functions, -in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen' -argument. But during suspend, since the tasks are already frozen by the time -the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called -with the 'tasks_frozen' argument set to 1. -[See below for some known issues regarding this.] - - -Important files and functions/entry points: ------------------------------------------- - -kernel/power/process.c : freeze_processes(), thaw_processes() -kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish() -kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus() - - - -II. What are the issues involved in CPU hotplug? - ------------------------------------------- - -There are some interesting situations involving CPU hotplug and microcode -update on the CPUs, as discussed below: - -[Please bear in mind that the kernel requests the microcode images from -userspace, using the request_firmware() function defined in -drivers/base/firmware_loader/main.c] - - -a. When all the CPUs are identical: - - This is the most common situation and it is quite straightforward: we want - to apply the same microcode revision to each of the CPUs. - To give an example of x86, the collect_cpu_info() function defined in - arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU - and thereby in applying the correct microcode revision to it. - But note that the kernel does not maintain a common microcode image for the - all CPUs, in order to handle case 'b' described below. - - -b. When some of the CPUs are different than the rest: - - In this case since we probably need to apply different microcode revisions - to different CPUs, the kernel maintains a copy of the correct microcode - image for each CPU (after appropriate CPU type/model discovery using - functions such as collect_cpu_info()). - - -c. When a CPU is physically hot-unplugged and a new (and possibly different - type of) CPU is hot-plugged into the system: - - In the current design of the kernel, whenever a CPU is taken offline during - a regular CPU hotplug operation, upon receiving the CPU_DEAD notification - (which is sent by the CPU hotplug code), the microcode update driver's - callback for that event reacts by freeing the kernel's copy of the - microcode image for that CPU. - - Hence, when a new CPU is brought online, since the kernel finds that it - doesn't have the microcode image, it does the CPU type/model discovery - afresh and then requests the userspace for the appropriate microcode image - for that CPU, which is subsequently applied. 
- - For example, in x86, the mc_cpu_callback() function (which is the microcode - update driver's callback registered for CPU hotplug events) calls - microcode_update_cpu() which would call microcode_init_cpu() in this case, - instead of microcode_resume_cpu() when it finds that the kernel doesn't - have a valid microcode image. This ensures that the CPU type/model - discovery is performed and the right microcode is applied to the CPU after - getting it from userspace. - - -d. Handling microcode update during suspend/hibernate: - - Strictly speaking, during a CPU hotplug operation which does not involve - physically removing or inserting CPUs, the CPUs are not actually powered - off during a CPU offline. They are just put to the lowest C-states possible. - Hence, in such a case, it is not really necessary to re-apply microcode - when the CPUs are brought back online, since they wouldn't have lost the - image during the CPU offline operation. - - This is the usual scenario encountered during a resume after a suspend. - However, in the case of hibernation, since all the CPUs are completely - powered off, during restore it becomes necessary to apply the microcode - images to all the CPUs. - - [Note that we don't expect someone to physically pull out nodes and insert - nodes with a different type of CPUs in-between a suspend-resume or a - hibernate/restore cycle.] - - In the current design of the kernel however, during a CPU offline operation - as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set), - the existing copy of microcode image in the kernel is not freed up. - And during the CPU online operations (during resume/restore), since the - kernel finds that it already has copies of the microcode images for all the - CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU - type/model and the need for validating whether the microcode revisions are - right for the CPUs or not (due to the above assumption that physical CPU - hotplug will not be done in-between suspend/resume or hibernate/restore - cycles). - - -III. Are there any known problems when regular CPU hotplug and suspend race - with each other? - -Yes, they are listed below: - -1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to - the _cpu_down() and _cpu_up() functions is *always* 0. - This might not reflect the true current state of the system, since the - tasks could have been frozen by an out-of-band event such as a suspend - operation in progress. Hence, the cpuhp_tasks_frozen variable will not - reflect the frozen state and the CPU hotplug callbacks which evaluate - that variable might execute the wrong code path. - -2. If a regular CPU hotplug stress test happens to race with the freezer due - to a suspend operation in progress at the same time, then we could hit the - situation described below: - - * A regular cpu online operation continues its journey from userspace - into the kernel, since the freezing has not yet begun. - * Then freezer gets to work and freezes userspace. - * If cpu online has not yet completed the microcode update stuff by now, - it will now start waiting on the frozen userspace in the - TASK_UNINTERRUPTIBLE state, in order to get the microcode image. - * Now the freezer continues and tries to freeze the remaining tasks. But - due to this wait mentioned above, the freezer won't be able to freeze - the cpu online hotplug task and hence freezing of tasks fails. - - As a result of this task freezing failure, the suspend operation gets - aborted. 
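The regular CPU hotplug call path described in the document above is driven
from user space entirely through sysfs. A minimal C sketch of that trigger
(illustrative only; CPU 1 is an arbitrary choice, and CPU 0 is often not
removable):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Write "0" (offline) or "1" (online) to a CPU's sysfs control
	 * file, taking it through cpu_down()/cpu_up(); requires root. */
	static int set_cpu_online(int cpu, int online)
	{
		char path[64];
		int fd;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/online", cpu);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		if (write(fd, online ? "1" : "0", 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		if (set_cpu_online(1, 0) || set_cpu_online(1, 1)) {
			perror("cpu1 hotplug");
			return 1;
		}
		return 0;
	}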
diff --git a/Documentation/power/suspend-and-interrupts.rst b/Documentation/power/suspend-and-interrupts.rst
new file mode 100644
index 000000000000..4cda6617709a
--- /dev/null
+++ b/Documentation/power/suspend-and-interrupts.rst
@@ -0,0 +1,137 @@
+====================================
+System Suspend and Device Interrupts
+====================================
+
+Copyright (C) 2014 Intel Corp.
+Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+Suspending and Resuming Device IRQs
+-----------------------------------
+
+Device interrupt request lines (IRQs) are generally disabled during system
+suspend after the "late" phase of suspending devices (that is, after all of the
+->prepare, ->suspend and ->suspend_late callbacks have been executed for all
+devices). That is done by suspend_device_irqs().
+
+The rationale for doing so is that after the "late" phase of device suspend
+there is no legitimate reason why any interrupts from suspended devices should
+trigger, and if any devices have not been suspended properly yet, it is better
+to block interrupts from them anyway. Also, in the past we had problems with
+interrupt handlers for shared IRQs whose device drivers were not prepared for
+interrupts triggering after their devices had been suspended. In some cases
+they would attempt to access, for example, memory address spaces of suspended
+devices and cause unpredictable behavior to ensue as a result. Unfortunately,
+such problems are very difficult to debug and the introduction of
+suspend_device_irqs(), along with the "noirq" phase of device suspend and
+resume, was the only practical way to mitigate them.
+
+Device IRQs are re-enabled during system resume, right before the "early" phase
+of resuming devices (that is, before starting to execute ->resume_early
+callbacks for devices). The function doing that is resume_device_irqs().
+
+
+The IRQF_NO_SUSPEND Flag
+------------------------
+
+There are interrupts that can legitimately trigger during the entire system
+suspend-resume cycle, including the "noirq" phases of suspending and resuming
+devices as well as during the time when nonboot CPUs are taken offline and
+brought back online. That applies to timer interrupts in the first place,
+but also to IPIs and to some other special-purpose interrupts.
+
+The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
+requesting a special-purpose interrupt. It causes suspend_device_irqs() to
+leave the corresponding IRQ enabled so as to allow the interrupt to work as
+expected during the suspend-resume cycle, but does not guarantee that the
+interrupt will wake the system from a suspended state -- for such cases it is
+necessary to use enable_irq_wake().
+
+Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
+user of it. Thus, if the IRQ is shared, all of the interrupt handlers installed
+for it will be executed as usual after suspend_device_irqs(), even if the
+IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
+the IRQ's users. For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
+same time should be avoided.
+
+
+System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
+------------------------------------------------------------------
+
+System wakeup interrupts generally need to be configured to wake up the system
+from sleep states, especially if they are used for different purposes (e.g. as
+I/O interrupts) in the working state.
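+
+As a hedged sketch (a hypothetical foo_* driver, not an excerpt from the
+kernel sources), the two mechanisms covered in this document look roughly
+as follows -- note that they are applied to two *different* IRQs::
+
+	#include <linux/device.h>
+	#include <linux/interrupt.h>
+	#include <linux/pm_wakeup.h>
+
+	struct foo_priv {
+		int wake_irq;		/* line used to wake up the system */
+	};
+
+	/* Special-purpose interrupt that must stay enabled across
+	 * suspend_device_irqs(); do not combine with IRQF_SHARED. */
+	static irqreturn_t foo_special_handler(int irq, void *data)
+	{
+		return IRQ_HANDLED;
+	}
+
+	static int foo_request_special(int irq, void *data)
+	{
+		return request_irq(irq, foo_special_handler,
+				   IRQF_NO_SUSPEND, "foo-special", data);
+	}
+
+	/* System suspend callbacks arming a separate IRQ as a wakeup
+	 * source with the helpers described in this section. */
+	static int foo_suspend(struct device *dev)
+	{
+		struct foo_priv *foo = dev_get_drvdata(dev);
+
+		if (device_may_wakeup(dev))
+			enable_irq_wake(foo->wake_irq);
+		return 0;
+	}
+
+	static int foo_resume(struct device *dev)
+	{
+		struct foo_priv *foo = dev_get_drvdata(dev);
+
+		if (device_may_wakeup(dev))
+			disable_irq_wake(foo->wake_irq);
+		return 0;
+	}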
+
+Configuring an interrupt for system wakeup may involve turning on special
+signal handling logic within the platform (such as an SoC) so that signals
+from a given line are routed in a different way during system sleep so as to
+trigger a system wakeup when needed. For example, the platform may include a
+dedicated interrupt controller used specifically for handling system wakeup
+events. Then, if a given interrupt line is supposed to wake up the system
+from sleep states, the corresponding input of that interrupt controller needs
+to be enabled to receive signals from the line in question. After wakeup, it
+generally is better to disable that input to prevent the dedicated controller
+from triggering interrupts unnecessarily.
+
+The IRQ subsystem provides two helper functions to be used by device drivers for
+those purposes. Namely, enable_irq_wake() turns on the platform's logic for
+handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
+turns that logic off.
+
+Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
+in a special way. Namely, the IRQ remains enabled, but on the first interrupt
+it will be disabled, marked as pending and "suspended" so that it will be
+re-enabled by resume_device_irqs() during the subsequent system resume. Also
+the PM core is notified about the event which causes the system suspend in
+progress to be aborted (that doesn't have to happen immediately, but at one
+of the points where the suspend thread looks for pending wakeup events).
+
+This way every interrupt from a wakeup interrupt source will either cause the
+system suspend currently in progress to be aborted or wake up the system if
+already suspended. However, after suspend_device_irqs() interrupt handlers are
+not executed for system wakeup IRQs. They are only executed for IRQF_NO_SUSPEND
+IRQs at that time, but those IRQs should not be configured for system wakeup
+using enable_irq_wake().
+
+
+Interrupts and Suspend-to-Idle
+------------------------------
+
+Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
+system sleep state that works by idling all of the processors and waiting for
+interrupts right after the "noirq" phase of suspending devices.
+
+Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
+set will bring CPUs out of idle while in that state, but they will not cause the
+IRQ subsystem to trigger a system wakeup.
+
+System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
+analogy with what they do in the full system suspend case. The only difference
+is that the wakeup from suspend-to-idle is signaled using the usual working
+state interrupt delivery mechanisms and doesn't require the platform to use
+any special interrupt handling logic for it to work.
+
+
+IRQF_NO_SUSPEND and enable_irq_wake()
+-------------------------------------
+
+There are very few valid reasons to use both enable_irq_wake() and the
+IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
+same device.
+
+First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
+interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
+directly at odds with the rules for handling system wakeup interrupts (interrupt
+handlers are not invoked after suspend_device_irqs()).
+ +Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not +to individual interrupt handlers, so sharing an IRQ between a system wakeup +interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally +make sense. + +In rare cases an IRQ can be shared between a wakeup device driver and an +IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver +must be able to discern spurious IRQs from genuine wakeup events (signalling +the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to +ensure that the IRQ will function as a wakeup source, and must request the IRQ +with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If +these requirements are not met, it is not valid to use IRQF_COND_SUSPEND. diff --git a/Documentation/power/suspend-and-interrupts.txt b/Documentation/power/suspend-and-interrupts.txt deleted file mode 100644 index 8afb29a8604a..000000000000 --- a/Documentation/power/suspend-and-interrupts.txt +++ /dev/null @@ -1,135 +0,0 @@ -System Suspend and Device Interrupts - -Copyright (C) 2014 Intel Corp. -Author: Rafael J. Wysocki - - -Suspending and Resuming Device IRQs ------------------------------------ - -Device interrupt request lines (IRQs) are generally disabled during system -suspend after the "late" phase of suspending devices (that is, after all of the -->prepare, ->suspend and ->suspend_late callbacks have been executed for all -devices). That is done by suspend_device_irqs(). - -The rationale for doing so is that after the "late" phase of device suspend -there is no legitimate reason why any interrupts from suspended devices should -trigger and if any devices have not been suspended properly yet, it is better to -block interrupts from them anyway. Also, in the past we had problems with -interrupt handlers for shared IRQs that device drivers implementing them were -not prepared for interrupts triggering after their devices had been suspended. -In some cases they would attempt to access, for example, memory address spaces -of suspended devices and cause unpredictable behavior to ensue as a result. -Unfortunately, such problems are very difficult to debug and the introduction -of suspend_device_irqs(), along with the "noirq" phase of device suspend and -resume, was the only practical way to mitigate them. - -Device IRQs are re-enabled during system resume, right before the "early" phase -of resuming devices (that is, before starting to execute ->resume_early -callbacks for devices). The function doing that is resume_device_irqs(). - - -The IRQF_NO_SUSPEND Flag ------------------------- - -There are interrupts that can legitimately trigger during the entire system -suspend-resume cycle, including the "noirq" phases of suspending and resuming -devices as well as during the time when nonboot CPUs are taken offline and -brought back online. That applies to timer interrupts in the first place, -but also to IPIs and to some other special-purpose interrupts. - -The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when -requesting a special-purpose interrupt. It causes suspend_device_irqs() to -leave the corresponding IRQ enabled so as to allow the interrupt to work as -expected during the suspend-resume cycle, but does not guarantee that the -interrupt will wake the system from a suspended state -- for such cases it is -necessary to use enable_irq_wake(). - -Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one -user of it. 
Thus, if the IRQ is shared, all of the interrupt handlers installed -for it will be executed as usual after suspend_device_irqs(), even if the -IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of -the IRQ's users. For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the -same time should be avoided. - - -System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake() ------------------------------------------------------------------- - -System wakeup interrupts generally need to be configured to wake up the system -from sleep states, especially if they are used for different purposes (e.g. as -I/O interrupts) in the working state. - -That may involve turning on a special signal handling logic within the platform -(such as an SoC) so that signals from a given line are routed in a different way -during system sleep so as to trigger a system wakeup when needed. For example, -the platform may include a dedicated interrupt controller used specifically for -handling system wakeup events. Then, if a given interrupt line is supposed to -wake up the system from sleep sates, the corresponding input of that interrupt -controller needs to be enabled to receive signals from the line in question. -After wakeup, it generally is better to disable that input to prevent the -dedicated controller from triggering interrupts unnecessarily. - -The IRQ subsystem provides two helper functions to be used by device drivers for -those purposes. Namely, enable_irq_wake() turns on the platform's logic for -handling the given IRQ as a system wakeup interrupt line and disable_irq_wake() -turns that logic off. - -Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ -in a special way. Namely, the IRQ remains enabled, by on the first interrupt -it will be disabled, marked as pending and "suspended" so that it will be -re-enabled by resume_device_irqs() during the subsequent system resume. Also -the PM core is notified about the event which causes the system suspend in -progress to be aborted (that doesn't have to happen immediately, but at one -of the points where the suspend thread looks for pending wakeup events). - -This way every interrupt from a wakeup interrupt source will either cause the -system suspend currently in progress to be aborted or wake up the system if -already suspended. However, after suspend_device_irqs() interrupt handlers are -not executed for system wakeup IRQs. They are only executed for IRQF_NO_SUSPEND -IRQs at that time, but those IRQs should not be configured for system wakeup -using enable_irq_wake(). - - -Interrupts and Suspend-to-Idle ------------------------------- - -Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new -system sleep state that works by idling all of the processors and waiting for -interrupts right after the "noirq" phase of suspending devices. - -Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag -set will bring CPUs out of idle while in that state, but they will not cause the -IRQ subsystem to trigger a system wakeup. - -System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in -analogy with what they do in the full system suspend case. The only difference -is that the wakeup from suspend-to-idle is signaled using the usual working -state interrupt delivery mechanisms and doesn't require the platform to use -any special interrupt handling logic for it to work. 
-
-
-IRQF_NO_SUSPEND and enable_irq_wake()
--------------------------------------
-
-There are very few valid reasons to use both enable_irq_wake() and the
-IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
-same device.
-
-First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
-interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
-directly at odds with the rules for handling system wakeup interrupts (interrupt
-handlers are not invoked after suspend_device_irqs()).
-
-Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
-to individual interrupt handlers, so sharing an IRQ between a system wakeup
-interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
-make sense.
-
-In rare cases an IRQ can be shared between a wakeup device driver and an
-IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
-must be able to discern spurious IRQs from genuine wakeup events (signalling
-the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
-ensure that the IRQ will function as a wakeup source, and must request the IRQ
-with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
-these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/swsusp-and-swap-files.rst b/Documentation/power/swsusp-and-swap-files.rst
new file mode 100644
index 000000000000..a33a2919dbe4
--- /dev/null
+++ b/Documentation/power/swsusp-and-swap-files.rst
@@ -0,0 +1,63 @@
+===============================================
+Using swap files with software suspend (swsusp)
+===============================================
+
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+The Linux kernel handles swap files almost in the same way as it handles swap
+partitions and there are only two differences between these two types of swap
+areas:
+(1) swap files need not be contiguous,
+(2) the header of a swap file is not in the first block of the partition that
+holds it. From the swsusp's point of view (1) is not a problem, because it is
+already taken care of by the swap-handling code, but (2) has to be taken into
+consideration.
+
+In principle the location of a swap file's header may be determined with the
+help of an appropriate filesystem driver. Unfortunately, however, it requires
+the filesystem holding the swap file to be mounted, and if this filesystem is
+journaled, it cannot be mounted during resume from disk. For this reason, to
+identify a swap file, swsusp uses the name of the partition that holds the file
+and the offset from the beginning of the partition at which the swap file's
+header is located. For convenience, this offset is expressed in <PAGE_SIZE>
+units.
+
+In order to use a swap file with swsusp, you need to:
+
+1) Create the swap file and make it active, e.g.::
+
+	# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
+	# mkswap <swap_file_path>
+	# swapon <swap_file_path>
+
+2) Use an application that will bmap the swap file with the help of the
+FIBMAP ioctl and determine the location of the file's swap header, as the
+offset, in <PAGE_SIZE> units, from the beginning of the partition which
+holds the swap file.
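+
+   A minimal sketch of such an application (hypothetical and simplified;
+   real tools such as swap-offset from the uswsusp package do more
+   validation, and FIBMAP usually requires root)::
+
+	#include <fcntl.h>
+	#include <linux/fs.h>	/* FIBMAP, FIGETBSZ */
+	#include <stdio.h>
+	#include <sys/ioctl.h>
+	#include <unistd.h>
+
+	int main(int argc, char *argv[])
+	{
+		int fd, blksize, block = 0;
+		long pagesize = sysconf(_SC_PAGESIZE);
+
+		if (argc != 2) {
+			fprintf(stderr, "usage: %s <swap_file_path>\n", argv[0]);
+			return 1;
+		}
+		fd = open(argv[1], O_RDONLY);
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		/* Filesystem block size, then the physical number of the
+		 * file's first block (the block holding the swap header). */
+		if (ioctl(fd, FIGETBSZ, &blksize) < 0 ||
+		    ioctl(fd, FIBMAP, &block) < 0) {
+			perror("ioctl");
+			close(fd);
+			return 1;
+		}
+		/* Convert from filesystem blocks to <PAGE_SIZE> units. */
+		printf("resume_offset=%ld\n",
+		       (long)block * blksize / pagesize);
+		close(fd);
+		return 0;
+	}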
+
+3) Add the following parameters to the kernel command line::
+
+	resume=<swap_file_partition> resume_offset=<swap_file_offset>
+
+where <swap_file_partition> is the partition on which the swap file is located
+and <swap_file_offset> is the offset of the swap header determined by the
+application in 2) (of course, this step may be carried out automatically
+by the same application that determines the swap file's header offset using the
+FIBMAP ioctl)
+
+OR
+
+Use a userland suspend application that will set the partition and offset
+with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
+Documentation/power/userland-swsusp.rst (this is the only method to suspend
+to a swap file allowing the resume to be initiated from an initrd or initramfs
+image).
+
+Now, swsusp will use the swap file in the same way in which it would use a swap
+partition. In particular, the swap file has to be active (i.e., be present in
+/proc/swaps) so that it can be used for suspending.
+
+Note that if the swap file used for suspending is deleted and recreated,
+the location of its header need not be the same as before. Thus every time
+this happens the value of the "resume_offset=" kernel command line parameter
+has to be updated.
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
deleted file mode 100644
index f281886de490..000000000000
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-Using swap files with software suspend (swsusp)
-	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-The Linux kernel handles swap files almost in the same way as it handles swap
-partitions and there are only two differences between these two types of swap
-areas:
-(1) swap files need not be contiguous,
-(2) the header of a swap file is not in the first block of the partition that
-holds it. From the swsusp's point of view (1) is not a problem, because it is
-already taken care of by the swap-handling code, but (2) has to be taken into
-consideration.
-
-In principle the location of a swap file's header may be determined with the
-help of appropriate filesystem driver. Unfortunately, however, it requires the
-filesystem holding the swap file to be mounted, and if this filesystem is
-journaled, it cannot be mounted during resume from disk. For this reason to
-identify a swap file swsusp uses the name of the partition that holds the file
-and the offset from the beginning of the partition at which the swap file's
-header is located. For convenience, this offset is expressed in <PAGE_SIZE>
-units.
-
-In order to use a swap file with swsusp, you need to:
-
-1) Create the swap file and make it active, eg.
-
-# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
-# mkswap <swap_file_path>
-# swapon <swap_file_path>
-
-2) Use an application that will bmap the swap file with the help of the
-FIBMAP ioctl and determine the location of the file's swap header, as the
-offset, in <PAGE_SIZE> units, from the beginning of the partition which
-holds the swap file.
-
-3) Add the following parameters to the kernel command line:
-
-resume=<swap_file_partition> resume_offset=<swap_file_offset>
-
-where <swap_file_partition> is the partition on which the swap file is located
-and <swap_file_offset> is the offset of the swap header determined by the
-application in 2) (of course, this step may be carried out automatically
-by the same application that determines the swap file's header offset using the
-FIBMAP ioctl)
-
-OR
-
-Use a userland suspend application that will set the partition and offset
-with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
-Documentation/power/userland-swsusp.txt (this is the only method to suspend
-to a swap file allowing the resume to be initiated from an initrd or initramfs
-image).
-
-Now, swsusp will use the swap file in the same way in which it would use a swap
-partition. In particular, the swap file has to be active (ie. be present in
-/proc/swaps) so that it can be used for suspending.
-
-Note that if the swap file used for suspending is deleted and recreated,
-the location of its header need not be the same as before. Thus every time
-this happens the value of the "resume_offset=" kernel command line parameter
-has to be updated.
diff --git a/Documentation/power/swsusp-dmcrypt.rst b/Documentation/power/swsusp-dmcrypt.rst
new file mode 100644
index 000000000000..426df59172cd
--- /dev/null
+++ b/Documentation/power/swsusp-dmcrypt.rst
@@ -0,0 +1,140 @@
+=======================================
+How to use dm-crypt and swsusp together
+=======================================
+
+Author: Andreas Steinmetz <ast@domdv.de>
+
+
+
+Some prerequisites:
+You know how dm-crypt works. If not, visit the following web page:
+http://www.saout.de/misc/dm-crypt/
+You have read Documentation/power/swsusp.rst and understand it.
+You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
+You know how to create or how to modify an initrd.
+
+Now your system is properly set up, your disk is encrypted except for
+the swap device(s) and the boot partition which may contain a mini
+system for crypto setup and/or rescue purposes. You may even have
+an initrd that does your current crypto setup already.
+
+At this point you want to encrypt your swap, too. Still you want to
+be able to suspend using swsusp. This, however, means that you
+have to be able to either enter a passphrase or that you read
+the key(s) from an external device like a pcmcia flash disk
+or a USB stick prior to resume. So you need an initrd that sets
+up dm-crypt and then asks swsusp to resume from the encrypted
+swap device.
+
+The most important thing is that you set up dm-crypt in such
+a way that the swap device you suspend to/resume from has
+always the same major/minor within the initrd as well as
+within your running system. The easiest way to achieve this is
+to always set up this swap device first with dmsetup, so that
+it will always look like the following::
+
+  brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
+
+Now set up your kernel to use /dev/mapper/swap0 as the default
+resume partition, so your kernel .config contains::
+
+  CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
+
+Prepare your boot loader to use the initrd you will create or
+modify. For lilo the simplest setup looks like the following
+lines::
+
+  image=/boot/vmlinuz
+  initrd=/boot/initrd.gz
+  label=linux
+  append="root=/dev/ram0 init=/linuxrc rw"
+
+Finally you need to create or modify your initrd. Let's assume
+you create an initrd that reads the required dm-crypt setup
+from a pcmcia flash disk card.
The card is formatted with an ext2 +fs which resides on /dev/hde1 when the card is inserted. The +card contains at least the encrypted swap setup in a file +named "swapkey". /etc/fstab of your initrd contains something +like the following:: + + /dev/hda1 /mnt ext3 ro 0 0 + none /proc proc defaults,noatime,nodiratime 0 0 + none /sys sysfs defaults,noatime,nodiratime 0 0 + +/dev/hda1 contains an unencrypted mini system that sets up all +of your crypto devices, again by reading the setup from the +pcmcia flash disk. What follows now is a /linuxrc for your +initrd that allows you to resume from encrypted swap and that +continues boot with your mini system on /dev/hda1 if resume +does not happen:: + + #!/bin/sh + PATH=/sbin:/bin:/usr/sbin:/usr/bin + mount /proc + mount /sys + mapped=0 + noresume=`grep -c noresume /proc/cmdline` + if [ "$*" != "" ] + then + noresume=1 + fi + dmesg -n 1 + /sbin/cardmgr -q + for i in 1 2 3 4 5 6 7 8 9 0 + do + if [ -f /proc/ide/hde/media ] + then + usleep 500000 + mount -t ext2 -o ro /dev/hde1 /mnt + if [ -f /mnt/swapkey ] + then + dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1 + fi + umount /mnt + break + fi + usleep 500000 + done + killproc /sbin/cardmgr + dmesg -n 6 + if [ $mapped = 1 ] + then + if [ $noresume != 0 ] + then + mkswap /dev/mapper/swap0 > /dev/null 2>&1 + fi + echo 254:0 > /sys/power/resume + dmsetup remove swap0 + fi + umount /sys + mount /mnt + umount /proc + cd /mnt + pivot_root . mnt + mount /proc + umount -l /mnt + umount /proc + exec chroot . /sbin/init $* < dev/console > dev/console 2>&1 + +Please don't mind the weird loop above, busybox's msh doesn't know +the let statement. Now, what is happening in the script? +First we have to decide if we want to try to resume, or not. +We will not resume if booting with "noresume" or any parameters +for init like "single" or "emergency" as boot parameters. + +Then we need to set up dmcrypt with the setup data from the +pcmcia flash disk. If this succeeds we need to reset the swap +device if we don't want to resume. The line "echo 254:0 > /sys/power/resume" +then attempts to resume from the first device mapper device. +Note that it is important to set the device in /sys/power/resume, +regardless if resuming or not, otherwise later suspend will fail. +If resume starts, script execution terminates here. + +Otherwise we just remove the encrypted swap device and leave it to the +mini system on /dev/hda1 to set the whole crypto up (it is up to +you to modify this to your taste). + +What then follows is the well known process to change the root +file system and continue booting from there. I prefer to unmount +the initrd prior to continue booting but it is up to you to modify +this. diff --git a/Documentation/power/swsusp-dmcrypt.txt b/Documentation/power/swsusp-dmcrypt.txt deleted file mode 100644 index b802fbfd95ef..000000000000 --- a/Documentation/power/swsusp-dmcrypt.txt +++ /dev/null @@ -1,138 +0,0 @@ -Author: Andreas Steinmetz - - -How to use dm-crypt and swsusp together: -======================================== - -Some prerequisites: -You know how dm-crypt works. If not, visit the following web page: -http://www.saout.de/misc/dm-crypt/ -You have read Documentation/power/swsusp.txt and understand it. -You did read Documentation/admin-guide/initrd.rst and know how an initrd works. -You know how to create or how to modify an initrd. 
- -Now your system is properly set up, your disk is encrypted except for -the swap device(s) and the boot partition which may contain a mini -system for crypto setup and/or rescue purposes. You may even have -an initrd that does your current crypto setup already. - -At this point you want to encrypt your swap, too. Still you want to -be able to suspend using swsusp. This, however, means that you -have to be able to either enter a passphrase or that you read -the key(s) from an external device like a pcmcia flash disk -or an usb stick prior to resume. So you need an initrd, that sets -up dm-crypt and then asks swsusp to resume from the encrypted -swap device. - -The most important thing is that you set up dm-crypt in such -a way that the swap device you suspend to/resume from has -always the same major/minor within the initrd as well as -within your running system. The easiest way to achieve this is -to always set up this swap device first with dmsetup, so that -it will always look like the following: - -brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0 - -Now set up your kernel to use /dev/mapper/swap0 as the default -resume partition, so your kernel .config contains: - -CONFIG_PM_STD_PARTITION="/dev/mapper/swap0" - -Prepare your boot loader to use the initrd you will create or -modify. For lilo the simplest setup looks like the following -lines: - -image=/boot/vmlinuz -initrd=/boot/initrd.gz -label=linux -append="root=/dev/ram0 init=/linuxrc rw" - -Finally you need to create or modify your initrd. Lets assume -you create an initrd that reads the required dm-crypt setup -from a pcmcia flash disk card. The card is formatted with an ext2 -fs which resides on /dev/hde1 when the card is inserted. The -card contains at least the encrypted swap setup in a file -named "swapkey". /etc/fstab of your initrd contains something -like the following: - -/dev/hda1 /mnt ext3 ro 0 0 -none /proc proc defaults,noatime,nodiratime 0 0 -none /sys sysfs defaults,noatime,nodiratime 0 0 - -/dev/hda1 contains an unencrypted mini system that sets up all -of your crypto devices, again by reading the setup from the -pcmcia flash disk. What follows now is a /linuxrc for your -initrd that allows you to resume from encrypted swap and that -continues boot with your mini system on /dev/hda1 if resume -does not happen: - -#!/bin/sh -PATH=/sbin:/bin:/usr/sbin:/usr/bin -mount /proc -mount /sys -mapped=0 -noresume=`grep -c noresume /proc/cmdline` -if [ "$*" != "" ] -then - noresume=1 -fi -dmesg -n 1 -/sbin/cardmgr -q -for i in 1 2 3 4 5 6 7 8 9 0 -do - if [ -f /proc/ide/hde/media ] - then - usleep 500000 - mount -t ext2 -o ro /dev/hde1 /mnt - if [ -f /mnt/swapkey ] - then - dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1 - fi - umount /mnt - break - fi - usleep 500000 -done -killproc /sbin/cardmgr -dmesg -n 6 -if [ $mapped = 1 ] -then - if [ $noresume != 0 ] - then - mkswap /dev/mapper/swap0 > /dev/null 2>&1 - fi - echo 254:0 > /sys/power/resume - dmsetup remove swap0 -fi -umount /sys -mount /mnt -umount /proc -cd /mnt -pivot_root . mnt -mount /proc -umount -l /mnt -umount /proc -exec chroot . /sbin/init $* < dev/console > dev/console 2>&1 - -Please don't mind the weird loop above, busybox's msh doesn't know -the let statement. Now, what is happening in the script? -First we have to decide if we want to try to resume, or not. -We will not resume if booting with "noresume" or any parameters -for init like "single" or "emergency" as boot parameters. 
-
-Then we need to set up dmcrypt with the setup data from the
-pcmcia flash disk. If this succeeds we need to reset the swap
-device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
-then attempts to resume from the first device mapper device.
-Note that it is important to set the device in /sys/power/resume,
-regardless if resuming or not, otherwise later suspend will fail.
-If resume starts, script execution terminates here.
-
-Otherwise we just remove the encrypted swap device and leave it to the
-mini system on /dev/hda1 to set the whole crypto up (it is up to
-you to modify this to your taste).
-
-What then follows is the well known process to change the root
-file system and continue booting from there. I prefer to unmount
-the initrd prior to continue booting but it is up to you to modify
-this.
diff --git a/Documentation/power/swsusp.rst b/Documentation/power/swsusp.rst
new file mode 100644
index 000000000000..d000312f6965
--- /dev/null
+++ b/Documentation/power/swsusp.rst
@@ -0,0 +1,501 @@
+============
+Swap suspend
+============
+
+Some warnings, first.
+
+.. warning::
+
+   **BIG FAT WARNING**
+
+   If you touch anything on disk between suspend and resume...
+   ...kiss your data goodbye.
+
+   If you do resume from initrd after your filesystems are mounted...
+   ...bye bye root partition.
+
+   [this is actually the same case as above]
+
+   If you have unsupported (*) devices using DMA, you may have some
+   problems. If your disk driver does not support suspend... (IDE does),
+   it may cause some problems, too. If you change the kernel command line
+   between suspend and resume, it may do something wrong. If you change
+   your hardware while the system is suspended... well, it was not a good
+   idea; but it will probably only crash.
+
+   (*) suspend/resume support is needed to make it safe.
+
+   If you have any filesystems on USB devices mounted before software suspend,
+   they won't be accessible after resume and you may lose data, as though
+   you have unplugged the USB devices with mounted filesystems on them;
+   see the FAQ below for details. (This is not true for more traditional
+   power states like "standby", which normally don't turn USB off.)
+
+Swap partition:
+  You need to append resume=/dev/your_swap_partition to the kernel command
+  line or specify it using /sys/power/resume.
+
+Swap file:
+  If using a swapfile you can also specify a resume offset using
+  resume_offset=<number> on the kernel command line or specify it
+  in /sys/power/resume_offset.
+
+After preparing, you then suspend by::
+
+	echo shutdown > /sys/power/disk; echo disk > /sys/power/state
+
+- If you feel ACPI works pretty well on your system, you might try::
+
+	echo platform > /sys/power/disk; echo disk > /sys/power/state
+
+- If you would like to write the hibernation image to swap and then suspend
+  to RAM (provided your platform supports it), you can try::
+
+	echo suspend > /sys/power/disk; echo disk > /sys/power/state
+
+- If you have SATA disks, you'll need recent kernels with SATA suspend
+  support. For suspend and resume to work, make sure your disk drivers
+  are built into the kernel -- not modules. [There's a way to make
+  suspend/resume work with modular disk drivers, see FAQ, but you probably
+  should not do that.]
+
+If you want to limit the suspend image size to N bytes, do::
+
+	echo N > /sys/power/image_size
+
+before suspend (it is limited to around 2/5 of available RAM by default).
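+
+For a swap file, the resume offset can be derived from the file's first
+physical extent as reported by filefrag. The following is only a sketch:
+the swap file path /swapfile, the device /dev/sda1 and its major:minor
+numbers 8:1 are illustrative assumptions, and the filefrag output format
+may vary between versions::
+
+	mkswap /swapfile && swapon /swapfile
+	# the "physical_offset" of extent 0 is the resume offset in
+	# PAGE_SIZE units (when the filesystem block size equals PAGE_SIZE)
+	filefrag -v /swapfile | awk '$1 == "0:" { print $4 }' | tr -d .
+	# suppose it printed 34816:
+	echo 34816 > /sys/power/resume_offset
+	echo 8:1 > /sys/power/resume	# major:minor of /dev/sda1
+	echo disk > /sys/power/state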
+
+- The resume process checks for the presence of the resume device;
+  if found, it then checks the contents for the hibernation image signature.
+  If both are found, it resumes the hibernation image.
+
+- The resume process may be triggered in two ways:
+
+  1) During lateinit: If resume=/dev/your_swap_partition is specified on
+     the kernel command line, lateinit runs the resume process. If the
+     resume device has not been probed yet, the resume process fails and
+     bootup continues.
+  2) Manually from an initrd or initramfs: May be run from
+     the init script by using the /sys/power/resume file. It is vital
+     that this be done prior to remounting any filesystems (even as
+     read-only), otherwise data may be corrupted.
+
+Article about goals and implementation of Software Suspend for Linux
+====================================================================
+
+Author: Gábor Kuti
+Last revised: 2003-10-20 by Pavel Machek
+
+Idea and goals to achieve
+-------------------------
+
+Nowadays it is common for laptops to have a suspend button. It
+saves the state of the machine to a filesystem or to a partition and switches
+to standby mode. When the machine is later resumed, the saved state is loaded
+back into RAM and the machine can continue its work. This has two real
+benefits. First, we save ourselves the time the machine takes to go down and
+later boot up; energy costs are really high when running from batteries. The
+other gain is that we don't have to interrupt our programs, so processes that
+are calculating something for a long time don't need to be written to be
+interruptible.
+
+swsusp saves the state of the machine into active swaps and then reboots or
+powers down. You must explicitly specify the swap partition to resume from
+with the `resume=` kernel option. If a signature is found, it loads and
+restores the saved state. If the option `noresume` is specified as a boot
+parameter, it skips the resuming. If the option `hibernate=nocompress` is
+specified as a boot parameter, it saves the hibernation image without
+compression.
+
+While the system is suspended, you should not add/remove any
+hardware, write to the filesystems, etc.
+
+Sleep states summary
+====================
+
+There are three different interfaces you can use; /proc/acpi should
+work like this:
+
+In a really perfect world::
+
+	echo 1 > /proc/acpi/sleep # for standby
+	echo 2 > /proc/acpi/sleep # for suspend to ram
+	echo 3 > /proc/acpi/sleep # for suspend to ram, but more power conservative
+	echo 4 > /proc/acpi/sleep # for suspend to disk
+	echo 5 > /proc/acpi/sleep # for unfriendly shutdown of the system
+
+and perhaps::
+
+	echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios
+
+Frequently Asked Questions
+==========================
+
+Q:
+  well, suspending a server is IMHO a really stupid thing,
+  but... (Diego Zuccato):
+
+A:
+  You bought a new UPS for your server. How do you install it without
+  bringing the machine down? Suspend to disk, rearrange power cables,
+  resume.
+
+  You have your server on UPS. Power died, and the UPS is indicating 30
+  seconds to failure. What do you do? Suspend to disk.
+
+
+Q:
+  Maybe I'm missing something, but why don't the regular I/O paths work?
+
+A:
+  We do use the regular I/O paths. However, we cannot restore the data
+  to its original location as we load it. That would create an
+  inconsistent kernel state which would certainly result in an oops.
+  Instead, we load the image into unused memory and then atomically copy
+  it back to its original location.
This implies, of course, a maximum
+  image size of half the amount of memory.
+
+  There are two solutions to this:
+
+  * require half of memory to be free during suspend. That way you can
+    read "new" data onto free spots, then cli and copy
+
+  * assume we had a special "polling" ide driver that only uses memory
+    between 0-640KB. That way, I'd have to make sure that 0-640KB is free
+    during suspending, but otherwise it would work...
+
+  suspend2 shares this fundamental limitation, but does not include user
+  data and disk caches into "used memory" by saving them in
+  advance. That means that the limitation goes away in practice.
+
+Q:
+  Does Linux support ACPI S4?
+
+A:
+  Yes. That's what echo platform > /sys/power/disk does.
+
+Q:
+  What is 'suspend2'?
+
+A:
+  suspend2 is 'Software Suspend 2', a forked implementation of
+  suspend-to-disk which is available as separate patches for 2.4 and 2.6
+  kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
+  highmem and preemption. It also has an extensible architecture that
+  allows for arbitrary transformations on the image (compression,
+  encryption) and arbitrary backends for writing the image (eg. to swap
+  or an NFS share [Work In Progress]). Questions regarding suspend2
+  should be sent to the mailing list available through the suspend2
+  website, and not to the Linux Kernel Mailing List. We are working
+  toward merging suspend2 into the mainline kernel.
+
+Q:
+  What is the freezing of tasks and why are we using it?
+
+A:
+  The freezing of tasks is a mechanism by which user space processes and some
+  kernel threads are controlled during hibernation or system-wide suspend (on some
+  architectures). See freezing-of-tasks.rst for details.
+
+Q:
+  What is the difference between "platform" and "shutdown"?
+
+A:
+  shutdown:
+	save state in linux, then tell bios to powerdown
+
+  platform:
+	save state in linux, then tell bios to powerdown and blink
+	"suspended led"
+
+  "platform" is actually the right thing to do where supported, but
+  "shutdown" is the most reliable (except on ACPI systems).
+
+Q:
+  I do not understand why you have such strong objections to the idea of
+  selective suspend.
+
+A:
+  Do selective suspend during runtime power management, that's okay. But
+  it's useless for suspend-to-disk. (And I do not see how you could use
+  it for suspend-to-ram, I hope you do not want that).
+
+  Let's see, so you suggest to:
+
+  * SUSPEND all but swap device and parents
+  * Snapshot
+  * Write image to disk
+  * SUSPEND swap device and parents
+  * Powerdown
+
+  Oh no, that does not work: if the swap device or its parents use DMA,
+  you've corrupted data. You'd have to do:
+
+  * SUSPEND all but swap device and parents
+  * FREEZE swap device and parents
+  * Snapshot
+  * UNFREEZE swap device and parents
+  * Write
+  * SUSPEND swap device and parents
+
+  Which means that you still need that FREEZE state, and you get more
+  complicated code. (And I have not yet introduced details like system
+  devices.)
+
+Q:
+  There don't seem to be any generally useful behavioral
+  distinctions between SUSPEND and FREEZE.
+
+A:
+  Doing SUSPEND when you are asked to do FREEZE is always correct,
+  but it may be unnecessarily slow. If you want your driver to stay simple,
+  slowness may not matter to you. It can always be fixed later.
+
+  For devices like disk it does matter, you do not want to spindown for
+  FREEZE.
+
+Q:
+  After resuming, the system is paging heavily, leading to very bad interactivity.
+
+A:
+  Try running::
+
+	cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
+	do
+		test -f "$file" && cat "$file" > /dev/null
+	done
+
+  after resume. swapoff -a; swapon -a may also be useful.
+
+Q:
+  What happens to devices during swsusp? They seem to be resumed
+  during system suspend?
+
+A:
+  That's correct. We need to resume them if we want to write the image to
+  disk. The whole sequence goes like this:
+
+  **Suspend part**
+
+  running system, user asks for suspend-to-disk
+
+  user processes are stopped
+
+  suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+  with state snapshot
+
+  state snapshot: copy of whole used memory is taken with interrupts disabled
+
+  resume(): devices are woken up so that we can write image to swap
+
+  write image to swap
+
+  suspend(PMSG_SUSPEND): suspend devices so that we can power off
+
+  turn the power off
+
+  **Resume part**
+
+  (is actually pretty similar)
+
+  running system, user asks for suspend-to-disk
+
+  user processes are stopped (in the common case there are none,
+  but with resume-from-initrd, no one knows)
+
+  read image from disk
+
+  suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+  with image restoration
+
+  image restoration: rewrite memory with image
+
+  resume(): devices are woken up so that the system can continue
+
+  thaw all user processes
+
+Q:
+  What is this 'Encrypt suspend image' for?
+
+A:
+  First of all: it is not a replacement for dm-crypt encrypted swap.
+  It cannot protect your computer while it is suspended. Instead it
+  protects against leaking sensitive data after resume from suspend.
+
+  Think of the following: you suspend while an application is running
+  that keeps sensitive data in memory. The application itself prevents
+  the data from being swapped out. Suspend, however, must write these
+  data to swap to be able to resume later on. Without suspend encryption
+  your sensitive data are then stored in plaintext on disk. This means
+  that after resume your sensitive data are accessible to all
+  applications having direct access to the swap device which was used
+  for suspend. If you don't need swap after resume these data can remain
+  on disk virtually forever. Thus it can happen that your system gets
+  broken into weeks later and sensitive data which you thought were
+  encrypted and protected are retrieved and stolen from the swap device.
+  To prevent this situation you should use 'Encrypt suspend image'.
+
+  During suspend a temporary key is created and this key is used to
+  encrypt the data written to disk. When, during resume, the data has been
+  read back into memory, the temporary key is destroyed, which simply
+  means that all data written to disk during suspend are then
+  inaccessible so they can't be stolen later on. The only thing that
+  you must then take care of is that you call 'mkswap' for the swap
+  partition used for suspend as early as possible during regular
+  boot. This ensures that any temporary key from an oopsed suspend or
+  from a failed or aborted resume is erased from the swap device.
+
+  As a rule of thumb, use encrypted swap to protect your data while your
+  system is shut down or suspended. Additionally use the encrypted
+  suspend image to prevent sensitive data from being stolen after
+  resume.
+
+Q:
+  Can I suspend to a swap file?
+
+A:
+  Generally, yes, you can. However, it requires you to use the "resume=" and
+  "resume_offset=" kernel command line parameters, so the resume from a swap file
+  cannot be initiated from an initrd or initramfs image.
See
+  swsusp-and-swap-files.rst for details.
+
+Q:
+  Is there a maximum system RAM size that is supported by swsusp?
+
+A:
+  It should work okay with highmem.
+
+Q:
+  Does swsusp (to disk) use only one swap partition or can it use
+  multiple swap partitions (aggregate them into one logical space)?
+
+A:
+  Only one swap partition, sorry.
+
+Q:
+  If my application(s) causes lots of memory & swap space to be used
+  (over half of the total system RAM), is it correct that it is likely
+  to be useless to try to suspend to disk while that app is running?
+
+A:
+  No, it should work okay, as long as your app does not mlock()
+  it. Just prepare a big enough swap partition.
+
+Q:
+  What information is useful for debugging suspend-to-disk problems?
+
+A:
+  Well, the last messages on the screen are always useful. If something
+  is broken, it is usually some kernel driver, therefore trying with as
+  few modules loaded as possible helps a lot. I also prefer people to
+  suspend from the console, preferably without X running. Booting with
+  init=/bin/bash, then swapon and starting the suspend sequence manually
+  usually does the trick. Then it is a good idea to try with the latest
+  vanilla kernel.
+
+Q:
+  How can distributions ship a swsusp-supporting kernel with modular
+  disk drivers (especially SATA)?
+
+A:
+  Well, it can be done: load the drivers, then echo the resume device
+  into the /sys/power/resume file from the initrd. Be sure not to mount
+  anything, not even read-only, or you are going to lose your
+  data.
+
+Q:
+  How do I make suspend more verbose?
+
+A:
+  If you want to see any non-error kernel messages on the virtual
+  terminal the kernel switches to during suspend, you have to set the
+  kernel console loglevel to at least 4 (KERN_WARNING), for example by
+  doing::
+
+	# save the old loglevel
+	read LOGLEVEL DUMMY < /proc/sys/kernel/printk
+	# set the loglevel so we see the progress bar.
+	# if the level is higher than needed, we leave it alone.
+	if [ $LOGLEVEL -lt 5 ]; then
+		echo 5 > /proc/sys/kernel/printk
+	fi
+
+	IMG_SZ=0
+	read IMG_SZ < /sys/power/image_size
+	echo -n disk > /sys/power/state
+	RET=$?
+	#
+	# the logic here is:
+	# if image_size > 0 (without kernel support, IMG_SZ will be zero),
+	# then try again with image_size set to zero.
+	if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
+		echo 0 > /sys/power/image_size
+		echo -n disk > /sys/power/state
+		RET=$?
+	fi
+
+	# restore previous loglevel
+	echo $LOGLEVEL > /proc/sys/kernel/printk
+	exit $RET
+
+Q:
+  Is it true that if I have a mounted filesystem on a USB device and
+  I suspend to disk, I can lose data unless the filesystem has been mounted
+  with "sync"?
+
+A:
+  That's right ... if you disconnect that device, you may lose data.
+  In fact, even with "-o sync" you can lose data if your programs have
+  information in buffers they haven't written out to a disk you disconnect,
+  or if you disconnect before the device finished saving data you wrote.
+
+  Software suspend normally powers down USB controllers, which is equivalent
+  to disconnecting all USB devices attached to your system.
+
+  Your system might well support low-power modes for its USB controllers
+  while the system is asleep, maintaining the connection, using true sleep
+  modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the
+  /sys/power/state file; write "standby" or "mem".)
We've not seen any
+  hardware that can use these modes through software suspend, although in
+  theory some systems might support "platform" modes that won't break the
+  USB connections.
+
+  Remember that it's always a bad idea to unplug a disk drive containing a
+  mounted filesystem. That's true even when your system is asleep! The
+  safest thing is to unmount all filesystems on removable media (such as USB,
+  Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
+  before suspending; then remount them after resuming.
+
+  There is a work-around for this problem. For more information, see
+  Documentation/driver-api/usb/persist.rst.
+
+Q:
+  Can I suspend-to-disk using a swap partition under LVM?
+
+A:
+  Yes and No. You can suspend successfully, but the kernel will not be able
+  to resume on its own. You need an initramfs that can recognize the resume
+  situation, activate the logical volume containing the swap volume (but not
+  touch any filesystems!), and eventually call::
+
+	echo -n "$major:$minor" > /sys/power/resume
+
+  where $major and $minor are the respective major and minor device numbers of
+  the swap volume.
+
+  uswsusp works with LVM, too. See http://suspend.sourceforge.net/
+
+Q:
+  I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
+  compiled with similar configuration files. Anyway I found that
+  suspend to disk (and resume) is much slower on 2.6.16 compared to
+  2.6.15. Any idea why that might happen, or how I can speed it up?
+
+A:
+  This is because the size of the suspend image is now greater than
+  for 2.6.15 (by saving more data we can get a more responsive system
+  after resume).
+
+  There's the /sys/power/image_size knob that controls the size of the
+  image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
+  root), the 2.6.15 behavior should be restored. If it is still too
+  slow, take a look at suspend.sf.net -- userland suspend is faster and
+  supports LZF compression to speed it up further.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
deleted file mode 100644
index 236d1fb13640..000000000000
--- a/Documentation/power/swsusp.txt
+++ /dev/null
@@ -1,446 +0,0 @@
-Some warnings, first.
-
- * BIG FAT WARNING *********************************************************
- *
- * If you touch anything on disk between suspend and resume...
- * ...kiss your data goodbye.
- *
- * If you do resume from initrd after your filesystems are mounted...
- * ...bye bye root partition.
- * [this is actually same case as above]
- *
- * If you have unsupported (*) devices using DMA, you may have some
- * problems. If your disk driver does not support suspend... (IDE does),
- * it may cause some problems, too. If you change kernel command line
- * between suspend and resume, it may do something wrong. If you change
- * your hardware while system is suspended... well, it was not good idea;
- * but it will probably only crash.
- *
- * (*) suspend/resume support is needed to make it safe.
- *
- * If you have any filesystems on USB devices mounted before software suspend,
- * they won't be accessible after resume and you may lose data, as though
- * you have unplugged the USB devices with mounted filesystems on them;
- * see the FAQ below for details. (This is not true for more traditional
- * power states like "standby", which normally don't turn USB off.)
-
-Swap partition:
-You need to append resume=/dev/your_swap_partition to kernel command
-line or specify it using /sys/power/resume.
- -Swap file: -If using a swapfile you can also specify a resume offset using -resume_offset= on the kernel command line or specify it -in /sys/power/resume_offset. - -After preparing then you suspend by - -echo shutdown > /sys/power/disk; echo disk > /sys/power/state - -. If you feel ACPI works pretty well on your system, you might try - -echo platform > /sys/power/disk; echo disk > /sys/power/state - -. If you would like to write hibernation image to swap and then suspend -to RAM (provided your platform supports it), you can try - -echo suspend > /sys/power/disk; echo disk > /sys/power/state - -. If you have SATA disks, you'll need recent kernels with SATA suspend -support. For suspend and resume to work, make sure your disk drivers -are built into kernel -- not modules. [There's way to make -suspend/resume with modular disk drivers, see FAQ, but you probably -should not do that.] - -If you want to limit the suspend image size to N bytes, do - -echo N > /sys/power/image_size - -before suspend (it is limited to around 2/5 of available RAM by default). - -. The resume process checks for the presence of the resume device, -if found, it then checks the contents for the hibernation image signature. -If both are found, it resumes the hibernation image. - -. The resume process may be triggered in two ways: - 1) During lateinit: If resume=/dev/your_swap_partition is specified on - the kernel command line, lateinit runs the resume process. If the - resume device has not been probed yet, the resume process fails and - bootup continues. - 2) Manually from an initrd or initramfs: May be run from - the init script by using the /sys/power/resume file. It is vital - that this be done prior to remounting any filesystems (even as - read-only) otherwise data may be corrupted. - -Article about goals and implementation of Software Suspend for Linux -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Author: Gábor Kuti -Last revised: 2003-10-20 by Pavel Machek - -Idea and goals to achieve - -Nowadays it is common in several laptops that they have a suspend button. It -saves the state of the machine to a filesystem or to a partition and switches -to standby mode. Later resuming the machine the saved state is loaded back to -ram and the machine can continue its work. It has two real benefits. First we -save ourselves the time machine goes down and later boots up, energy costs -are real high when running from batteries. The other gain is that we don't have to -interrupt our programs so processes that are calculating something for a long -time shouldn't need to be written interruptible. - -swsusp saves the state of the machine into active swaps and then reboots or -powerdowns. You must explicitly specify the swap partition to resume from with -``resume='' kernel option. If signature is found it loads and restores saved -state. If the option ``noresume'' is specified as a boot parameter, it skips -the resuming. If the option ``hibernate=nocompress'' is specified as a boot -parameter, it saves hibernation image without compression. - -In the meantime while the system is suspended you should not add/remove any -of the hardware, write to the filesystems, etc. 
- -Sleep states summary -==================== - -There are three different interfaces you can use, /proc/acpi should -work like this: - -In a really perfect world: -echo 1 > /proc/acpi/sleep # for standby -echo 2 > /proc/acpi/sleep # for suspend to ram -echo 3 > /proc/acpi/sleep # for suspend to ram, but with more power conservative -echo 4 > /proc/acpi/sleep # for suspend to disk -echo 5 > /proc/acpi/sleep # for shutdown unfriendly the system - -and perhaps -echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios - -Frequently Asked Questions -========================== - -Q: well, suspending a server is IMHO a really stupid thing, -but... (Diego Zuccato): - -A: You bought new UPS for your server. How do you install it without -bringing machine down? Suspend to disk, rearrange power cables, -resume. - -You have your server on UPS. Power died, and UPS is indicating 30 -seconds to failure. What do you do? Suspend to disk. - - -Q: Maybe I'm missing something, but why don't the regular I/O paths work? - -A: We do use the regular I/O paths. However we cannot restore the data -to its original location as we load it. That would create an -inconsistent kernel state which would certainly result in an oops. -Instead, we load the image into unused memory and then atomically copy -it back to it original location. This implies, of course, a maximum -image size of half the amount of memory. - -There are two solutions to this: - -* require half of memory to be free during suspend. That way you can -read "new" data onto free spots, then cli and copy - -* assume we had special "polling" ide driver that only uses memory -between 0-640KB. That way, I'd have to make sure that 0-640KB is free -during suspending, but otherwise it would work... - -suspend2 shares this fundamental limitation, but does not include user -data and disk caches into "used memory" by saving them in -advance. That means that the limitation goes away in practice. - -Q: Does linux support ACPI S4? - -A: Yes. That's what echo platform > /sys/power/disk does. - -Q: What is 'suspend2'? - -A: suspend2 is 'Software Suspend 2', a forked implementation of -suspend-to-disk which is available as separate patches for 2.4 and 2.6 -kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB -highmem and preemption. It also has a extensible architecture that -allows for arbitrary transformations on the image (compression, -encryption) and arbitrary backends for writing the image (eg to swap -or an NFS share[Work In Progress]). Questions regarding suspend2 -should be sent to the mailing list available through the suspend2 -website, and not to the Linux Kernel Mailing List. We are working -toward merging suspend2 into the mainline kernel. - -Q: What is the freezing of tasks and why are we using it? - -A: The freezing of tasks is a mechanism by which user space processes and some -kernel threads are controlled during hibernation or system-wide suspend (on some -architectures). See freezing-of-tasks.txt for details. - -Q: What is the difference between "platform" and "shutdown"? - -A: - -shutdown: save state in linux, then tell bios to powerdown - -platform: save state in linux, then tell bios to powerdown and blink - "suspended led" - -"platform" is actually right thing to do where supported, but -"shutdown" is most reliable (except on ACPI systems). - -Q: I do not understand why you have such strong objections to idea of -selective suspend. - -A: Do selective suspend during runtime power management, that's okay. 
But -it's useless for suspend-to-disk. (And I do not see how you could use -it for suspend-to-ram, I hope you do not want that). - -Lets see, so you suggest to - -* SUSPEND all but swap device and parents -* Snapshot -* Write image to disk -* SUSPEND swap device and parents -* Powerdown - -Oh no, that does not work, if swap device or its parents uses DMA, -you've corrupted data. You'd have to do - -* SUSPEND all but swap device and parents -* FREEZE swap device and parents -* Snapshot -* UNFREEZE swap device and parents -* Write -* SUSPEND swap device and parents - -Which means that you still need that FREEZE state, and you get more -complicated code. (And I have not yet introduce details like system -devices). - -Q: There don't seem to be any generally useful behavioral -distinctions between SUSPEND and FREEZE. - -A: Doing SUSPEND when you are asked to do FREEZE is always correct, -but it may be unnecessarily slow. If you want your driver to stay simple, -slowness may not matter to you. It can always be fixed later. - -For devices like disk it does matter, you do not want to spindown for -FREEZE. - -Q: After resuming, system is paging heavily, leading to very bad interactivity. - -A: Try running - -cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file -do - test -f "$file" && cat "$file" > /dev/null -done - -after resume. swapoff -a; swapon -a may also be useful. - -Q: What happens to devices during swsusp? They seem to be resumed -during system suspend? - -A: That's correct. We need to resume them if we want to write image to -disk. Whole sequence goes like - - Suspend part - ~~~~~~~~~~~~ - running system, user asks for suspend-to-disk - - user processes are stopped - - suspend(PMSG_FREEZE): devices are frozen so that they don't interfere - with state snapshot - - state snapshot: copy of whole used memory is taken with interrupts disabled - - resume(): devices are woken up so that we can write image to swap - - write image to swap - - suspend(PMSG_SUSPEND): suspend devices so that we can power off - - turn the power off - - Resume part - ~~~~~~~~~~~ - (is actually pretty similar) - - running system, user asks for suspend-to-disk - - user processes are stopped (in common case there are none, but with resume-from-initrd, no one knows) - - read image from disk - - suspend(PMSG_FREEZE): devices are frozen so that they don't interfere - with image restoration - - image restoration: rewrite memory with image - - resume(): devices are woken up so that system can continue - - thaw all user processes - -Q: What is this 'Encrypt suspend image' for? - -A: First of all: it is not a replacement for dm-crypt encrypted swap. -It cannot protect your computer while it is suspended. Instead it does -protect from leaking sensitive data after resume from suspend. - -Think of the following: you suspend while an application is running -that keeps sensitive data in memory. The application itself prevents -the data from being swapped out. Suspend, however, must write these -data to swap to be able to resume later on. Without suspend encryption -your sensitive data are then stored in plaintext on disk. This means -that after resume your sensitive data are accessible to all -applications having direct access to the swap device which was used -for suspend. If you don't need swap after resume these data can remain -on disk virtually forever. 
Thus it can happen that your system gets -broken in weeks later and sensitive data which you thought were -encrypted and protected are retrieved and stolen from the swap device. -To prevent this situation you should use 'Encrypt suspend image'. - -During suspend a temporary key is created and this key is used to -encrypt the data written to disk. When, during resume, the data was -read back into memory the temporary key is destroyed which simply -means that all data written to disk during suspend are then -inaccessible so they can't be stolen later on. The only thing that -you must then take care of is that you call 'mkswap' for the swap -partition used for suspend as early as possible during regular -boot. This asserts that any temporary key from an oopsed suspend or -from a failed or aborted resume is erased from the swap device. - -As a rule of thumb use encrypted swap to protect your data while your -system is shut down or suspended. Additionally use the encrypted -suspend image to prevent sensitive data from being stolen after -resume. - -Q: Can I suspend to a swap file? - -A: Generally, yes, you can. However, it requires you to use the "resume=" and -"resume_offset=" kernel command line parameters, so the resume from a swap file -cannot be initiated from an initrd or initramfs image. See -swsusp-and-swap-files.txt for details. - -Q: Is there a maximum system RAM size that is supported by swsusp? - -A: It should work okay with highmem. - -Q: Does swsusp (to disk) use only one swap partition or can it use -multiple swap partitions (aggregate them into one logical space)? - -A: Only one swap partition, sorry. - -Q: If my application(s) causes lots of memory & swap space to be used -(over half of the total system RAM), is it correct that it is likely -to be useless to try to suspend to disk while that app is running? - -A: No, it should work okay, as long as your app does not mlock() -it. Just prepare big enough swap partition. - -Q: What information is useful for debugging suspend-to-disk problems? - -A: Well, last messages on the screen are always useful. If something -is broken, it is usually some kernel driver, therefore trying with as -little as possible modules loaded helps a lot. I also prefer people to -suspend from console, preferably without X running. Booting with -init=/bin/bash, then swapon and starting suspend sequence manually -usually does the trick. Then it is good idea to try with latest -vanilla kernel. - -Q: How can distributions ship a swsusp-supporting kernel with modular -disk drivers (especially SATA)? - -A: Well, it can be done, load the drivers, then do echo into -/sys/power/resume file from initrd. Be sure not to mount -anything, not even read-only mount, or you are going to lose your -data. - -Q: How do I make suspend more verbose? - -A: If you want to see any non-error kernel messages on the virtual -terminal the kernel switches to during suspend, you have to set the -kernel console loglevel to at least 4 (KERN_WARNING), for example by -doing - - # save the old loglevel - read LOGLEVEL DUMMY < /proc/sys/kernel/printk - # set the loglevel so we see the progress bar. - # if the level is higher than needed, we leave it alone. - if [ $LOGLEVEL -lt 5 ]; then - echo 5 > /proc/sys/kernel/printk - fi - - IMG_SZ=0 - read IMG_SZ < /sys/power/image_size - echo -n disk > /sys/power/state - RET=$? - # - # the logic here is: - # if image_size > 0 (without kernel support, IMG_SZ will be zero), - # then try again with image_size set to zero. 
- if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size - echo 0 > /sys/power/image_size - echo -n disk > /sys/power/state - RET=$? - fi - - # restore previous loglevel - echo $LOGLEVEL > /proc/sys/kernel/printk - exit $RET - -Q: Is this true that if I have a mounted filesystem on a USB device and -I suspend to disk, I can lose data unless the filesystem has been mounted -with "sync"? - -A: That's right ... if you disconnect that device, you may lose data. -In fact, even with "-o sync" you can lose data if your programs have -information in buffers they haven't written out to a disk you disconnect, -or if you disconnect before the device finished saving data you wrote. - -Software suspend normally powers down USB controllers, which is equivalent -to disconnecting all USB devices attached to your system. - -Your system might well support low-power modes for its USB controllers -while the system is asleep, maintaining the connection, using true sleep -modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the -/sys/power/state file; write "standby" or "mem".) We've not seen any -hardware that can use these modes through software suspend, although in -theory some systems might support "platform" modes that won't break the -USB connections. - -Remember that it's always a bad idea to unplug a disk drive containing a -mounted filesystem. That's true even when your system is asleep! The -safest thing is to unmount all filesystems on removable media (such USB, -Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays) -before suspending; then remount them after resuming. - -There is a work-around for this problem. For more information, see -Documentation/driver-api/usb/persist.rst. - -Q: Can I suspend-to-disk using a swap partition under LVM? - -A: Yes and No. You can suspend successfully, but the kernel will not be able -to resume on its own. You need an initramfs that can recognize the resume -situation, activate the logical volume containing the swap volume (but not -touch any filesystems!), and eventually call - -echo -n "$major:$minor" > /sys/power/resume - -where $major and $minor are the respective major and minor device numbers of -the swap volume. - -uswsusp works with LVM, too. See http://suspend.sourceforge.net/ - -Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were -compiled with the similar configuration files. Anyway I found that -suspend to disk (and resume) is much slower on 2.6.16 compared to -2.6.15. Any idea for why that might happen or how can I speed it up? - -A: This is because the size of the suspend image is now greater than -for 2.6.15 (by saving more data we can get more responsive system -after resume). - -There's the /sys/power/image_size knob that controls the size of the -image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as -root), the 2.6.15 behavior should be restored. If it is still too -slow, take a look at suspend.sf.net -- userland suspend is faster and -supports LZF compression to speed it up further. diff --git a/Documentation/power/tricks.rst b/Documentation/power/tricks.rst new file mode 100644 index 000000000000..ca787f142c3f --- /dev/null +++ b/Documentation/power/tricks.rst @@ -0,0 +1,29 @@ +================ +swsusp/S3 tricks +================ + +Pavel Machek + +If you want to trick swsusp/S3 into working, you might want to try: + +* go with minimal config, turn off drivers like USB, AGP you don't + really need + +* turn off APIC and preempt + +* use ext2. 
At least it has working fsck. [If something seems to go
+  wrong, force fsck when you have a chance]
+
+* turn off modules
+
+* use vga text console, shut down X. [If you really want X, you might
+  want to try vesafb later]
+
+* try running as few processes as possible, preferably go to single
+  user mode.
+
+* due to video issues, swsusp should be easier to get working than
+  S3. Try that first.
+
+When you make it work, try to find out what exactly it was that broke
+suspend, and preferably fix that.
diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt
deleted file mode 100644
index a1b8f7249f4c..000000000000
--- a/Documentation/power/tricks.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-	swsusp/S3 tricks
-	~~~~~~~~~~~~~~~~
-Pavel Machek
-
-If you want to trick swsusp/S3 into working, you might want to try:
-
-* go with minimal config, turn off drivers like USB, AGP you don't
-  really need
-
-* turn off APIC and preempt
-
-* use ext2. At least it has working fsck. [If something seems to go
-  wrong, force fsck when you have a chance]
-
-* turn off modules
-
-* use vga text console, shut down X. [If you really want X, you might
-  want to try vesafb later]
-
-* try running as few processes as possible, preferably go to single
-  user mode.
-
-* due to video issues, swsusp should be easier to get working than
-  S3. Try that first.
-
-When you make it work, try to find out what exactly was it that broke
-suspend, and preferably fix that.
diff --git a/Documentation/power/userland-swsusp.rst b/Documentation/power/userland-swsusp.rst
new file mode 100644
index 000000000000..a0fa51bb1a4d
--- /dev/null
+++ b/Documentation/power/userland-swsusp.rst
@@ -0,0 +1,191 @@
+=====================================================
+Documentation for userland software suspend interface
+=====================================================
+
+	(C) 2006 Rafael J. Wysocki
+
+First, the warnings at the beginning of swsusp.rst still apply.
+
+Second, you should read the FAQ in swsusp.rst _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel. Such utilities are available, for example, from
+<http://suspend.sourceforge.net>. You may want to have a look at them if you
+are going to develop your own suspend/resume utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in include/linux/suspend_ioctls.h. The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing. If open for
+reading, it is considered to be in the suspend mode. Otherwise it is
+assumed to be in the resume mode. The device cannot be open for simultaneous
+reading and writing. It is also impossible to have the device open more than
+once at a time.
+
+Even opening the device has side effects. Data structures are
+allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
+called.
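+
+As an illustration only, the device node can be created from the numbers
+exported in sysfs (a sketch; the /dev/snapshot path is the conventional
+name used by existing suspend utilities, not one mandated by the kernel)::
+
+	# read "10:231" from sysfs and create a matching character device
+	IFS=: read MAJOR MINOR < /sys/class/misc/snapshot/dev
+	mknod /dev/snapshot c $MAJOR $MINOR
+	# open the node read-only for suspend mode, write-only for resume mode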
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE
+	freeze user space processes (the current process is
+	not frozen); this is required for SNAPSHOT_CREATE_IMAGE
+	and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE
+	thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_CREATE_IMAGE
+	create a snapshot of the system memory; the
+	last argument of ioctl() should be a pointer to an int variable,
+	the value of which will indicate whether the call returned after
+	creating the snapshot (1) or after restoring the system memory state
+	from it (0) (after resume the system finds itself finishing the
+	SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
+	has been created the read() operation can be used to transfer
+	it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE
+	restore the system memory state from the
+	uploaded snapshot image; before calling it you should transfer
+	the system memory snapshot back to the kernel using the write()
+	operation; this call will not succeed if the snapshot
+	image is not available to the kernel
+
+SNAPSHOT_FREE
+	free memory allocated for the snapshot image
+
+SNAPSHOT_PREF_IMAGE_SIZE
+	set the preferred maximum size of the image
+	(the kernel will do its best to ensure the image size will not exceed
+	this number, but if it turns out to be impossible, the kernel will
+	create the smallest image possible)
+
+SNAPSHOT_GET_IMAGE_SIZE
+	return the actual size of the hibernation image
+
+SNAPSHOT_AVAIL_SWAP_SIZE
+	return the amount of available swap in bytes (the
+	last argument should be a pointer to an unsigned int variable that will
+	contain the result if the call is successful).
+
+SNAPSHOT_ALLOC_SWAP_PAGE
+	allocate a swap page from the resume partition
+	(the last argument should be a pointer to a loff_t variable that
+	will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES
+	free all swap pages allocated by
+	SNAPSHOT_ALLOC_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_AREA
+	set the resume partition and the offset (in <PAGE_SIZE>
+	units) from the beginning of the partition at which the swap header is
+	located (the last ioctl() argument should point to a struct
+	resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
+	containing the resume device specification and the offset); for swap
+	partitions the offset is always 0, but it is different from zero for
+	swap files (see Documentation/power/swsusp-and-swap-files.rst for
+	details).
+
+SNAPSHOT_PLATFORM_SUPPORT
+	enable/disable the hibernation platform support,
+	depending on the argument value (enable, if the argument is nonzero)
+
+SNAPSHOT_POWER_OFF
+	make the kernel transition the system to the hibernation
+	state (eg. ACPI S4) using the platform (eg. ACPI) driver
+
+SNAPSHOT_S2RAM
+	suspend to RAM; using this call causes the kernel to
+	immediately enter the suspend-to-RAM state, so this call must always
+	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
+	to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call
+	is needed to implement the suspend-to-both mechanism in which the
+	suspend image is first created, as though the system had been suspended
+	to disk, and then the system is suspended to RAM (this makes it possible
+	to resume the system from RAM if there's enough battery power or restore
+	its state on the basis of the saved suspend image otherwise)
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel.
It has the following limitations:
+
+- you cannot read() more than one virtual memory page at a time
+- read()s across page boundaries are impossible (ie. if you read() 1/2 of
+  a page in the previous call, you will only be able to read()
+  **at most** 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel. It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_FREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap partition, called the resume
+partition, or a swap file as storage space (if a swap file is used, the resume
+partition is the partition that holds this file). However, this is not really
+required, as they can use, for example, a special (blank) suspend partition or
+a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
+mounted afterwards.
+
+These utilities MUST NOT make any assumptions regarding the ordering of
+data within the snapshot image. The contents of the image are entirely owned
+by the kernel and its structure may be changed in future kernel releases.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read). Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header. If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+
+1. If the value is 1 (ie. the system memory snapshot has just been
+   created and the system is ready for saving it):
+
+   (a) The suspending utility MUST NOT close the snapshot device
+       _unless_ the whole suspend procedure is to be cancelled, in
+       which case, if the snapshot image has already been saved, the
+       suspending utility SHOULD destroy it, preferably by zapping
+       its header. If the suspend is not to be cancelled, the
+       system MUST be powered off or rebooted after the snapshot
+       image has been saved.
+   (b) The suspending utility SHOULD NOT attempt to perform any
+       file system operations (including reads) on the file systems
+       that were mounted before SNAPSHOT_CREATE_IMAGE has been
+       called. However, it MAY mount a file system that was not
+       mounted at that time and perform some operations on it (eg.
+       use it for saving the image).
+
+2. If the value is 0 (ie.
the system state has just been restored from + the snapshot image), the suspending utility MUST close the snapshot + device. Afterwards it will be treated as a regular userland process, + so it need not exit. + +The resuming utility SHOULD NOT attempt to mount any file systems that could +be mounted before suspend and SHOULD NOT attempt to perform any operations +involving such file systems. + +For details, please refer to the source code. diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt deleted file mode 100644 index bbfcd1bbedc5..000000000000 --- a/Documentation/power/userland-swsusp.txt +++ /dev/null @@ -1,170 +0,0 @@ -Documentation for userland software suspend interface - (C) 2006 Rafael J. Wysocki - -First, the warnings at the beginning of swsusp.txt still apply. - -Second, you should read the FAQ in swsusp.txt _now_ if you have not -done it already. - -Now, to use the userland interface for software suspend you need special -utilities that will read/write the system memory snapshot from/to the -kernel. Such utilities are available, for example, from -. You may want to have a look at them if you -are going to develop your own suspend/resume utilities. - -The interface consists of a character device providing the open(), -release(), read(), and write() operations as well as several ioctl() -commands defined in include/linux/suspend_ioctls.h . The major and minor -numbers of the device are, respectively, 10 and 231, and they can -be read from /sys/class/misc/snapshot/dev. - -The device can be open either for reading or for writing. If open for -reading, it is considered to be in the suspend mode. Otherwise it is -assumed to be in the resume mode. The device cannot be open for simultaneous -reading and writing. It is also impossible to have the device open more than -once at a time. - -Even opening the device has side effects. Data structures are -allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are -called. 
- -The ioctl() commands recognized by the device are: - -SNAPSHOT_FREEZE - freeze user space processes (the current process is - not frozen); this is required for SNAPSHOT_CREATE_IMAGE - and SNAPSHOT_ATOMIC_RESTORE to succeed - -SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE - -SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the - last argument of ioctl() should be a pointer to an int variable, - the value of which will indicate whether the call returned after - creating the snapshot (1) or after restoring the system memory state - from it (0) (after resume the system finds itself finishing the - SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot - has been created the read() operation can be used to transfer - it out of the kernel - -SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the - uploaded snapshot image; before calling it you should transfer - the system memory snapshot back to the kernel using the write() - operation; this call will not succeed if the snapshot - image is not available to the kernel - -SNAPSHOT_FREE - free memory allocated for the snapshot image - -SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image - (the kernel will do its best to ensure the image size will not exceed - this number, but if it turns out to be impossible, the kernel will - create the smallest image possible) - -SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image - -SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the - last argument should be a pointer to an unsigned int variable that will - contain the result if the call is successful). - -SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition - (the last argument should be a pointer to a loff_t variable that - will contain the swap page offset if the call is successful) - -SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by - SNAPSHOT_ALLOC_SWAP_PAGE - -SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in - units) from the beginning of the partition at which the swap header is - located (the last ioctl() argument should point to a struct - resume_swap_area, as defined in kernel/power/suspend_ioctls.h, - containing the resume device specification and the offset); for swap - partitions the offset is always 0, but it is different from zero for - swap files (see Documentation/power/swsusp-and-swap-files.txt for - details). - -SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support, - depending on the argument value (enable, if the argument is nonzero) - -SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation - state (eg. ACPI S4) using the platform (eg. ACPI) driver - -SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to - immediately enter the suspend-to-RAM state, so this call must always - be preceded by the SNAPSHOT_FREEZE call and it is also necessary - to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call - is needed to implement the suspend-to-both mechanism in which the - suspend image is first created, as though the system had been suspended - to disk, and then the system is suspended to RAM (this makes it possible - to resume the system from RAM if there's enough battery power or restore - its state on the basis of the saved suspend image otherwise) - -The device's read() operation can be used to transfer the snapshot image from -the kernel. 
It has the following limitations: -- you cannot read() more than one virtual memory page at a time -- read()s across page boundaries are impossible (ie. if you read() 1/2 of - a page in the previous call, you will only be able to read() - _at_ _most_ 1/2 of the page in the next call) - -The device's write() operation is used for uploading the system memory snapshot -into the kernel. It has the same limitations as the read() operation. - -The release() operation frees all memory allocated for the snapshot image -and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any). -Thus it is not necessary to use either SNAPSHOT_FREE or -SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also -unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are -still frozen when the device is being closed). - -Currently it is assumed that the userland utilities reading/writing the -snapshot image from/to the kernel will use a swap partition, called the resume -partition, or a swap file as storage space (if a swap file is used, the resume -partition is the partition that holds this file). However, this is not really -required, as they can use, for example, a special (blank) suspend partition or -a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and -mounted afterwards. - -These utilities MUST NOT make any assumptions regarding the ordering of -data within the snapshot image. The contents of the image are entirely owned -by the kernel and its structure may be changed in future kernel releases. - -The snapshot image MUST be written to the kernel unaltered (ie. all of the image -data, metadata and header MUST be written in _exactly_ the same amount, form -and order in which they have been read). Otherwise, the behavior of the -resumed system may be totally unpredictable. - -While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the -structure of the snapshot image is consistent with the information stored -in the image header. If any inconsistencies are detected, -SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof -mechanism and the userland utilities using the interface SHOULD use additional -means, such as checksums, to ensure the integrity of the snapshot image. - -The suspending and resuming utilities MUST lock themselves in memory, -preferably using mlockall(), before calling SNAPSHOT_FREEZE. - -The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE -in the memory location pointed to by the last argument of ioctl() and proceed -in accordance with it: -1. If the value is 1 (ie. the system memory snapshot has just been - created and the system is ready for saving it): - (a) The suspending utility MUST NOT close the snapshot device - _unless_ the whole suspend procedure is to be cancelled, in - which case, if the snapshot image has already been saved, the - suspending utility SHOULD destroy it, preferably by zapping - its header. If the suspend is not to be cancelled, the - system MUST be powered off or rebooted after the snapshot - image has been saved. - (b) The suspending utility SHOULD NOT attempt to perform any - file system operations (including reads) on the file systems - that were mounted before SNAPSHOT_CREATE_IMAGE has been - called. However, it MAY mount a file system that was not - mounted at that time and perform some operations on it (eg. - use it for saving the image). -2. If the value is 0 (ie. 
the system state has just been restored from
-   the snapshot image), the suspending utility MUST close the snapshot
-   device. Afterwards it will be treated as a regular userland process,
-   so it need not exit.
-
-The resuming utility SHOULD NOT attempt to mount any file systems that could
-be mounted before suspend and SHOULD NOT attempt to perform any operations
-involving such file systems.
-
-For details, please refer to the source code.
diff --git a/Documentation/power/video.rst b/Documentation/power/video.rst
new file mode 100644
index 000000000000..337a2ba9f32f
--- /dev/null
+++ b/Documentation/power/video.rst
@@ -0,0 +1,213 @@
+===========================
+Video issues with S3 resume
+===========================
+
+2003-2006, Pavel Machek
+
+During S3 resume, hardware needs to be reinitialized. For most
+devices, this is easy, and the kernel driver knows how to do
+it. Unfortunately there's one exception: the video card. Video cards are
+usually initialized by the BIOS, and the kernel does not have enough
+information to boot the video card. (The kernel usually does not even
+contain a video card driver -- vesafb and vgacon are widely used).
+
+This is not a problem for swsusp, because during swsusp resume the BIOS
+is run normally, so the video card is initialized normally. It should
+not be a problem for S1 standby, because hardware should retain its
+state over that.
+
+We either have to run the video BIOS during early resume, or interpret it
+using vbetool later, or maybe nothing is necessary on a particular
+system because the video state is preserved. Unfortunately different
+methods work on different systems, and no known method suits all of
+them.
+
+A userland application called s2ram has been developed; it contains a
+long whitelist of systems, and automatically selects a working method
+for a given system. It can be downloaded from CVS at
+www.sf.net/projects/suspend . If you have a system that is not in the
+whitelist, please try to find a working solution, and submit a whitelist
+entry so that the work does not need to be repeated.
+
+Currently, the VBE_SAVE method (6 below) works on most
+systems. Unfortunately, vbetool only runs after userland is resumed,
+so it makes debugging of early resume problems
+hard/impossible. Methods that do not rely on userland are preferable.
+
+Details
+~~~~~~~
+
+There are a few types of systems where video works after S3 resume:
+
+(1) systems where the video state is preserved over S3.
+
+(2) systems where it is possible to call the video BIOS during S3
+    resume. Unfortunately, it is not correct to call the video BIOS at
+    that point, but it happens to work on some machines. Use
+    acpi_sleep=s3_bios.
+
+(3) systems that initialize the video card into VGA text mode and where
+    the BIOS works well enough to be able to set the video mode. Use
+    acpi_sleep=s3_mode on these.
+
+(4) on some systems s3_bios kicks video into text mode, and
+    acpi_sleep=s3_bios,s3_mode is needed.
+
+(5) radeon systems, where X can soft-boot your video card. You'll need
+    a new enough X, and a plain text console (no vesafb or radeonfb). See
+    http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
+    Alternatively, you should use vbetool (6) instead.
+
+(6) other radeon systems, where vbetool is enough to bring the system
+    back to life. It needs a working text console. Do vbetool vbestate
+    save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
+    vbestate restore < /tmp/delme; setfont <whatever>, and your video
+    should work.
+
+(7) on some systems, it is possible to boot most of the kernel, and then
+    POSTing the BIOS works.
Ole Rohne has patch to do just that at + http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2. + +(8) on some systems, you can use the video_post utility and or + do echo 3 > /sys/power/state && /usr/sbin/video_post - which will + initialize the display in console mode. If you are in X, you can switch + to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get + the display working in graphical mode again. + +Now, if you pass acpi_sleep=something, and it does not work with your +bios, you'll get a hard crash during resume. Be careful. Also it is +safest to do your experiments with plain old VGA console. The vesafb +and radeonfb (etc) drivers have a tendency to crash the machine during +resume. + +You may have a system where none of above works. At that point you +either invent another ugly hack that works, or write proper driver for +your video card (good luck getting docs :-(). Maybe suspending from X +(proper X, knowing your hardware, not XF68_FBcon) might have better +chance of working. + +Table of known working notebooks: + + +=============================== =============================================== +Model hack (or "how to do it") +=============================== =============================================== +Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI +Acer TM 230 s3_bios (2) +Acer TM 242FX vbetool (6) +Acer TM C110 video_post (8) +Acer TM C300 vga=normal (only suspend on console, not in X), + vbetool (6) or video_post (8) +Acer TM 4052LCi s3_bios (2) +Acer TM 636Lci s3_bios,s3_mode (4) +Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text + console back +Acer TM 660 ??? [#f1]_ +Acer TM 800 vga=normal, X patches, see webpage (5) + or vbetool (6) +Acer TM 803 vga=normal, X patches, see webpage (5) + or vbetool (6) +Acer TM 803LCi vga=normal, vbetool (6) +Arima W730a vbetool needed (6) +Asus L2400D s3_mode (3) [#f2]_ (S1 also works OK) +Asus L3350M (SiS 740) (6) +Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK) +Asus M6887Ne vga=normal, s3_bios (2), use radeon driver + instead of fglrx in x.org +Athlon64 desktop prototype s3_bios (2) +Compal CL-50 ??? [#f1]_ +Compaq Armada E500 - P3-700 none (1) (S1 also works OK) +Compaq Evo N620c vga=normal, s3_bios (2) +Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1 +Dell D600, ATI RV250 vga=normal and X, or try vbestate (6) +Dell D610 vga=normal and X (possibly vbestate (6) too, + but not tested) +Dell Inspiron 4000 ??? [#f1]_ +Dell Inspiron 500m ??? [#f1]_ +Dell Inspiron 510m ??? +Dell Inspiron 5150 vbetool needed (6) +Dell Inspiron 600m ??? [#f1]_ +Dell Inspiron 8200 ??? [#f1]_ +Dell Inspiron 8500 ??? [#f1]_ +Dell Inspiron 8600 ??? [#f1]_ +eMachines athlon64 machines vbetool needed (6) (someone please get + me model #s) +HP NC6000 s3_bios, may not use radeonfb (2); + or vbetool (6) +HP NX7000 ??? [#f1]_ +HP Pavilion ZD7000 vbetool post needed, need open-source nv + driver for X +HP Omnibook XE3 athlon version none (1) +HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV +HP Omnibook XE3L-GF vbetool (6) +HP Omnibook 5150 none (1), (S1 also works OK) +IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 + Savage/IX-MV, vesafb gets "interesting" + but X work. +IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with + BIOS 1.04 2002-08-23, but not at all with + BIOS 1.11 2004-11-05 :-(] +IBM TP R32 / Type 2658-MMG none (1) +IBM TP R40 2722B3G ??? [#f1]_ +IBM TP R50p / Type 1832-22U s3_bios (2) +IBM TP R51 none (1) +IBM TP T30 236681A ??? 
[#f1]_ +IBM TP T40 / Type 2373-MU4 none (1) +IBM TP T40p none (1) +IBM TP R40p s3_bios (2) +IBM TP T41p s3_bios (2), switch to X after resume +IBM TP T42 s3_bios (2) +IBM ThinkPad T42p (2373-GTG) s3_bios (2) +IBM TP X20 ??? [#f1]_ +IBM TP X30 s3_bios, s3_mode (4) +IBM TP X31 / Type 2672-XXH none (1), use radeontool + (http://fdd.com/software/radeon/) to + turn off backlight. +IBM TP X32 none (1), but backlight is on and video is + trashed after long suspend. s3_bios, + s3_mode (4) works too. Perhaps that gets + better results? +IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4) +IBM TP 600e none(1), but a switch to console and + back to X is needed +Medion MD4220 ??? [#f1]_ +Samsung P35 vbetool needed (6) +Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off +Sony Vaio PCG-C1VRX/K s3_bios (2) +Sony Vaio PCG-F403 ??? [#f1]_ +Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver +Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use + radeontool (http://fdd.com/software/radeon/) + to turn off backlight. +Sony Vaio PCG-N505SN ??? [#f1]_ +Sony Vaio vgn-s260 X or boot-radeon can init it (5) +Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will + be blank unless you return to X. +Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4) +Toshiba Libretto L5 none (1) +Toshiba Libretto 100CT/110CT vbetool (6) +Toshiba Portege 3020CT s3_mode (3) +Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK) +Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK) +Toshiba Satellite 4090XCDT ??? [#f1]_ +Toshiba Satellite P10-554 s3_bios,s3_mode (4)[#f3]_ +Toshiba M30 (2) xor X with nvidia driver using internal AGP +Uniwill 244IIO ??? [#f1]_ +=============================== =============================================== + +Known working desktop systems +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +=================== ============================= ======================== +Mainboard Graphics card hack (or "how to do it") +=================== ============================= ======================== +Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) +=================== ============================= ======================== + + +.. [#f1] from https://wiki.ubuntu.com/HoaryPMResults, not sure + which options to use. If you know, please tell me. + +.. [#f2] To be tested with a newer kernel. + +.. [#f3] Not with SMP kernel, UP only. diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt deleted file mode 100644 index 3e6272bc4472..000000000000 --- a/Documentation/power/video.txt +++ /dev/null @@ -1,185 +0,0 @@ - - Video issues with S3 resume - ~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 2003-2006, Pavel Machek - -During S3 resume, hardware needs to be reinitialized. For most -devices, this is easy, and kernel driver knows how to do -it. Unfortunately there's one exception: video card. Those are usually -initialized by BIOS, and kernel does not have enough information to -boot video card. (Kernel usually does not even contain video card -driver -- vesafb and vgacon are widely used). - -This is not problem for swsusp, because during swsusp resume, BIOS is -run normally so video card is normally initialized. It should not be -problem for S1 standby, because hardware should retain its state over -that. - -We either have to run video BIOS during early resume, or interpret it -using vbetool later, or maybe nothing is necessary on particular -system because video state is preserved. Unfortunately different -methods work on different systems, and no known method suits all of -them. 
- -Userland application called s2ram has been developed; it contains long -whitelist of systems, and automatically selects working method for a -given system. It can be downloaded from CVS at -www.sf.net/projects/suspend . If you get a system that is not in the -whitelist, please try to find a working solution, and submit whitelist -entry so that work does not need to be repeated. - -Currently, VBE_SAVE method (6 below) works on most -systems. Unfortunately, vbetool only runs after userland is resumed, -so it makes debugging of early resume problems -hard/impossible. Methods that do not rely on userland are preferable. - -Details -~~~~~~~ - -There are a few types of systems where video works after S3 resume: - -(1) systems where video state is preserved over S3. - -(2) systems where it is possible to call the video BIOS during S3 - resume. Unfortunately, it is not correct to call the video BIOS at - that point, but it happens to work on some machines. Use - acpi_sleep=s3_bios. - -(3) systems that initialize video card into vga text mode and where - the BIOS works well enough to be able to set video mode. Use - acpi_sleep=s3_mode on these. - -(4) on some systems s3_bios kicks video into text mode, and - acpi_sleep=s3_bios,s3_mode is needed. - -(5) radeon systems, where X can soft-boot your video card. You'll need - a new enough X, and a plain text console (no vesafb or radeonfb). See - http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information. - Alternatively, you should use vbetool (6) instead. - -(6) other radeon systems, where vbetool is enough to bring system back - to life. It needs text console to be working. Do vbetool vbestate - save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool - vbestate restore < /tmp/delme; setfont , and your video - should work. - -(7) on some systems, it is possible to boot most of kernel, and then - POSTing bios works. Ole Rohne has patch to do just that at - http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2. - -(8) on some systems, you can use the video_post utility and or - do echo 3 > /sys/power/state && /usr/sbin/video_post - which will - initialize the display in console mode. If you are in X, you can switch - to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get - the display working in graphical mode again. - -Now, if you pass acpi_sleep=something, and it does not work with your -bios, you'll get a hard crash during resume. Be careful. Also it is -safest to do your experiments with plain old VGA console. The vesafb -and radeonfb (etc) drivers have a tendency to crash the machine during -resume. - -You may have a system where none of above works. At that point you -either invent another ugly hack that works, or write proper driver for -your video card (good luck getting docs :-(). Maybe suspending from X -(proper X, knowing your hardware, not XF68_FBcon) might have better -chance of working. - -Table of known working notebooks: - -Model hack (or "how to do it") ------------------------------------------------------------------------------- -Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI -Acer TM 230 s3_bios (2) -Acer TM 242FX vbetool (6) -Acer TM C110 video_post (8) -Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8) -Acer TM 4052LCi s3_bios (2) -Acer TM 636Lci s3_bios,s3_mode (4) -Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back -Acer TM 660 ??? 
(*) -Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6) -Acer TM 803 vga=normal, X patches, see webpage (5) or vbetool (6) -Acer TM 803LCi vga=normal, vbetool (6) -Arima W730a vbetool needed (6) -Asus L2400D s3_mode (3)(***) (S1 also works OK) -Asus L3350M (SiS 740) (6) -Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK) -Asus M6887Ne vga=normal, s3_bios (2), use radeon driver instead of fglrx in x.org -Athlon64 desktop prototype s3_bios (2) -Compal CL-50 ??? (*) -Compaq Armada E500 - P3-700 none (1) (S1 also works OK) -Compaq Evo N620c vga=normal, s3_bios (2) -Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1 -Dell D600, ATI RV250 vga=normal and X, or try vbestate (6) -Dell D610 vga=normal and X (possibly vbestate (6) too, but not tested) -Dell Inspiron 4000 ??? (*) -Dell Inspiron 500m ??? (*) -Dell Inspiron 510m ??? -Dell Inspiron 5150 vbetool needed (6) -Dell Inspiron 600m ??? (*) -Dell Inspiron 8200 ??? (*) -Dell Inspiron 8500 ??? (*) -Dell Inspiron 8600 ??? (*) -eMachines athlon64 machines vbetool needed (6) (someone please get me model #s) -HP NC6000 s3_bios, may not use radeonfb (2); or vbetool (6) -HP NX7000 ??? (*) -HP Pavilion ZD7000 vbetool post needed, need open-source nv driver for X -HP Omnibook XE3 athlon version none (1) -HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV -HP Omnibook XE3L-GF vbetool (6) -HP Omnibook 5150 none (1), (S1 also works OK) -IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work. -IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(] -IBM TP R32 / Type 2658-MMG none (1) -IBM TP R40 2722B3G ??? (*) -IBM TP R50p / Type 1832-22U s3_bios (2) -IBM TP R51 none (1) -IBM TP T30 236681A ??? (*) -IBM TP T40 / Type 2373-MU4 none (1) -IBM TP T40p none (1) -IBM TP R40p s3_bios (2) -IBM TP T41p s3_bios (2), switch to X after resume -IBM TP T42 s3_bios (2) -IBM ThinkPad T42p (2373-GTG) s3_bios (2) -IBM TP X20 ??? (*) -IBM TP X30 s3_bios, s3_mode (4) -IBM TP X31 / Type 2672-XXH none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight. -IBM TP X32 none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results? -IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4) -IBM TP 600e none(1), but a switch to console and back to X is needed -Medion MD4220 ??? (*) -Samsung P35 vbetool needed (6) -Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off -Sony Vaio PCG-C1VRX/K s3_bios (2) -Sony Vaio PCG-F403 ??? (*) -Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver -Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight. -Sony Vaio PCG-N505SN ??? (*) -Sony Vaio vgn-s260 X or boot-radeon can init it (5) -Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will be blank unless you return to X. -Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4) -Toshiba Libretto L5 none (1) -Toshiba Libretto 100CT/110CT vbetool (6) -Toshiba Portege 3020CT s3_mode (3) -Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK) -Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK) -Toshiba Satellite 4090XCDT ??? (*) -Toshiba Satellite P10-554 s3_bios,s3_mode (4)(****) -Toshiba M30 (2) xor X with nvidia driver using internal AGP -Uniwill 244IIO ??? 
(*) - -Known working desktop systems -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Mainboard Graphics card hack (or "how to do it") ------------------------------------------------------------------------------- -Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) - - -(*) from https://wiki.ubuntu.com/HoaryPMResults, not sure - which options to use. If you know, please tell me. - -(***) To be tested with a newer kernel. - -(****) Not with SMP kernel, UP only. diff --git a/Documentation/process/submitting-drivers.rst b/Documentation/process/submitting-drivers.rst index 58bc047e7b95..1acaa14903d6 100644 --- a/Documentation/process/submitting-drivers.rst +++ b/Documentation/process/submitting-drivers.rst @@ -117,7 +117,7 @@ PM support: implemented") error. You should also try to make sure that your driver uses as little power as possible when it's not doing anything. For the driver testing instructions see - Documentation/power/drivers-testing.txt and for a relatively + Documentation/power/drivers-testing.rst and for a relatively complete overview of the power management issues related to drivers see :ref:`Documentation/driver-api/pm/devices.rst `. diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt index 197d81f4b836..d97207b9accb 100644 --- a/Documentation/scheduler/sched-energy.txt +++ b/Documentation/scheduler/sched-energy.txt @@ -22,7 +22,7 @@ the highest. The actual EM used by EAS is _not_ maintained by the scheduler, but by a dedicated framework. For details about this framework and what it provides, -please refer to its documentation (see Documentation/power/energy-model.txt). +please refer to its documentation (see Documentation/power/energy-model.rst). 2. Background and Terminology @@ -81,7 +81,7 @@ through the arch_scale_cpu_capacity() callback. The rest of platform knowledge used by EAS is directly read from the Energy Model (EM) framework. The EM of a platform is composed of a power cost table -per 'performance domain' in the system (see Documentation/power/energy-model.txt +per 'performance domain' in the system (see Documentation/power/energy-model.rst for futher details about performance domains). The scheduler manages references to the EM objects in the topology code when the @@ -352,7 +352,7 @@ could be amended in the future if proven otherwise. EAS uses the EM of a platform to estimate the impact of scheduling decisions on energy. So, your platform must provide power cost tables to the EM framework in order to make EAS start. To do so, please refer to documentation of the -independent EM framework in Documentation/power/energy-model.txt. +independent EM framework in Documentation/power/energy-model.rst. Please also note that the scheduling domains need to be re-built after the EM has been registered in order to start EAS. diff --git a/Documentation/trace/coresight-cpu-debug.txt b/Documentation/trace/coresight-cpu-debug.txt index f07e38094b40..1a660a39e3c0 100644 --- a/Documentation/trace/coresight-cpu-debug.txt +++ b/Documentation/trace/coresight-cpu-debug.txt @@ -151,7 +151,7 @@ At the runtime you can disable idle states with below methods: It is possible to disable CPU idle states by way of the PM QoS subsystem, more specifically by using the "/dev/cpu_dma_latency" -interface (see Documentation/power/pm_qos_interface.txt for more +interface (see Documentation/power/pm_qos_interface.rst for more details). As specified in the PM QoS documentation the requested parameter will stay in effect until the file descriptor is released. 
For example:
diff --git a/Documentation/translations/zh_CN/process/submitting-drivers.rst b/Documentation/translations/zh_CN/process/submitting-drivers.rst
index 72c6cd935821..f1c3906c69a8 100644
--- a/Documentation/translations/zh_CN/process/submitting-drivers.rst
+++ b/Documentation/translations/zh_CN/process/submitting-drivers.rst
@@ -97,7 +97,7 @@ Linux 2.6:
 	函数定义成返回 -ENOSYS(功能未实现)错误。你还应该尝试确
 	保你的驱动在什么都不干的情况下将耗电降到最低。要获得驱动
 	程序测试的指导,请参阅
-	Documentation/power/drivers-testing.txt。有关驱动程序电
+	Documentation/power/drivers-testing.rst。有关驱动程序电
 	源管理问题相对全面的概述,请参阅
 	Documentation/driver-api/pm/devices.rst。
diff --git a/MAINTAINERS b/MAINTAINERS
index 9c382053ce6a..5a6137df3f0e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6446,7 +6446,7 @@ M:	"Rafael J. Wysocki"
 M:	Pavel Machek
 L:	linux-pm@vger.kernel.org
 S:	Supported
-F:	Documentation/power/freezing-of-tasks.txt
+F:	Documentation/power/freezing-of-tasks.rst
 F:	include/linux/freezer.h
 F:	kernel/freezer.c
@@ -11764,7 +11764,7 @@ S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git
 F:	drivers/opp/
 F:	include/linux/pm_opp.h
-F:	Documentation/power/opp.txt
+F:	Documentation/power/opp.rst
 F:	Documentation/devicetree/bindings/opp/

 OPL4 DRIVER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bbbd4d1ba31..77a724771dbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2447,7 +2447,7 @@ menuconfig APM
 	  machines with more than one CPU.

 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and more information, read <file:Documentation/power/apm-acpi.rst>
 	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 066fd2a12851..10d040e2e807 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1175,7 +1175,7 @@ struct skl_wm_params {
 * to be disabled. This shouldn't happen and we'll print some error messages in
 * case it happens.
 *
- * For more, read the Documentation/power/runtime_pm.txt.
+ * For more, read the Documentation/power/runtime_pm.rst.
 */
struct i915_runtime_pm {
	atomic_t wakeref_count;
diff --git a/drivers/opp/Kconfig b/drivers/opp/Kconfig
index a7fbb93f302c..1f64a3d46c8a 100644
--- a/drivers/opp/Kconfig
+++ b/drivers/opp/Kconfig
@@ -10,4 +10,4 @@ config PM_OPP
 	  OPP layer organizes the data internally using device pointers
 	  representing individual voltage domains and provides SOC
 	  implementations a ready to use framework to manage OPPs.
-	  For more information, read <file:Documentation/power/opp.txt>
+	  For more information, read <file:Documentation/power/opp.rst>
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f7033ecf6d0b..11f9c875b028 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -607,7 +607,7 @@ int power_supply_get_battery_info(struct power_supply *psy,

 	/* The property and field names below must correspond to elements
 	 * in enum power_supply_property. For reasoning, see
-	 * Documentation/power/power_supply_class.txt.
+	 * Documentation/power/power_supply_class.rst.
 	 */
 	of_property_read_u32(battery_np, "energy-full-design-microwatt-hours",
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c7eef32e7739..5b8328a99b2a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -52,7 +52,7 @@
 *                irq line disabled until the threaded handler has been run.
 * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee
 *                that this interrupt will wake the system from a suspended
- *                state. See Documentation/power/suspend-and-interrupts.txt
+ *                state. See Documentation/power/suspend-and-interrupts.rst
 * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
 * IRQF_NO_THREAD - Interrupt cannot be threaded
 * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b74b2a4e6df2..3d9a167ca5c3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -807,7 +807,7 @@ struct module;
 * @suspend_late: Put device into low power state.
 * @resume_early: Wake device from low power state.
 * @resume: Wake device from low power state.
- *          (Please see Documentation/power/pci.txt for descriptions
+ *          (Please see Documentation/power/pci.rst for descriptions
 *          of PCI Power Management and the related functions.)
 * @shutdown: Hook into reboot_notifier_list (kernel/sys.c).
 *            Intended to stop any idling DMA operations.
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 66c19a65a514..c14ad8bc1a41 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -284,7 +284,7 @@ typedef struct pm_message {
 * actions to be performed by a device driver's callbacks generally depend on
 * the platform and subsystem the device belongs to.
 *
- * Refer to Documentation/power/runtime_pm.txt for more information about the
+ * Refer to Documentation/power/runtime_pm.rst for more information about the
 * role of the @runtime_suspend(), @runtime_resume() and @runtime_idle()
 * callbacks in device runtime power management.
 */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9bbaaab14b36..7a4dda9e5309 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -65,7 +65,7 @@ config HIBERNATION
 	  need to run mkswap against the swap partition used for the suspend.

 	  It also works with swap files to a limited extent (for details see
-	  <file:Documentation/power/swsusp-and-swap-files.txt>).
+	  <file:Documentation/power/swsusp-and-swap-files.rst>).

 	  Right now you may boot without resuming and resume later but in the
 	  meantime you cannot use the swap partition(s)/file(s) involved in
@@ -74,7 +74,7 @@ config HIBERNATION
 	  MOUNT any journaled filesystems mounted before the suspend or they
 	  will get corrupted in a nasty way.

-	  For more information take a look at <file:Documentation/power/swsusp.txt>.
+	  For more information take a look at <file:Documentation/power/swsusp.rst>.

 config ARCH_SAVE_PAGE_KEYS
 	bool
@@ -255,7 +255,7 @@ config APM_EMULATION
 	  notification of APM "events" (e.g. battery status change).

 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and more information, read <file:Documentation/power/apm-acpi.rst>
 	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index 41722046b937..0cd26289bfbc 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -165,7 +165,7 @@ config CFG80211_DEFAULT_PS

 	  If this causes your applications to misbehave you should fix your
 	  applications instead -- they need to register their network
-	  latency requirement, see Documentation/power/pm_qos_interface.txt.
+	  latency requirement, see Documentation/power/pm_qos_interface.rst.

 config CFG80211_DEBUGFS
	bool "cfg80211 DebugFS entries"
-- cgit v1.2.3-59-g8ed1b

From e1714daad7cf8fe4d6dd91adcfbbdd0604b0210d Mon Sep 17 00:00:00 2001
From: Wolfram Sang
Date: Mon, 3 Jun 2019 10:25:31 +0200
Subject: i2c: headers: don't use 'dev' as adapter variable

It is not a struct device, so 'dev' is confusing. Use 'adap', the most
common name.
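
For illustration only (this sketch is not part of the patch and the
"foo" names are made up), a bus driver typically uses these helpers to
attach its private state to the adapter, which is what the renamed
parameter now reflects:

	/* hypothetical bus driver keeping its state behind the adapter */
	struct foo_i2c {
		struct i2c_adapter adap;
		void __iomem *base;	/* device registers */
	};

	static u32 foo_func(struct i2c_adapter *adap)
	{
		struct foo_i2c *foo = i2c_get_adapdata(adap);

		/* private state is reachable from the adapter again */
		return foo->base ? I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL : 0;
	}

	static int foo_register(struct foo_i2c *foo)
	{
		/* adapter name, algo, owner, etc. would be set up here */
		i2c_set_adapdata(&foo->adap, foo);
		return i2c_add_adapter(&foo->adap);
	}

With 'adap' in the prototypes, the header now reads the same way the
call sites do.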
Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e982b8913b73..6bd199cfe61f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -703,14 +703,14 @@ struct i2c_adapter { }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) -static inline void *i2c_get_adapdata(const struct i2c_adapter *dev) +static inline void *i2c_get_adapdata(const struct i2c_adapter *adap) { - return dev_get_drvdata(&dev->dev); + return dev_get_drvdata(&adap->dev); } -static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data) +static inline void i2c_set_adapdata(struct i2c_adapter *adap, void *data) { - dev_set_drvdata(&dev->dev, data); + dev_set_drvdata(&adap->dev, data); } static inline struct i2c_adapter * -- cgit v1.2.3-59-g8ed1b From d68222d4d6647611be5a32c80a53a145e7c80ce9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 3 Jun 2019 10:25:32 +0200 Subject: i2c: headers: always have a named variable in arguments Much better to read and understand. Naming for i2c_adapter is not consistent (yet), so use the name which is also used in core code. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 6bd199cfe61f..14e04fb4f46f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -40,7 +40,8 @@ struct i2c_device_identity; union i2c_smbus_data; struct i2c_board_info; enum i2c_slave_event; -typedef int (*i2c_slave_cb_t)(struct i2c_client *, enum i2c_slave_event, u8 *); +typedef int (*i2c_slave_cb_t)(struct i2c_client *client, + enum i2c_slave_event event, u8 *val); struct module; struct property_entry; @@ -257,16 +258,16 @@ struct i2c_driver { unsigned int class; /* Standard driver model interfaces */ - int (*probe)(struct i2c_client *, const struct i2c_device_id *); - int (*remove)(struct i2c_client *); + int (*probe)(struct i2c_client *client, const struct i2c_device_id *id); + int (*remove)(struct i2c_client *client); /* New driver model interface to aid the seamless removal of the * current probe()'s, more commonly unused than used second parameter. */ - int (*probe_new)(struct i2c_client *); + int (*probe_new)(struct i2c_client *client); /* driver model interfaces that don't relate to enumeration */ - void (*shutdown)(struct i2c_client *); + void (*shutdown)(struct i2c_client *client); /* Alert callback, for example for the SMBus alert protocol. * The format and meaning of the data value depends on the protocol. @@ -275,7 +276,7 @@ struct i2c_driver { * For the SMBus Host Notify protocol, the data corresponds to the * 16-bit payload data reported by the slave device acting as master. 
*/ - void (*alert)(struct i2c_client *, enum i2c_alert_protocol protocol, + void (*alert)(struct i2c_client *client, enum i2c_alert_protocol protocol, unsigned int data); /* a ioctl like command that can be used to perform specific functions @@ -287,7 +288,7 @@ struct i2c_driver { const struct i2c_device_id *id_table; /* Device detection callback for automatic device creation */ - int (*detect)(struct i2c_client *, struct i2c_board_info *); + int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; struct list_head clients; @@ -447,10 +448,10 @@ extern struct i2c_client * i2c_new_probed_device(struct i2c_adapter *adap, struct i2c_board_info *info, unsigned short const *addr_list, - int (*probe)(struct i2c_adapter *, unsigned short addr)); + int (*probe)(struct i2c_adapter *adap, unsigned short addr)); /* Common custom probe functions */ -extern int i2c_probe_func_quick_read(struct i2c_adapter *, unsigned short addr); +extern int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr); /* For devices that use several addresses, use i2c_new_dummy() to make * client handles for the extra addresses. @@ -466,7 +467,7 @@ i2c_new_secondary_device(struct i2c_client *client, const char *name, u16 default_addr); -extern void i2c_unregister_device(struct i2c_client *); +extern void i2c_unregister_device(struct i2c_client *client); #endif /* I2C */ /* Mainboard arch_initcall() code should register all its I2C devices. @@ -551,9 +552,9 @@ struct i2c_algorithm { * The main operations are wrapped by i2c_lock_bus and i2c_unlock_bus. */ struct i2c_lock_operations { - void (*lock_bus)(struct i2c_adapter *, unsigned int flags); - int (*trylock_bus)(struct i2c_adapter *, unsigned int flags); - void (*unlock_bus)(struct i2c_adapter *, unsigned int flags); + void (*lock_bus)(struct i2c_adapter *adapter, unsigned int flags); + int (*trylock_bus)(struct i2c_adapter *adapter, unsigned int flags); + void (*unlock_bus)(struct i2c_adapter *adapter, unsigned int flags); }; /** @@ -726,7 +727,7 @@ i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter) return NULL; } -int i2c_for_each_dev(void *data, int (*fn)(struct device *, void *)); +int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data)); /* Adapter locking functions, exported for shared pin cases */ #define I2C_LOCK_ROOT_ADAPTER BIT(0) @@ -832,12 +833,12 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) /* administration... */ #if IS_ENABLED(CONFIG_I2C) -extern int i2c_add_adapter(struct i2c_adapter *); -extern void i2c_del_adapter(struct i2c_adapter *); -extern int i2c_add_numbered_adapter(struct i2c_adapter *); +extern int i2c_add_adapter(struct i2c_adapter *adap); +extern void i2c_del_adapter(struct i2c_adapter *adap); +extern int i2c_add_numbered_adapter(struct i2c_adapter *adap); -extern int i2c_register_driver(struct module *, struct i2c_driver *); -extern void i2c_del_driver(struct i2c_driver *); +extern int i2c_register_driver(struct module *owner, struct i2c_driver *driver); +extern void i2c_del_driver(struct i2c_driver *driver); /* use a define to avoid include chaining to get THIS_MODULE */ #define i2c_add_driver(driver) \ -- cgit v1.2.3-59-g8ed1b From 2caea56f569ac361fc854f6bf2fe94b70514c917 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 3 Jun 2019 10:25:34 +0200 Subject: i2c: headers: update docs about I2C_CLIENT_* Update kerneldoc for i2c client flags because they increased over time. 
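
As a purely hypothetical usage sketch (not part of this patch; "foo" is
a made-up driver name), a client driver would typically test these
flags like so:

	static int foo_check_client(struct i2c_client *client)
	{
		if (client->flags & I2C_CLIENT_TEN)
			return -EOPNOTSUPP;	/* only 7-bit addresses handled */

		if (client->flags & I2C_CLIENT_PEC)
			dev_dbg(&client->dev, "SMBus PEC enabled\n");

		return 0;
	}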
Also, move them to a position where they can be more easily found. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 14e04fb4f46f..9853fae9b505 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -298,8 +298,7 @@ struct i2c_driver { /** * struct i2c_client - represent an I2C slave device - * @flags: I2C_CLIENT_TEN indicates the device uses a ten bit chip address; - * I2C_CLIENT_PEC indicates it uses SMBus Packet Error Checking + * @flags: see I2C_CLIENT_* for possible flags * @addr: Address used on the I2C bus connected to the parent adapter. * @name: Indicates the type of the device, usually a chip name that's * generic enough to hide second-sourcing and compatible revisions. @@ -317,6 +316,15 @@ struct i2c_driver { */ struct i2c_client { unsigned short flags; /* div., see below */ +#define I2C_CLIENT_PEC 0x04 /* Use Packet Error Checking */ +#define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ + /* Must equal I2C_M_TEN below */ +#define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ +#define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ +#define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ +#define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ + /* Must match I2C_M_STOP|IGNORE_NAK */ + unsigned short addr; /* chip address - NOTE: 7bit */ /* addresses are stored in the */ /* _LOWER_ 7 bits */ @@ -803,16 +811,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER); } -/*flags for the client struct: */ -#define I2C_CLIENT_PEC 0x04 /* Use Packet Error Checking */ -#define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ - /* Must equal I2C_M_TEN below */ -#define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ -#define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ -#define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ -#define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ - /* Must match I2C_M_STOP|IGNORE_NAK */ - /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ #define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ -- cgit v1.2.3-59-g8ed1b From 76cc9f0efd952d376e93e79b1f19fd6fdb8291bc Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 3 Jun 2019 10:25:35 +0200 Subject: i2c: headers: reformat header comment and update copyright Let's stick to coding style. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 9853fae9b505..d8f9060179d0 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1,16 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* ------------------------------------------------------------------------- */ -/* */ -/* i2c.h - definitions for the i2c-bus interface */ -/* */ -/* ------------------------------------------------------------------------- */ -/* Copyright (C) 1995-2000 Simon G. Vogl - +/* + * i2c.h - definitions for the Linux i2c bus interface + * Copyright (C) 1995-2000 Simon G. 
Vogl + * Copyright (C) 2013-2019 Wolfram Sang + * + * With some changes from Kyösti Mälkki and + * Frodo Looijaard */ -/* ------------------------------------------------------------------------- */ - -/* With some changes from Kyösti Mälkki and - Frodo Looijaard */ #ifndef _LINUX_I2C_H #define _LINUX_I2C_H -- cgit v1.2.3-59-g8ed1b From 013e868bc9465452c7b667830712ab57de236d08 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 15 May 2019 15:38:47 +0530 Subject: mfd: lp87565: Add support for 4-phase LP87561 combination Add support for 4-phase LP87561 combination. Data Sheet: https://www.ti.com/lit/ds/symlink/lp87561-q1.pdf Signed-off-by: Keerthy Signed-off-by: Lee Jones --- drivers/mfd/lp87565.c | 4 ++++ include/linux/mfd/lp87565.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/lp87565.c b/drivers/mfd/lp87565.c index 32d2a07d4354..8ad688fe75f9 100644 --- a/drivers/mfd/lp87565.c +++ b/drivers/mfd/lp87565.c @@ -33,6 +33,10 @@ static const struct of_device_id of_lp87565_match_table[] = { .compatible = "ti,lp87565-q1", .data = (void *)LP87565_DEVICE_TYPE_LP87565_Q1, }, + { + .compatible = "ti,lp87561-q1", + .data = (void *)LP87565_DEVICE_TYPE_LP87561_Q1, + }, {} }; MODULE_DEVICE_TABLE(of, of_lp87565_match_table); diff --git a/include/linux/mfd/lp87565.h b/include/linux/mfd/lp87565.h index d0c91ba65525..976447607ea2 100644 --- a/include/linux/mfd/lp87565.h +++ b/include/linux/mfd/lp87565.h @@ -17,6 +17,7 @@ enum lp87565_device_type { LP87565_DEVICE_TYPE_UNKNOWN = 0, + LP87565_DEVICE_TYPE_LP87561_Q1, LP87565_DEVICE_TYPE_LP87565_Q1, }; @@ -249,6 +250,7 @@ enum LP87565_regulator_id { LP87565_BUCK_3, LP87565_BUCK_10, LP87565_BUCK_23, + LP87565_BUCK_3210, }; /** -- cgit v1.2.3-59-g8ed1b From e7488e58c7cfe4be0c52db68622a0397bb75258e Mon Sep 17 00:00:00 2001 From: Yurii Pavlovskyi Date: Tue, 14 May 2019 20:59:01 +0200 Subject: platform/x86: wmi: Add function to get _UID of WMI device Add a new function to acpi.h / wmi.c that returns _UID of the ACPI WMI device. For example, it returns "ATK" for the following declaration in DSDT: Device (ATKD) { Name (_HID, "PNP0C14" /* Windows Management Instrumentation Device */) // _HID: Hardware ID Name (_UID, "ATK") // _UID: Unique ID .. Generally, it is possible that multiple PNP0C14 ACPI devices are present in the system as mentioned in the commit message of commit bff431e49ff5 ("ACPI: WMI: Add ACPI-WMI mapping driver"). Therefore the _UID is returned for a specific ACPI device that declares the given GUID, to which it is also mapped by other methods of wmi module. Signed-off-by: Yurii Pavlovskyi Signed-off-by: Andy Shevchenko --- drivers/platform/x86/wmi.c | 19 +++++++++++++++++++ include/linux/acpi.h | 1 + 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 7b26b6ccf1a0..b08ffb769cbe 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -635,6 +635,25 @@ bool wmi_has_guid(const char *guid_string) } EXPORT_SYMBOL_GPL(wmi_has_guid); +/** + * wmi_get_acpi_device_uid() - Get _UID name of ACPI device that defines GUID + * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * + * Find the _UID of ACPI device associated with this WMI GUID. 
+ * + * Return: The ACPI _UID field value or NULL if the WMI GUID was not found + */ +char *wmi_get_acpi_device_uid(const char *guid_string) +{ + struct wmi_block *wblock = NULL; + + if (!find_guid(guid_string, &wblock)) + return NULL; + + return acpi_device_uid(wblock->acpi_device); +} +EXPORT_SYMBOL_GPL(wmi_get_acpi_device_uid); + static struct wmi_block *dev_to_wblock(struct device *dev) { return container_of(dev, struct wmi_block, dev.dev); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 98440df7fe42..d867a9a904f9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -380,6 +380,7 @@ extern acpi_status wmi_install_notify_handler(const char *guid, extern acpi_status wmi_remove_notify_handler(const char *guid); extern acpi_status wmi_get_event_data(u32 event, struct acpi_buffer *out); extern bool wmi_has_guid(const char *guid); +extern char *wmi_get_acpi_device_uid(const char *guid); #endif /* CONFIG_ACPI_WMI */ -- cgit v1.2.3-59-g8ed1b From e0668f28888184f6c633110a37386f2d4a6fa00e Mon Sep 17 00:00:00 2001 From: Yurii Pavlovskyi Date: Tue, 14 May 2019 21:00:31 +0200 Subject: platform/x86: asus-wmi: Improve DSTS WMI method ID detection The DSTS method detection mistakenly selects DCTS instead of DSTS if nothing is returned when the method ID is not defined in WMNB. As a result, the control of keyboard backlight is not functional for TUF Gaming series laptops. Implement detection based on _UID of the WMI device instead. There is evidence that DCTS is handled by ACPI WMI devices that have _UID ASUSWMI, whereas none of the devices without ASUSWMI respond to DCTS and DSTS is used instead [1]. DSDT examples: FX505GM (_UID ATK): Method (WMNB, 3, Serialized) { ... If ((Local0 == 0x53545344)) { ... Return (Zero) } ... // No return } K54C (_UID ATK): Method (WMNB, 3, Serialized) { ... If ((Local0 == 0x53545344)) { ... Return (0x02) } ... 
Return (0xFFFFFFFE) } [1] Link: https://lkml.org/lkml/2019/4/11/322 Signed-off-by: Yurii Pavlovskyi Suggested-by: Daniel Drake Signed-off-by: Andy Shevchenko --- drivers/hid/hid-asus.c | 2 +- drivers/platform/x86/asus-wmi.c | 23 ++++++++++++++++++++--- include/linux/platform_data/x86/asus-wmi.h | 4 ++-- 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index 336aeaed1159..1d01fe23ca0c 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -396,7 +396,7 @@ static bool asus_kbd_wmi_led_control_present(struct hid_device *hdev) if (!IS_ENABLED(CONFIG_ASUS_WMI)) return false; - ret = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS2, + ret = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, ASUS_WMI_DEVID_KBD_BACKLIGHT, 0, &value); hid_dbg(hdev, "WMI backlight check: rc %d value %x", ret, value); if (ret) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index c67f11e0d6e7..ef526dcfeac5 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -83,6 +83,8 @@ MODULE_LICENSE("GPL"); #define USB_INTEL_XUSB2PR 0xD0 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 +#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" + static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL }; static bool ashs_present(void) @@ -1874,6 +1876,8 @@ static int asus_wmi_sysfs_init(struct platform_device *device) */ static int asus_wmi_platform_init(struct asus_wmi *asus) { + struct device *dev = &asus->platform_device->dev; + char *wmi_uid; int rv; /* INIT enable hotkeys on some models */ @@ -1903,11 +1907,24 @@ static int asus_wmi_platform_init(struct asus_wmi *asus) * Note, on most Eeepc, there is no way to check if a method exist * or note, while on notebooks, they returns 0xFFFFFFFE on failure, * but once again, SPEC may probably be used for that kind of things. + * + * Additionally at least TUF Gaming series laptops return nothing for + * unknown methods, so the detection in this way is not possible. + * + * There is strong indication that only ACPI WMI devices that have _UID + * equal to "ASUSWMI" use DCTS whereas those with "ATK" use DSTS. 
*/ - if (!asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, 0, 0, NULL)) + wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID); + if (!wmi_uid) + return -ENODEV; + + if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) { + dev_info(dev, "Detected ASUSWMI, use DCTS\n"); + asus->dsts_id = ASUS_WMI_METHODID_DCTS; + } else { + dev_info(dev, "Detected %s, not ASUSWMI, use DSTS\n", wmi_uid); asus->dsts_id = ASUS_WMI_METHODID_DSTS; - else - asus->dsts_id = ASUS_WMI_METHODID_DSTS2; + } /* CWAP allow to define the behavior of the Fn+F2 key, * this method doesn't seems to be present on Eee PCs */ diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index bfba245636a7..0668f76df921 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -18,8 +18,8 @@ #define ASUS_WMI_METHODID_GDSP 0x50534447 /* Get DiSPlay output */ #define ASUS_WMI_METHODID_DEVP 0x50564544 /* DEVice Policy */ #define ASUS_WMI_METHODID_OSVR 0x5256534F /* OS VeRsion */ -#define ASUS_WMI_METHODID_DSTS 0x53544344 /* Device STatuS */ -#define ASUS_WMI_METHODID_DSTS2 0x53545344 /* Device STatuS #2*/ +#define ASUS_WMI_METHODID_DCTS 0x53544344 /* Device status (DCTS) */ +#define ASUS_WMI_METHODID_DSTS 0x53545344 /* Device status (DSTS) */ #define ASUS_WMI_METHODID_BSTS 0x53545342 /* Bios STatuS ? */ #define ASUS_WMI_METHODID_DEVS 0x53564544 /* DEVice Set */ #define ASUS_WMI_METHODID_CFVS 0x53564643 /* CPU Frequency Volt Set */ -- cgit v1.2.3-59-g8ed1b From b096f626a6827ad2ced5ebdbdc04e62422d463f6 Mon Sep 17 00:00:00 2001 From: Yurii Pavlovskyi Date: Tue, 14 May 2019 21:07:05 +0200 Subject: platform/x86: asus-wmi: Switch fan boost mode The WMI exposes a write-only device ID where up to three fan modes can be switched on some laptops (TUF Gaming FX505GM). There is a hotkey combination Fn-F5 that does have a fan icon, which is designed to toggle between fan modes. The DSTS of the device ID returns information about the presence of this capability and the presence of each of the two additional fan modes as a bitmask (0x01 - overboost present, 0x02 - silent present) [1]. Add a SysFS entry that reads the last written value and updates value in WMI on write and a hotkey handler that toggles the modes taking into account their availability according to DSTS. Modes: * 0x00 - normal or balanced, * 0x01 - overboost, increased fan RPM, * 0x02 - silent, decreased fan RPM [1] Link: https://lkml.org/lkml/2019/4/12/110 Signed-off-by: Yurii Pavlovskyi Suggested-by: Daniel Drake Signed-off-by: Andy Shevchenko --- Documentation/ABI/testing/sysfs-platform-asus-wmi | 10 ++ drivers/platform/x86/asus-wmi.c | 151 ++++++++++++++++++++-- include/linux/platform_data/x86/asus-wmi.h | 1 + 3 files changed, 154 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi index 019e1e29370e..87ae5cc983bf 100644 --- a/Documentation/ABI/testing/sysfs-platform-asus-wmi +++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi @@ -36,3 +36,13 @@ KernelVersion: 3.5 Contact: "AceLan Kao" Description: Resume on lid open. 1 means on, 0 means off. 
+ +What: /sys/devices/platform//fan_mode +Date: Apr 2019 +KernelVersion: 5.2 +Contact: "Yurii Pavlovskyi" +Description: + Fan boost mode: + * 0 - normal, + * 1 - overboost, + * 2 - silent diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index a1d85667383c..5712bc56fa10 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -70,6 +70,7 @@ MODULE_LICENSE("GPL"); #define NOTIFY_KBD_BRTUP 0xc4 #define NOTIFY_KBD_BRTDWN 0xc5 #define NOTIFY_KBD_BRTTOGGLE 0xc7 +#define NOTIFY_KBD_FBM 0x99 #define ASUS_WMI_FNLOCK_BIOS_DISABLED BIT(0) @@ -80,6 +81,13 @@ MODULE_LICENSE("GPL"); #define ASUS_FAN_CTRL_MANUAL 1 #define ASUS_FAN_CTRL_AUTO 2 +#define ASUS_FAN_MODE_NORMAL 0 +#define ASUS_FAN_MODE_OVERBOOST 1 +#define ASUS_FAN_MODE_OVERBOOST_MASK 0x01 +#define ASUS_FAN_MODE_SILENT 2 +#define ASUS_FAN_MODE_SILENT_MASK 0x02 +#define ASUS_FAN_MODES_MASK 0x03 + #define USB_INTEL_XUSB2PR 0xD0 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 @@ -187,6 +195,10 @@ struct asus_wmi { int asus_hwmon_num_fans; int asus_hwmon_pwm; + bool fan_mode_available; + u8 fan_mode_mask; + u8 fan_mode; + struct hotplug_slot hotplug_slot; struct mutex hotplug_lock; struct mutex wmi_lock; @@ -1483,6 +1495,116 @@ static int asus_wmi_fan_init(struct asus_wmi *asus) return 0; } +/* Fan mode *******************************************************************/ + +static int fan_mode_check_present(struct asus_wmi *asus) +{ + u32 result; + int err; + + asus->fan_mode_available = false; + + err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_MODE, &result); + if (err) { + if (err == -ENODEV) + return 0; + else + return err; + } + + if ((result & ASUS_WMI_DSTS_PRESENCE_BIT) && + (result & ASUS_FAN_MODES_MASK)) { + asus->fan_mode_available = true; + asus->fan_mode_mask = result & ASUS_FAN_MODES_MASK; + } + + return 0; +} + +static int fan_mode_write(struct asus_wmi *asus) +{ + int err; + u8 value; + u32 retval; + + value = asus->fan_mode; + + pr_info("Set fan mode: %u\n", value); + err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_MODE, value, &retval); + + if (err) { + pr_warn("Failed to set fan mode: %d\n", err); + return err; + } + + if (retval != 1) { + pr_warn("Failed to set fan mode (retval): 0x%x\n", retval); + return -EIO; + } + + return 0; +} + +static int fan_mode_switch_next(struct asus_wmi *asus) +{ + if (asus->fan_mode == ASUS_FAN_MODE_NORMAL) { + if (asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK) + asus->fan_mode = ASUS_FAN_MODE_OVERBOOST; + else if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK) + asus->fan_mode = ASUS_FAN_MODE_SILENT; + } else if (asus->fan_mode == ASUS_FAN_MODE_OVERBOOST) { + if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK) + asus->fan_mode = ASUS_FAN_MODE_SILENT; + else + asus->fan_mode = ASUS_FAN_MODE_NORMAL; + } else { + asus->fan_mode = ASUS_FAN_MODE_NORMAL; + } + + return fan_mode_write(asus); +} + +static ssize_t fan_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct asus_wmi *asus = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", asus->fan_mode); +} + +static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int result; + u8 new_mode; + + struct asus_wmi *asus = dev_get_drvdata(dev); + + result = kstrtou8(buf, 10, &new_mode); + if (result < 0) { + pr_warn("Trying to store invalid value\n"); + return result; + } + + if (new_mode == ASUS_FAN_MODE_OVERBOOST) { + if (!(asus->fan_mode_mask & 
ASUS_FAN_MODE_OVERBOOST_MASK)) + return -EINVAL; + } else if (new_mode == ASUS_FAN_MODE_SILENT) { + if (!(asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)) + return -EINVAL; + } else if (new_mode != ASUS_FAN_MODE_NORMAL) { + return -EINVAL; + } + + asus->fan_mode = new_mode; + fan_mode_write(asus); + + return result; +} + +// Fan mode: 0 - normal, 1 - overboost, 2 - silent +static DEVICE_ATTR_RW(fan_mode); + /* Backlight ******************************************************************/ static int read_backlight_power(struct asus_wmi *asus) @@ -1761,6 +1883,11 @@ static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus) return; } + if (asus->fan_mode_available && code == NOTIFY_KBD_FBM) { + fan_mode_switch_next(asus); + return; + } + if (is_display_toggle(code) && asus->driver->quirks->no_display_toggle) return; @@ -1917,6 +2044,7 @@ static struct attribute *platform_attributes[] = { &dev_attr_touchpad.attr, &dev_attr_lid_resume.attr, &dev_attr_als_enable.attr, + &dev_attr_fan_mode.attr, NULL }; @@ -1938,6 +2066,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj, devid = ASUS_WMI_DEVID_LID_RESUME; else if (attr == &dev_attr_als_enable.attr) devid = ASUS_WMI_DEVID_ALS_ENABLE; + else if (attr == &dev_attr_fan_mode.attr) + ok = asus->fan_mode_available; if (devid != -1) ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0); @@ -2037,12 +2167,7 @@ static int asus_wmi_platform_init(struct asus_wmi *asus) asus_wmi_set_devstate(ASUS_WMI_DEVID_CWAP, asus->driver->quirks->wapf, NULL); - return asus_wmi_sysfs_init(asus->platform_device); -} - -static void asus_wmi_platform_exit(struct asus_wmi *asus) -{ - asus_wmi_sysfs_exit(asus->platform_device); + return 0; } /* debugfs ********************************************************************/ @@ -2200,6 +2325,14 @@ static int asus_wmi_add(struct platform_device *pdev) if (err) goto fail_platform; + err = fan_mode_check_present(asus); + if (err) + goto fail_fan_mode; + + err = asus_wmi_sysfs_init(asus->platform_device); + if (err) + goto fail_sysfs; + err = asus_wmi_input_init(asus); if (err) goto fail_input; @@ -2277,7 +2410,9 @@ fail_leds: fail_hwmon: asus_wmi_input_exit(asus); fail_input: - asus_wmi_platform_exit(asus); + asus_wmi_sysfs_exit(asus->platform_device); +fail_sysfs: +fail_fan_mode: fail_platform: kfree(asus); return err; @@ -2294,7 +2429,7 @@ static int asus_wmi_remove(struct platform_device *device) asus_wmi_led_exit(asus); asus_wmi_rfkill_exit(asus); asus_wmi_debugfs_exit(asus); - asus_wmi_platform_exit(asus); + asus_wmi_sysfs_exit(asus->platform_device); asus_hwmon_fan_set_auto(asus); kfree(asus); diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 0668f76df921..8551156b8dca 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -57,6 +57,7 @@ #define ASUS_WMI_DEVID_KBD_BACKLIGHT 0x00050021 #define ASUS_WMI_DEVID_LIGHT_SENSOR 0x00050022 /* ?? */ #define ASUS_WMI_DEVID_LIGHTBAR 0x00050025 +#define ASUS_WMI_DEVID_FAN_MODE 0x00110018 /* Misc */ #define ASUS_WMI_DEVID_CAMERA 0x00060013 -- cgit v1.2.3-59-g8ed1b From a48e23385fcf397e69e2a75d72a81c545ec8bec2 Mon Sep 17 00:00:00 2001 From: Mattias Jacobsson <2pi@mok.nu> Date: Mon, 27 May 2019 18:21:29 +0200 Subject: platform/x86: wmi: add context pointer field to struct wmi_device_id When using wmi_install_notify_handler() to initialize a WMI handler a data pointer can be supplied which will be passed on to the notification handler. 
No similar feature exists when handling WMI events via struct wmi_driver.
Add a context pointer field to struct wmi_device_id and add a function
find_guid_context() to retrieve that context pointer.

Signed-off-by: Mattias Jacobsson <2pi@mok.nu>
Signed-off-by: Andy Shevchenko
---
 drivers/platform/x86/wmi.c      | 22 ++++++++++++++++++++++
 include/linux/mod_devicetable.h |  1 +
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index b08ffb769cbe..f3be1c008856 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -146,6 +146,28 @@ static bool find_guid(const char *guid_string, struct wmi_block **out)
 	return false;
 }

+static const void *find_guid_context(struct wmi_block *wblock,
+				     struct wmi_driver *wdriver)
+{
+	const struct wmi_device_id *id;
+	uuid_le guid_input;
+
+	if (wblock == NULL || wdriver == NULL)
+		return NULL;
+	if (wdriver->id_table == NULL)
+		return NULL;
+
+	id = wdriver->id_table;
+	while (*id->guid_string) {
+		if (uuid_le_to_bin(id->guid_string, &guid_input))
+			break;	/* malformed table entry */
+		if (!memcmp(wblock->gblock.guid, &guid_input, 16))
+			return id->context;
+		id++;
+	}
+	return NULL;
+}
+
 static int get_subobj_info(acpi_handle handle, const char *pathname,
			    struct acpi_device_info **info)
 {
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 448621c32e4d..09366859aac2 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -798,6 +798,7 @@ struct tee_client_device_id {
 */
 struct wmi_device_id {
	const char guid_string[UUID_STRING_LEN+1];
+	const void *context;
 };

 #endif /* LINUX_MOD_DEVICETABLE_H */
-- cgit v1.2.3-59-g8ed1b

From 440c4983de262f78033ec58f6abcd199a664327d Mon Sep 17 00:00:00 2001
From: Mattias Jacobsson <2pi@mok.nu>
Date: Mon, 27 May 2019 18:21:30 +0200
Subject: platform/x86: wmi: add context argument to the probe function

The struct wmi_device_id has a context pointer field; forward this
pointer as an argument to the probe function in struct wmi_driver.
Update existing users of the same probe function to accept this new
context argument.
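
As a minimal sketch of the result (illustrative only; the "foo" names
are invented and the GUID is the placeholder one from the wmi.c
kerneldoc), a driver can now attach per-GUID data and receive it at
probe time:

	struct foo_data {
		unsigned int quirks;
	};

	static const struct foo_data foo_model_data = { .quirks = 1 };

	static const struct wmi_device_id foo_wmi_id_table[] = {
		{ .guid_string = "fa50ff2b-f2e8-45de-83fa-65417f2f49ba",
		  .context = &foo_model_data },
		{ }
	};

	static int foo_wmi_probe(struct wmi_device *wdev, const void *context)
	{
		const struct foo_data *data = context;	/* NULL if no match */

		if (data && data->quirks)
			dev_info(&wdev->dev, "applying quirks\n");

		return 0;
	}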
Signed-off-by: Mattias Jacobsson <2pi@mok.nu> Signed-off-by: Andy Shevchenko --- drivers/platform/x86/dell-smbios-wmi.c | 2 +- drivers/platform/x86/dell-wmi-descriptor.c | 3 ++- drivers/platform/x86/dell-wmi.c | 2 +- drivers/platform/x86/huawei-wmi.c | 2 +- drivers/platform/x86/intel-wmi-thunderbolt.c | 3 ++- drivers/platform/x86/wmi-bmof.c | 2 +- drivers/platform/x86/wmi.c | 3 ++- include/linux/wmi.h | 2 +- 8 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/dell-smbios-wmi.c b/drivers/platform/x86/dell-smbios-wmi.c index c3ed3c8c17b9..add2687079f7 100644 --- a/drivers/platform/x86/dell-smbios-wmi.c +++ b/drivers/platform/x86/dell-smbios-wmi.c @@ -146,7 +146,7 @@ fail_smbios_cmd: return ret; } -static int dell_smbios_wmi_probe(struct wmi_device *wdev) +static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context) { struct wmi_driver *wdriver = container_of(wdev->dev.driver, struct wmi_driver, driver); diff --git a/drivers/platform/x86/dell-wmi-descriptor.c b/drivers/platform/x86/dell-wmi-descriptor.c index 14ab250b7d5a..9994fd1a5acf 100644 --- a/drivers/platform/x86/dell-wmi-descriptor.c +++ b/drivers/platform/x86/dell-wmi-descriptor.c @@ -106,7 +106,8 @@ EXPORT_SYMBOL_GPL(dell_wmi_get_hotfix); * WMI buffer length 12 4 * WMI hotfix number 16 4 */ -static int dell_wmi_descriptor_probe(struct wmi_device *wdev) +static int dell_wmi_descriptor_probe(struct wmi_device *wdev, + const void *context) { union acpi_object *obj = NULL; struct descriptor_priv *priv; diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c index d118bb73fcae..72b0a69a6ed0 100644 --- a/drivers/platform/x86/dell-wmi.c +++ b/drivers/platform/x86/dell-wmi.c @@ -672,7 +672,7 @@ static int dell_wmi_events_set_enabled(bool enable) return dell_smbios_error(ret); } -static int dell_wmi_probe(struct wmi_device *wdev) +static int dell_wmi_probe(struct wmi_device *wdev, const void *context) { struct dell_wmi_priv *priv; int ret; diff --git a/drivers/platform/x86/huawei-wmi.c b/drivers/platform/x86/huawei-wmi.c index 52fcac5b393a..195a7f3638cb 100644 --- a/drivers/platform/x86/huawei-wmi.c +++ b/drivers/platform/x86/huawei-wmi.c @@ -166,7 +166,7 @@ static int huawei_wmi_input_setup(struct wmi_device *wdev) return input_register_device(priv->idev); } -static int huawei_wmi_probe(struct wmi_device *wdev) +static int huawei_wmi_probe(struct wmi_device *wdev, const void *context) { struct huawei_wmi_priv *priv; int err; diff --git a/drivers/platform/x86/intel-wmi-thunderbolt.c b/drivers/platform/x86/intel-wmi-thunderbolt.c index 4dfa61434a76..974c22a7ff61 100644 --- a/drivers/platform/x86/intel-wmi-thunderbolt.c +++ b/drivers/platform/x86/intel-wmi-thunderbolt.c @@ -56,7 +56,8 @@ static const struct attribute_group tbt_attribute_group = { .attrs = tbt_attrs, }; -static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev) +static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev, + const void *context) { int ret; diff --git a/drivers/platform/x86/wmi-bmof.c b/drivers/platform/x86/wmi-bmof.c index 8751a13134be..105a82b6b076 100644 --- a/drivers/platform/x86/wmi-bmof.c +++ b/drivers/platform/x86/wmi-bmof.c @@ -54,7 +54,7 @@ read_bmof(struct file *filp, struct kobject *kobj, return count; } -static int wmi_bmof_probe(struct wmi_device *wdev) +static int wmi_bmof_probe(struct wmi_device *wdev, const void *context) { struct bmof_priv *priv; int ret; diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 
f3be1c008856..2163fd8bf9e1 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -945,7 +945,8 @@ static int wmi_dev_probe(struct device *dev) dev_warn(dev, "failed to enable device -- probing anyway\n"); if (wdriver->probe) { - ret = wdriver->probe(dev_to_wdev(dev)); + ret = wdriver->probe(dev_to_wdev(dev), + find_guid_context(wblock, wdriver)); if (ret != 0) goto probe_failure; } diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 592f81afecbb..1e84c474a993 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -44,7 +44,7 @@ struct wmi_driver { struct device_driver driver; const struct wmi_device_id *id_table; - int (*probe)(struct wmi_device *wdev); + int (*probe)(struct wmi_device *wdev, const void *context); int (*remove)(struct wmi_device *wdev); void (*notify)(struct wmi_device *device, union acpi_object *data); long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd, -- cgit v1.2.3-59-g8ed1b From 99600fd47eafd20b9ba6e04562bb2fcc48475344 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 22 Apr 2019 07:15:05 +0800 Subject: clk: Add CLK_HW_INIT_* macros using .parent_hws With the new clk parenting code, struct clk_init_data was expanded to include .parent_hws, for clk drivers to directly list parents by pointing to their respective struct clk_hw's. Add macros that can take either one single struct clk_hw *, or an array of them, for drivers to use. A special CLK_HW_INIT_HWS macro is included, which takes an array of struct clk_hw *, but sets .num_parents to 1. This variant is to allow the reuse of the array, instead of having a compound literal allocated for each clk sharing the same parent. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index bb6118f79784..70aad5cefea7 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -904,6 +904,29 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_HW_INIT_HW(_name, _parent, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_hws = (const struct clk_hw*[]) { _parent }, \ + .num_parents = 1, \ + .ops = _ops, \ + }) + +/* + * This macro is intended for drivers to be able to share the otherwise + * individual struct clk_hw[] compound literals created by the compiler + * when using CLK_HW_INIT_HW. It does NOT support multiple parents. 
+ */ +#define CLK_HW_INIT_HWS(_name, _parent, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_hws = _parent, \ + .num_parents = 1, \ + .ops = _ops, \ + }) + #define CLK_HW_INIT_PARENTS(_name, _parents, _ops, _flags) \ (&(struct clk_init_data) { \ .flags = _flags, \ @@ -913,6 +936,15 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_HW_INIT_PARENTS_HW(_name, _parents, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_hws = _parents, \ + .num_parents = ARRAY_SIZE(_parents), \ + .ops = _ops, \ + }) + #define CLK_HW_INIT_NO_PARENT(_name, _ops, _flags) \ (&(struct clk_init_data) { \ .flags = _flags, \ -- cgit v1.2.3-59-g8ed1b From 2d6b4f33e637bf51c50c536966a19e94a59f3212 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 3 May 2019 11:49:03 +0800 Subject: clk: Add CLK_HW_INIT_FW_NAME macro using .fw_name in .parent_data With the new clk parenting code, clk_init_data was expanded to include .parent_data, for clk drivers that have parents referenced using a combination of device tree clock-names, clock indices, and/or clk_hw pointers. Add a CLK_HW_INIT_FW_NAME macro for specifying a single parent from the device tree using .fw_name in struct clk_parent_data. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 70aad5cefea7..b19063512a29 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -927,6 +927,17 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_HW_INIT_FW_NAME(_name, _parent, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_data = (const struct clk_parent_data[]) { \ + { .fw_name = _parent }, \ + }, \ + .num_parents = 1, \ + .ops = _ops, \ + }) + #define CLK_HW_INIT_PARENTS(_name, _parents, _ops, _flags) \ (&(struct clk_init_data) { \ .flags = _flags, \ -- cgit v1.2.3-59-g8ed1b From 13933109dff0a5abbfc3980304c6c21c90829810 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 22 Apr 2019 07:17:50 +0800 Subject: clk: Add CLK_HW_INIT_PARENT_DATA macro using .parent_data With the new clk parenting code, struct clk_init_data was expanded to include .parent_data, for clk drivers that have parents referenced using a combination of device tree clock-names, clock indices, and/or struct clk_hw pointers. Add a new macro, CLK_HW_INIT_PARENTS_DATA, that can take a list of struct clk_parent_data for drivers to use.
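A sketch of how a provider might use these initializers; the clock names and the empty ops are placeholders rather than code from this series:

#include <linux/clk-provider.h>

static const struct clk_ops example_ops;        /* placeholder; a real driver supplies ops */

static struct clk_hw osc_hw;                    /* assume this clk is registered elsewhere */

/* single parent by clk_hw pointer; one compound literal per clk */
static struct clk_hw bus_hw = {
        .init = CLK_HW_INIT_HW("bus", &osc_hw, &example_ops, 0),
};

/* shared single-entry parent array, reusable by many clks */
static const struct clk_hw *osc_parent[] = { &osc_hw };
static struct clk_hw uart_hw = {
        .init = CLK_HW_INIT_HWS("uart", osc_parent, &example_ops, 0),
};

/* parent resolved through the DT clock-names entry "xtal" */
static struct clk_hw periph_hw = {
        .init = CLK_HW_INIT_FW_NAME("periph", "xtal", &example_ops, 0),
};

/* mixed parent descriptions through clk_parent_data */
static const struct clk_parent_data mux_parents[] = {
        { .fw_name = "xtal" },
        { .hw = &osc_hw },
};
static struct clk_hw mux_hw = {
        .init = CLK_HW_INIT_PARENTS_DATA("mux", mux_parents, &example_ops, 0),
};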
Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index b19063512a29..0fd14c4874d6 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -956,6 +956,15 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_HW_INIT_PARENTS_DATA(_name, _parents, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_data = _parents, \ + .num_parents = ARRAY_SIZE(_parents), \ + .ops = _ops, \ + }) + #define CLK_HW_INIT_NO_PARENT(_name, _ops, _flags) \ (&(struct clk_init_data) { \ .flags = _flags, \ -- cgit v1.2.3-59-g8ed1b From d7b15114aba956ca395ec5cc28f68fe861ffc208 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 22 Apr 2019 07:19:46 +0800 Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_HW which takes clk_hw pointer as parent With the new clk parenting code, clk_init_data was expanded to include .parent_hws, for clk drivers to directly reference parents by clk_hw. Add a new macro, CLK_FIXED_FACTOR_HW, that can take a struct clk_hw pointer, instead of a string, as its parent. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 0fd14c4874d6..c85e9f3809f2 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -985,6 +985,17 @@ extern struct of_device_id __clk_of_table; _flags), \ } +#define CLK_FIXED_FACTOR_HW(_struct, _name, _parent, \ + _div, _mult, _flags) \ + struct clk_fixed_factor _struct = { \ + .div = _div, \ + .mult = _mult, \ + .hw.init = CLK_HW_INIT_HW(_name, \ + _parent, \ + &clk_fixed_factor_ops, \ + _flags), \ + } + #ifdef CONFIG_OF int of_clk_add_provider(struct device_node *np, struct clk *(*clk_src_get)(struct of_phandle_args *args, -- cgit v1.2.3-59-g8ed1b From 1bef004e2680511ecbb6b5db3954fba430501ecb Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 6 May 2019 10:43:16 +0800 Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_HWS which takes list of struct clk_hw * With the new clk parenting code, clk_init_data was expanded to include .parent_hws, for clk drivers to directly reference parents by clk_hw. Add a new macro, CLK_FIXED_FACTOR_HWS, that can take an array of pointers to struct clk_hw, instead of a string, as its parent. Taking an array instead of a direct pointer allows the reuse of the array for multiple clks, rather than having one compound literal with the same contents allocated for each clk declaration. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index c85e9f3809f2..146a6859969e 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -996,6 +996,21 @@ extern struct of_device_id __clk_of_table; _flags), \ } +/* + * This macro allows the driver to reuse the _parent array for multiple + * fixed factor clk declarations. 
+ */ +#define CLK_FIXED_FACTOR_HWS(_struct, _name, _parent, \ + _div, _mult, _flags) \ + struct clk_fixed_factor _struct = { \ + .div = _div, \ + .mult = _mult, \ + .hw.init = CLK_HW_INIT_HWS(_name, \ + _parent, \ + &clk_fixed_factor_ops, \ + _flags), \ + } + #ifdef CONFIG_OF int of_clk_add_provider(struct device_node *np, struct clk *(*clk_src_get)(struct of_phandle_args *args, -- cgit v1.2.3-59-g8ed1b From 8b13a48b891c7c855e9f3a401d91391a946f4ca7 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 3 May 2019 11:58:20 +0800 Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_FW_NAME for DT clock-names parent With the new clk parenting code, clk_init_data was expanded to include .parent_data, for clk drivers to specify parents using a combination of device tree clock-names, pointers to struct clk_hw, device tree clocks, and/or fallback global clock names. Add a new macro, CLK_FIXED_FACTOR_FW_NAME, that takes a string to match a clock-names entry in the device tree to specify the clock parent. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 146a6859969e..e5c44f6dd897 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1011,6 +1011,17 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_FIXED_FACTOR_FW_NAME(_struct, _name, _parent, \ + _div, _mult, _flags) \ + struct clk_fixed_factor _struct = { \ + .div = _div, \ + .mult = _mult, \ + .hw.init = CLK_HW_INIT_FW_NAME(_name, \ + _parent, \ + &clk_fixed_factor_ops, \ + _flags), \ + } + #ifdef CONFIG_OF int of_clk_add_provider(struct device_node *np, struct clk *(*clk_src_get)(struct of_phandle_args *args, -- cgit v1.2.3-59-g8ed1b From 4eb293487d05a69862a4907ee944aa271ed49a4c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Jun 2019 10:55:32 +0900 Subject: pinctrl: make pinconf.h self-contained This header uses 'bool', but it does not include any header by itself. So, it could cause an unknown type name error, depending on the header include order, although it has probably been included by someone else. Include <linux/types.h> to make it self-contained. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h index 93c9dd133e9d..9bebc3554809 100644 --- a/include/linux/pinctrl/pinconf.h +++ b/include/linux/pinctrl/pinconf.h @@ -14,6 +14,8 @@ #ifdef CONFIG_PINCONF +#include <linux/types.h> + struct pinctrl_dev; struct seq_file; -- cgit v1.2.3-59-g8ed1b From 29875a52915e09abb9703722054f6443cb492ccc Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Fri, 12 Oct 2018 17:06:06 +0200 Subject: mm: Add an apply_to_pfn_range interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is basically apply_to_page_range with added functionality: Allocating missing parts of the page table becomes optional, which means that the function can be guaranteed not to error if allocation is disabled. Also passing of the closure struct and callback function becomes different and more in line with how things are done elsewhere.
Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range. The reason for not using the page-walk code is that we want to perform the page-walk on vmas pointing to an address space without requiring the mmap_sem to be held rather than on vmas belonging to a process with the mmap_sem held. Notable changes since RFC: Don't export apply_to_pfn_range. Cc: Andrew Morton Cc: Matthew Wilcox Cc: Will Deacon Cc: Peter Zijlstra Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Cc: Huang Ying Cc: Souptick Joarder Cc: "Jérôme Glisse" Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Thomas Hellstrom Reviewed-by: Ralph Campbell #v1 --- include/linux/mm.h | 10 ++++ mm/memory.c | 135 ++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 113 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e8834ac32b7..3d06ce2a64af 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +struct pfn_range_apply; +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, + struct pfn_range_apply *closure); +struct pfn_range_apply { + struct mm_struct *mm; + pter_fn_t ptefn; + unsigned int alloc; +}; +extern int apply_to_pfn_range(struct pfn_range_apply *closure, + unsigned long address, unsigned long size); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); diff --git a/mm/memory.c b/mm/memory.c index 168f546af1ad..462aa47f8878 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long } EXPORT_SYMBOL(vm_iomap_memory); -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd, + unsigned long addr, unsigned long end) { pte_t *pte; int err; pgtable_t token; spinlock_t *uninitialized_var(ptl); - pte = (closure->mm == &init_mm) ?
pte_alloc_kernel(pmd, addr) : - pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; @@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, token = pmd_pgtable(*pmd); do { - err = fn(pte++, token, addr, data); + err = closure->ptefn(pte++, token, addr, closure); if (err) break; } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - if (mm != &init_mm) + if (closure->mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; } -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud, + unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; - int err; + int err = 0; BUG_ON(pud_huge(*pud)); - pmd = pmd_alloc(mm, pud, addr); + pmd = pmd_alloc(closure->mm, pud, addr); if (!pmd) return -ENOMEM; + do { next = pmd_addr_end(addr, end); - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (!closure->alloc && pmd_none_or_clear_bad(pmd)) + continue; + err = apply_to_pte_range(closure, pmd, addr, next); if (err) break; } while (pmd++, addr = next, addr != end); return err; } -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d, + unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; - int err; + int err = 0; - pud = pud_alloc(mm, p4d, addr); + pud = pud_alloc(closure->mm, p4d, addr); if (!pud) return -ENOMEM; + do { next = pud_addr_end(addr, end); - err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (!closure->alloc && pud_none_or_clear_bad(pud)) + continue; + err = apply_to_pmd_range(closure, pud, addr, next); if (err) break; } while (pud++, addr = next, addr != end); return err; } -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd, + unsigned long addr, unsigned long end) { p4d_t *p4d; unsigned long next; - int err; + int err = 0; - p4d = p4d_alloc(mm, pgd, addr); + p4d = p4d_alloc(closure->mm, pgd, addr); if (!p4d) return -ENOMEM; + do { next = p4d_addr_end(addr, end); - err = apply_to_pud_range(mm, p4d, addr, next, fn, data); + if (!closure->alloc && p4d_none_or_clear_bad(p4d)) + continue; + err = apply_to_pud_range(closure, p4d, addr, next); if (err) break; } while (p4d++, addr = next, addr != end); return err; } -/* - * Scan a region of virtual memory, filling in page tables as necessary - * and calling a provided function on each leaf page table. +/** + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided + * function on each leaf page table entry + * @closure: Details about how to scan and what function to apply + * @addr: Start virtual address + * @size: Size of the region + * + * If @closure->alloc is set to 1, the function will fill in the page table + * as necessary. Otherwise it will skip non-present parts. + * Note: The caller must ensure that the range does not contain huge pages. + * The caller must also assure that the proper mmu_notifier functions are + * called before and after the call to apply_to_pfn_range. + * + * WARNING: Do not use this function unless you know exactly what you are + * doing. 
It is lacking support for huge pages and transparent huge pages. + * + * Return: Zero on success. If the provided function returns a non-zero status, + * the page table walk will terminate and that status will be returned. + * If @closure->alloc is set to 1, then this function may also return memory + * allocation errors arising from allocating page table memory. */ -int apply_to_page_range(struct mm_struct *mm, unsigned long addr, - unsigned long size, pte_fn_t fn, void *data) +int apply_to_pfn_range(struct pfn_range_apply *closure, + unsigned long addr, unsigned long size) { pgd_t *pgd; unsigned long next; @@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (WARN_ON(addr >= end)) return -EINVAL; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(closure->mm, addr); do { next = pgd_addr_end(addr, end); - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); + if (!closure->alloc && pgd_none_or_clear_bad(pgd)) + continue; + err = apply_to_p4d_range(closure, pgd, addr, next); if (err) break; } while (pgd++, addr = next, addr != end); return err; } + +/** + * struct page_range_apply - Closure structure for apply_to_page_range() + * @pter: The base closure structure we derive from + * @fn: The leaf pte function to call + * @data: The leaf pte function closure + */ +struct page_range_apply { + struct pfn_range_apply pter; + pte_fn_t fn; + void *data; +}; + +/* + * Callback wrapper to enable use of apply_to_pfn_range for + * the apply_to_page_range interface + */ +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *pter) +{ + struct page_range_apply *pra = + container_of(pter, typeof(*pra), pter); + + return pra->fn(pte, token, addr, pra->data); +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + * + * WARNING: Do not use this function unless you know exactly what you are + * doing. It is lacking support for huge pages and transparent huge pages. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + struct page_range_apply pra = { + .pter = {.mm = mm, + .alloc = 1, + .ptefn = apply_to_page_range_wrapper }, + .fn = fn, + .data = data + }; + + return apply_to_pfn_range(&pra.pter, addr, size); +} EXPORT_SYMBOL_GPL(apply_to_page_range); /* -- cgit v1.2.3-59-g8ed1b From 4fe51e9e7902b5724b618dadd9527b1bbf2b55cc Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Tue, 19 Mar 2019 13:12:30 +0100 Subject: mm: Add write-protect and clean utilities for address space ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two utilities to a) write-protect and b) clean all ptes pointing into a range of an address space. The utilities are intended to aid in tracking dirty pages (either driver-allocated system memory or pci device memory). The write-protect utility should be used in conjunction with page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page accesses. Typically one would want to use this on sparse accesses into large memory regions. The clean utility should be used to utilize hardware dirtying functionality and avoid the overhead of page-faults, typically on large accesses into small memory regions. The added file "as_dirty_helpers.c" is initially listed as maintained by VMware under our DRM driver. If somebody would like it elsewhere, that's of course no problem. 
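A rough usage sketch of the two helpers being added; the window size, bitmap, and wrapper function are hypothetical, and only the two exported signatures come from this patch:

#include <linux/bitops.h>
#include <linux/mm.h>

#define EX_NR_PAGES 256 /* hypothetical tracking window, in pages */

static unsigned long ex_bitmap[BITS_TO_LONGS(EX_NR_PAGES)];

static void example_harvest_dirty(struct address_space *mapping,
                                  pgoff_t first_index)
{
        /* start >= end signals "no bits set yet" to apply_as_clean() */
        pgoff_t start = EX_NR_PAGES, end = 0;

        /* route future writes through page_mkwrite()/pfn_mkwrite() */
        apply_as_wrprotect(mapping, first_index, EX_NR_PAGES);

        /* collect hardware dirty bits without taking write faults */
        apply_as_clean(mapping, first_index, EX_NR_PAGES, first_index,
                       ex_bitmap, &start, &end);

        /* bits [start, end) of ex_bitmap now mark dirty page offsets */
}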
Notable changes since RFC: - Added comments to help avoid the usage of these function for VMAs it's not intended for. We also do advisory checks on the vm_flags and warn on illegal usage. - Perform the pte modifications the same way softdirty does. - Add mmu_notifier range invalidation calls. - Add a config option so that this code is not unconditionally included. - Tell the mmu_gather code about pending tlb flushes. Cc: Andrew Morton Cc: Matthew Wilcox Cc: Will Deacon Cc: Peter Zijlstra Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Cc: Huang Ying Cc: Souptick Joarder Cc: "Jérôme Glisse" Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Thomas Hellstrom Reviewed-by: Ralph Campbell #v1 --- MAINTAINERS | 1 + include/linux/mm.h | 9 +- mm/Kconfig | 3 + mm/Makefile | 1 + mm/as_dirty_helpers.c | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 mm/as_dirty_helpers.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 7a2f487ea49a..a55d4ef91b0b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5179,6 +5179,7 @@ T: git git://people.freedesktop.org/~thomash/linux S: Supported F: drivers/gpu/drm/vmwgfx/ F: include/uapi/drm/vmwgfx_drm.h +F: mm/as_dirty_helpers.c DRM DRIVERS M: David Airlie diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d06ce2a64af..a0bc2a82917e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2685,7 +2685,14 @@ struct pfn_range_apply { }; extern int apply_to_pfn_range(struct pfn_range_apply *closure, unsigned long address, unsigned long size); - +unsigned long apply_as_wrprotect(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr); +unsigned long apply_as_clean(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); diff --git a/mm/Kconfig b/mm/Kconfig index f0c76ba47695..5006d0e6a5c7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -765,4 +765,7 @@ config GUP_BENCHMARK config ARCH_HAS_PTE_SPECIAL bool +config AS_DIRTY_HELPERS + bool + endmenu diff --git a/mm/Makefile b/mm/Makefile index ac5e5ba78874..f5d412bbc2f7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c new file mode 100644 index 000000000000..f600e31534fb --- /dev/null +++ b/mm/as_dirty_helpers.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +/** + * struct apply_as - Closure structure for apply_as_range + * @base: struct pfn_range_apply we derive from + * @start: Address of first modified pte + * @end: Address of last modified pte + 1 + * @total: Total number of modified ptes + * @vma: Pointer to the struct vm_area_struct we're currently operating on + */ +struct apply_as { + struct pfn_range_apply base; + unsigned long start; + unsigned long end; + unsigned long total; + struct vm_area_struct *vma; +}; + +/** + * apply_pt_wrprotect - Leaf pte callback to write-protect a pte + * @pte: Pointer to the pte + * @token: Page table token, see apply_to_pfn_range() + * @addr: The virtual 
page address + * @closure: Pointer to a struct pfn_range_apply embedded in a + * struct apply_as + * + * The function write-protects a pte and records the range in + * virtual address space of touched ptes for efficient range TLB flushes. + * + * Return: Always zero. + */ +static int apply_pt_wrprotect(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *closure) +{ + struct apply_as *aas = container_of(closure, typeof(*aas), base); + pte_t ptent = *pte; + + if (pte_write(ptent)) { + pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte); + + ptent = pte_wrprotect(old_pte); + ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent); + aas->total++; + aas->start = min(aas->start, addr); + aas->end = max(aas->end, addr + PAGE_SIZE); + } + + return 0; +} + +/** + * struct apply_as_clean - Closure structure for apply_as_clean + * @base: struct apply_as we derive from + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap + * @bitmap: Bitmap with one bit for each page offset in the address_space range + * covered. + * @start: Address_space page offset of first modified pte relative + * to @bitmap_pgoff + * @end: Address_space page offset of last modified pte relative + * to @bitmap_pgoff + */ +struct apply_as_clean { + struct apply_as base; + pgoff_t bitmap_pgoff; + unsigned long *bitmap; + pgoff_t start; + pgoff_t end; +}; + +/** + * apply_pt_clean - Leaf pte callback to clean a pte + * @pte: Pointer to the pte + * @token: Page table token, see apply_to_pfn_range() + * @addr: The virtual page address + * @closure: Pointer to a struct pfn_range_apply embedded in a + * struct apply_as_clean + * + * The function cleans a pte and records the range in + * virtual address space of touched ptes for efficient TLB flushes. + * It also records dirty ptes in a bitmap representing page offsets + * in the address_space, as well as the first and last of the bits + * touched. + * + * Return: Always zero. + */ +static int apply_pt_clean(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *closure) +{ + struct apply_as *aas = container_of(closure, typeof(*aas), base); + struct apply_as_clean *clean = container_of(aas, typeof(*clean), base); + pte_t ptent = *pte; + + if (pte_dirty(ptent)) { + pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) + + aas->vma->vm_pgoff - clean->bitmap_pgoff; + pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte); + + ptent = pte_mkclean(old_pte); + ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent); + + aas->total++; + aas->start = min(aas->start, addr); + aas->end = max(aas->end, addr + PAGE_SIZE); + + __set_bit(pgoff, clean->bitmap); + clean->start = min(clean->start, pgoff); + clean->end = max(clean->end, pgoff + 1); + } + + return 0; +} + +/** + * apply_as_range - Apply a pte callback to all PTEs pointing into a range + * of an address_space. + * @mapping: Pointer to the struct address_space + * @aas: Closure structure + * @first_index: First page offset in the address_space + * @nr: Number of incremental page offsets to cover + * + * Return: Number of ptes touched. 
Note that this number might be larger + * than @nr if there are overlapping vmas + */ +static unsigned long apply_as_range(struct address_space *mapping, + struct apply_as *aas, + pgoff_t first_index, pgoff_t nr) +{ + struct vm_area_struct *vma; + pgoff_t vba, vea, cba, cea; + unsigned long start_addr, end_addr; + struct mmu_notifier_range range; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, + first_index + nr - 1) { + unsigned long vm_flags = READ_ONCE(vma->vm_flags); + + /* + * We can only do advisory flag tests below, since we can't + * require the vm's mmap_sem to be held to protect the flags. + * Therefore, callers that strictly depend on specific mmap + * flags to remain constant throughout the operation must + * either ensure those flags are immutable for all relevant + * vmas or can't use this function. Fixing this properly would + * require the vma::vm_flags to be protected by a separate + * lock taken after the i_mmap_lock + */ + + /* Skip non-applicable VMAs */ + if ((vm_flags & (VM_SHARED | VM_WRITE)) != + (VM_SHARED | VM_WRITE)) + continue; + + /* Warn on and skip VMAs whose flags indicate illegal usage */ + if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO)) + continue; + + /* Clip to the vma */ + vba = vma->vm_pgoff; + vea = vba + vma_pages(vma); + cba = first_index; + cba = max(cba, vba); + cea = first_index + nr; + cea = min(cea, vea); + + /* Translate to virtual address */ + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; + if (start_addr >= end_addr) + continue; + + aas->base.mm = vma->vm_mm; + aas->vma = vma; + aas->start = end_addr; + aas->end = start_addr; + + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, + vma, vma->vm_mm, start_addr, end_addr); + mmu_notifier_invalidate_range_start(&range); + + /* Needed when we only change protection? */ + flush_cache_range(vma, start_addr, end_addr); + + /* + * We're not using tlb_gather_mmu() since typically + * only a small subrange of PTEs are affected. + */ + inc_tlb_flush_pending(vma->vm_mm); + + /* Should not error since aas->base.alloc == 0 */ + WARN_ON(apply_to_pfn_range(&aas->base, start_addr, + end_addr - start_addr)); + if (aas->end > aas->start) + flush_tlb_range(vma, aas->start, aas->end); + + mmu_notifier_invalidate_range_end(&range); + dec_tlb_flush_pending(vma->vm_mm); + } + i_mmap_unlock_read(mapping); + + return aas->total; +} + +/** + * apply_as_wrprotect - Write-protect all ptes in an address_space range + * @mapping: The address_space we want to write protect + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * + * WARNING: This function should only be used for address spaces whose + * vmas are marked VM_IO and that do not contain huge pages. + * To avoid interference with COW'd pages, vmas not marked VM_SHARED are + * simply skipped. + * + * Return: The number of ptes actually write-protected. Note that + * already write-protected ptes are not counted. 
+ */ +unsigned long apply_as_wrprotect(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr) +{ + struct apply_as aas = { + .base = { + .alloc = 0, + .ptefn = apply_pt_wrprotect, + }, + .total = 0, + }; + + return apply_as_range(mapping, &aas, first_index, nr); +} +EXPORT_SYMBOL_GPL(apply_as_wrprotect); + +/** + * apply_as_clean - Clean all ptes in an address_space range + * @mapping: The address_space we want to clean + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * @bitmap_pgoff: The page offset of the first bit in @bitmap + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to + * cover the whole range @first_index..@first_index + @nr. + * @start: Pointer to number of the first set bit in @bitmap. + * is modified as new bits are set by the function. + * @end: Pointer to the number of the last set bit in @bitmap. + * none set. The value is modified as new bits are set by the function. + * + * Note: When this function returns there is no guarantee that a CPU has + * not already dirtied new ptes. However it will not clean any ptes not + * reported in the bitmap. + * + * If a caller needs to make sure all dirty ptes are picked up and none + * additional are added, it first needs to write-protect the address-space + * range and make sure new writers are blocked in page_mkwrite() or + * pfn_mkwrite(). And then after a TLB flush following the write-protection + * pick up all dirty bits. + * + * WARNING: This function should only be used for address spaces whose + * vmas are marked VM_IO and that do not contain huge pages. + * To avoid interference with COW'd pages, vmas not marked VM_SHARED are + * simply skipped. + * + * Return: The number of dirty ptes actually cleaned. + */ +unsigned long apply_as_clean(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end) +{ + bool none_set = (*start >= *end); + struct apply_as_clean clean = { + .base = { + .base = { + .alloc = 0, + .ptefn = apply_pt_clean, + }, + .total = 0, + }, + .bitmap_pgoff = bitmap_pgoff, + .bitmap = bitmap, + .start = none_set ? nr : *start, + .end = none_set ? 0 : *end, + }; + unsigned long ret = apply_as_range(mapping, &clean.base, first_index, + nr); + + *start = clean.start; + *end = clean.end; + return ret; +} +EXPORT_SYMBOL_GPL(apply_as_clean); -- cgit v1.2.3-59-g8ed1b From 378a60406415bd20ec6e845a3d6883d460656537 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 May 2019 11:17:22 -0300 Subject: mm/hmm: Remove duplicate condition test before wait_event_timeout The wait_event_timeout macro already tests the condition as its first action, so there is no reason to open code another version of this, all that does is skip the might_sleep() debugging in common cases, which is not helpful. Further, based on prior patches, we can now simplify the required condition test: - If range is valid memory then so is range->hmm - If hmm_release() has run then range->valid is set to false at the same time as dead, so no reason to check both. - A valid hmm has a valid hmm->mm. Allowing the return value of wait_event_timeout() (along with its internal barriers) to compute the result of the function. 
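For reference, the pattern relied on here, sketched as a generic helper (the wrapper and its arguments are illustrative): wait_event_timeout() evaluates the condition before sleeping, and its return value is nonzero exactly when the condition ended up true.

#include <linux/jiffies.h>
#include <linux/wait.h>

/* forward wait_event_timeout()'s own condition evaluation as the result */
static bool example_wait_for_flag(wait_queue_head_t *wq, bool *flag,
                                  unsigned long timeout_ms)
{
        return wait_event_timeout(*wq, READ_ONCE(*flag),
                                  msecs_to_jiffies(timeout_ms)) != 0;
}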
Signed-off-by: Jason Gunthorpe Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Reviewed-by: Ira Weiny Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 1d97b6d62c5b..26e7c477490c 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -209,17 +209,8 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) static inline bool hmm_range_wait_until_valid(struct hmm_range *range, unsigned long timeout) { - /* Check if mm is dead ? */ - if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { - range->valid = false; - return false; - } - if (range->valid) - return true; - wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, - msecs_to_jiffies(timeout)); - /* Return current valid status just in case we get lucky */ - return range->valid; + return wait_event_timeout(range->hmm->wq, range->valid, + msecs_to_jiffies(timeout)) != 0; } /* -- cgit v1.2.3-59-g8ed1b From 47f245985a4f3e270b1e4f28aa49f4c939527981 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 May 2019 11:08:28 -0300 Subject: mm/hmm: Hold on to the mmget for the lifetime of the range Range functions like hmm_range_snapshot() and hmm_range_fault() call find_vma, which requires holding the mmget() and the mmap_sem for the mm. Make this simpler for the callers by holding the mmget() inside the range for the lifetime of the range. Other functions that accept a range should only be called if the range is registered. This has the side effect of directly preventing hmm_release() from happening while a range is registered. That means hmm->dead can never become true during the lifetime of the range, so remove dead and hmm_mirror_mm_is_alive() entirely. Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Reviewed-by: Ralph Campbell Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 26 -------------------------- mm/hmm.c | 32 +++++++++++--------------------- 2 files changed, 11 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 26e7c477490c..bf013e965257 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -82,7 +82,6 @@ * @mirrors_sem: read/write semaphore protecting the mirrors list * @wq: wait queue for user waiting on a range invalidation * @notifiers: count of active mmu notifiers - * @dead: is the mm dead ? */ struct hmm { struct mm_struct *mm; @@ -95,7 +94,6 @@ struct hmm { wait_queue_head_t wq; struct rcu_head rcu; long notifiers; - bool dead; }; /* @@ -459,30 +457,6 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); -/* - * hmm_mirror_mm_is_alive() - test if mm is still alive - * @mirror: the HMM mm mirror for which we want to lock the mmap_sem - * Return: false if the mm is dead, true otherwise - * - * This is an optimization, it will not always accurately return false if the - * mm is dead; i.e., there can be false negatives (process is being killed but - * HMM is not yet informed of that). It is only intended to be used to optimize - * out cases where the driver is about to do something time consuming and it - * would be better to skip it if the mm is dead.
- */ -static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) -{ - struct mm_struct *mm; - - if (!mirror || !mirror->hmm) - return false; - mm = READ_ONCE(mirror->hmm->mm); - if (mirror->hmm->dead || !mm) - return false; - - return true; -} - /* * Please see Documentation/vm/hmm.rst for how to use the range API. */ diff --git a/mm/hmm.c b/mm/hmm.c index 73c8af4827fe..1eddda45cefa 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -67,7 +67,6 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) mutex_init(&hmm->lock); kref_init(&hmm->kref); hmm->notifiers = 0; - hmm->dead = false; hmm->mm = mm; hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; @@ -120,21 +119,16 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - struct hmm_range *range; /* Bail out if hmm is in the process of being freed */ if (!kref_get_unless_zero(&hmm->kref)) return; - /* Report this HMM as dying. */ - hmm->dead = true; - - /* Wake-up everyone waiting on any range. */ - mutex_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) - range->valid = false; - wake_up_all(&hmm->wq); - mutex_unlock(&hmm->lock); + /* + * Since hmm_range_register() holds the mmget() lock hmm_release() is + * prevented as long as a range exists. + */ + WARN_ON(!list_empty_careful(&hmm->ranges)); down_write(&hmm->mirrors_sem); mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, @@ -903,8 +897,8 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) + /* Prevent hmm_release() from running while the range is valid */ + if (!mmget_not_zero(hmm->mm)) return -EFAULT; /* Initialize range to track CPU page table updates. */ @@ -942,11 +936,12 @@ void hmm_range_unregister(struct hmm_range *range) return; mutex_lock(&hmm->lock); - list_del(&range->list); + list_del_init(&range->list); mutex_unlock(&hmm->lock); /* Drop reference taken by hmm_range_register() */ range->valid = false; + mmput(hmm->mm); hmm_put(hmm); range->hmm = NULL; } @@ -974,10 +969,7 @@ long hmm_range_snapshot(struct hmm_range *range) struct vm_area_struct *vma; struct mm_walk mm_walk; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; - + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. */ if (!range->valid) @@ -1072,9 +1064,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) struct mm_walk mm_walk; int ret; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. */ -- cgit v1.2.3-59-g8ed1b From a78cf9657ba5426f54aa93a067c10d097944c082 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 15 Jun 2019 10:23:57 +1000 Subject: PCI/ACPI: Evaluate PCI Boot Configuration _DSM Evaluate _DSM Function #5, the "PCI Boot Configuration" function. If the result is 0, the OS should preserve any resource assignments made by the firmware. 
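A minimal sketch of that check (the wrapper name is invented; the real hunk added to pci_root.c follows in the diff below):

#include <linux/acpi.h>
#include <linux/pci-acpi.h>

static bool example_preserve_boot_config(acpi_handle handle)
{
        union acpi_object *obj;
        bool preserve;

        /* revision 1, function 5: "PCI Boot Configuration" */
        obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 1,
                                IGNORE_PCI_BOOT_CONFIG_DSM, NULL);
        preserve = obj && obj->type == ACPI_TYPE_INTEGER &&
                   obj->integer.value == 0;
        ACPI_FREE(obj);
        return preserve;
}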
Link: https://lore.kernel.org/r/20190615002359.29577-2-benh@kernel.crashing.org Signed-off-by: Benjamin Herrenschmidt [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas --- drivers/acpi/pci_root.c | 12 ++++++++++++ include/linux/pci-acpi.h | 7 ++++--- include/linux/pci.h | 2 ++ 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index c36781a9b493..0d57f817ef1e 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -894,6 +894,7 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, int node = acpi_get_node(device->handle); struct pci_bus *bus; struct pci_host_bridge *host_bridge; + union acpi_object *obj; info->root = root; info->bridge = device; @@ -930,6 +931,17 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, if (!(root->osc_control_set & OSC_PCI_EXPRESS_LTR_CONTROL)) host_bridge->native_ltr = 0; + /* + * Evaluate the "PCI Boot Configuration" _DSM Function. If it + * exists and returns 0, we must preserve any PCI resource + * assignments made by firmware for this host bridge. + */ + obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 1, + IGNORE_PCI_BOOT_CONFIG_DSM, NULL); + if (obj && obj->type == ACPI_TYPE_INTEGER && obj->integer.value == 0) + host_bridge->preserve_config = 1; + ACPI_FREE(obj); + pci_scan_child_bus(bus); pci_set_host_bridge_release(host_bridge, acpi_pci_root_release_info, info); diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 8082b612f561..62b7fdcc661c 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -107,9 +107,10 @@ static inline void acpiphp_check_host_bridge(struct acpi_device *adev) { } #endif extern const guid_t pci_acpi_dsm_guid; -#define DEVICE_LABEL_DSM 0x07 -#define RESET_DELAY_DSM 0x08 -#define FUNCTION_DELAY_DSM 0x09 +#define IGNORE_PCI_BOOT_CONFIG_DSM 0x05 +#define DEVICE_LABEL_DSM 0x07 +#define RESET_DELAY_DSM 0x08 +#define FUNCTION_DELAY_DSM 0x09 #else /* CONFIG_ACPI */ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..5e2b309363a3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -505,6 +505,8 @@ struct pci_host_bridge { unsigned int native_shpc_hotplug:1; /* OS may use SHPC hotplug */ unsigned int native_pme:1; /* OS may use PCIe PME */ unsigned int native_ltr:1; /* OS may use PCIe LTR */ + unsigned int preserve_config:1; /* Preserve FW resource setup */ + /* Resource alignment requirements */ resource_size_t (*align_resource)(struct pci_dev *dev, const struct resource *res, -- cgit v1.2.3-59-g8ed1b From d308dfbf62eff897d71968d764f21a78678ee0a5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 18 Jun 2019 12:58:33 +0200 Subject: i2c: mux/i801: Switch to use descriptor passing This switches the i801 GPIO mux to use GPIO descriptors for handling the GPIO lines. The previous hack which was reaching inside the GPIO chips etc cannot live on. We pass descriptors along with the GPIO mux device at creation instead. The GPIO mux was only used by way of platform data with a platform device from one place in the kernel: the i801 i2c bus driver. Let's just associate the GPIO descriptor table with the actual device like everyone else and dynamically create a descriptor table passed along with the GPIO i2c mux. This enables simplification of the GPIO i2c mux driver to use only the descriptor API and the OF probe path gets simplified in the process. 
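For illustration, the equivalent static lookup table a board file might register; the chip label, pin numbers, and helper name are invented here, while the i801 hunk below builds such a table at runtime:

#include <linux/gpio/machine.h>

static struct gpiod_lookup_table example_mux_gpios = {
        .dev_id = "i2c-mux-gpio",       /* matches PLATFORM_DEVID_NONE naming */
        .table = {
                GPIO_LOOKUP_IDX("gpio_ich", 1, "mux", 0, GPIO_ACTIVE_HIGH),
                GPIO_LOOKUP_IDX("gpio_ich", 2, "mux", 1, GPIO_ACTIVE_HIGH),
                { }
        },
};

/* board code, run before the mux platform device is registered */
static void example_register_mux_gpios(void)
{
        gpiod_add_lookup_table(&example_mux_gpios);
}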
The i801 driver was registering the GPIO i2c mux with PLATFORM_DEVID_AUTO which would make it hard to predict the device name and assign the descriptor table properly, but this seems to be a mistake to begin with: all of the GPIO mux devices are hardcoded to look up GPIO lines from the "gpio_ich" GPIO chip. If there are more than one mux, there is certainly more than one gpio chip as well, and then we have more serious problems. Switch to PLATFORM_DEVID_NONE instead. There can be only one. Cc: Mika Westerberg Cc: Andy Shevchenko Cc: Peter Rosin Cc: Jean Delvare Signed-off-by: Serge Semin Signed-off-by: Linus Walleij Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg [Removed a newline, suggested by Andy. /Peter] Signed-off-by: Peter Rosin --- drivers/i2c/busses/i2c-i801.c | 37 +++++++-- drivers/i2c/muxes/i2c-mux-gpio.c | 116 ++++++++--------------------- include/linux/platform_data/i2c-mux-gpio.h | 7 -- 3 files changed, 60 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 679c6c41f64b..bf484cd775ec 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -107,7 +107,7 @@ #include #if IS_ENABLED(CONFIG_I2C_MUX_GPIO) && defined CONFIG_DMI -#include +#include #include #endif @@ -274,6 +274,7 @@ struct i801_priv { #if IS_ENABLED(CONFIG_I2C_MUX_GPIO) && defined CONFIG_DMI const struct i801_mux_config *mux_drvdata; struct platform_device *mux_pdev; + struct gpiod_lookup_table *lookup; #endif struct platform_device *tco_pdev; @@ -1258,7 +1259,8 @@ static int i801_add_mux(struct i801_priv *priv) struct device *dev = &priv->adapter.dev; const struct i801_mux_config *mux_config; struct i2c_mux_gpio_platform_data gpio_data; - int err; + struct gpiod_lookup_table *lookup; + int err, i; if (!priv->mux_drvdata) return 0; @@ -1270,17 +1272,36 @@ static int i801_add_mux(struct i801_priv *priv) gpio_data.values = mux_config->values; gpio_data.n_values = mux_config->n_values; gpio_data.classes = mux_config->classes; - gpio_data.gpio_chip = mux_config->gpio_chip; - gpio_data.gpios = mux_config->gpios; - gpio_data.n_gpios = mux_config->n_gpios; gpio_data.idle = I2C_MUX_GPIO_NO_IDLE; - /* Register the mux device */ + /* Register GPIO descriptor lookup table */ + lookup = devm_kzalloc(dev, + struct_size(lookup, table, mux_config->n_gpios), + GFP_KERNEL); + if (!lookup) + return -ENOMEM; + lookup->dev_id = "i2c-mux-gpio"; + for (i = 0; i < mux_config->n_gpios; i++) { + lookup->table[i].chip_label = mux_config->gpio_chip; + lookup->table[i].chip_hwnum = mux_config->gpios[i]; + lookup->table[i].con_id = "mux"; + } + gpiod_add_lookup_table(lookup); + priv->lookup = lookup; + + /* + * Register the mux device, we use PLATFORM_DEVID_NONE here + * because since we are referring to the GPIO chip by name we are + * anyways in deep trouble if there is more than one of these + * devices, and there should likely only be one platform controller + * hub. 
+ */ priv->mux_pdev = platform_device_register_data(dev, "i2c-mux-gpio", - PLATFORM_DEVID_AUTO, &gpio_data, + PLATFORM_DEVID_NONE, &gpio_data, sizeof(struct i2c_mux_gpio_platform_data)); if (IS_ERR(priv->mux_pdev)) { err = PTR_ERR(priv->mux_pdev); + gpiod_remove_lookup_table(lookup); priv->mux_pdev = NULL; dev_err(dev, "Failed to register i2c-mux-gpio device\n"); return err; @@ -1293,6 +1314,8 @@ static void i801_del_mux(struct i801_priv *priv) { if (priv->mux_pdev) platform_device_unregister(priv->mux_pdev); + if (priv->lookup) + gpiod_remove_lookup_table(priv->lookup); } static unsigned int i801_get_adapter_class(struct i801_priv *priv) diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c index 13882a2a4f60..fd482feafb19 100644 --- a/drivers/i2c/muxes/i2c-mux-gpio.c +++ b/drivers/i2c/muxes/i2c-mux-gpio.c @@ -14,13 +14,14 @@ #include #include #include -#include +#include +#include +/* FIXME: stop poking around inside gpiolib */ #include "../../gpio/gpiolib.h" -#include struct gpiomux { struct i2c_mux_gpio_platform_data data; - unsigned gpio_base; + int ngpios; struct gpio_desc **gpios; }; @@ -30,8 +31,7 @@ static void i2c_mux_gpio_set(const struct gpiomux *mux, unsigned val) values[0] = val; - gpiod_set_array_value_cansleep(mux->data.n_gpios, mux->gpios, NULL, - values); + gpiod_set_array_value_cansleep(mux->ngpios, mux->gpios, NULL, values); } static int i2c_mux_gpio_select(struct i2c_mux_core *muxc, u32 chan) @@ -52,12 +52,6 @@ static int i2c_mux_gpio_deselect(struct i2c_mux_core *muxc, u32 chan) return 0; } -static int match_gpio_chip_by_label(struct gpio_chip *chip, - void *data) -{ - return !strcmp(chip->label, data); -} - #ifdef CONFIG_OF static int i2c_mux_gpio_probe_dt(struct gpiomux *mux, struct platform_device *pdev) @@ -65,8 +59,8 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux, struct device_node *np = pdev->dev.of_node; struct device_node *adapter_np, *child; struct i2c_adapter *adapter; - unsigned *values, *gpios; - int i = 0, ret; + unsigned *values; + int i = 0; if (!np) return -ENODEV; @@ -103,29 +97,6 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux, if (of_property_read_u32(np, "idle-state", &mux->data.idle)) mux->data.idle = I2C_MUX_GPIO_NO_IDLE; - mux->data.n_gpios = of_gpio_named_count(np, "mux-gpios"); - if (mux->data.n_gpios < 0) { - dev_err(&pdev->dev, "Missing mux-gpios property in the DT.\n"); - return -EINVAL; - } - - gpios = devm_kcalloc(&pdev->dev, - mux->data.n_gpios, sizeof(*mux->data.gpios), - GFP_KERNEL); - if (!gpios) { - dev_err(&pdev->dev, "Cannot allocate gpios array"); - return -ENOMEM; - } - - for (i = 0; i < mux->data.n_gpios; i++) { - ret = of_get_named_gpio(np, "mux-gpios", i); - if (ret < 0) - return ret; - gpios[i] = ret; - } - - mux->data.gpios = gpios; - return 0; } #else @@ -142,8 +113,8 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) struct gpiomux *mux; struct i2c_adapter *parent; struct i2c_adapter *root; - unsigned initial_state, gpio_base; - int i, ret; + unsigned initial_state; + int i, ngpios, ret; mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL); if (!mux) @@ -158,29 +129,19 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) sizeof(mux->data)); } - /* - * If a GPIO chip name is provided, the GPIO pin numbers provided are - * relative to its base GPIO number. Otherwise they are absolute. 
- */ - if (mux->data.gpio_chip) { - struct gpio_chip *gpio; - - gpio = gpiochip_find(mux->data.gpio_chip, - match_gpio_chip_by_label); - if (!gpio) - return -EPROBE_DEFER; - - gpio_base = gpio->base; - } else { - gpio_base = 0; + ngpios = gpiod_count(&pdev->dev, "mux"); + if (ngpios <= 0) { + dev_err(&pdev->dev, "no valid gpios provided\n"); + return ngpios ?: -EINVAL; } + mux->ngpios = ngpios; parent = i2c_get_adapter(mux->data.parent); if (!parent) return -EPROBE_DEFER; muxc = i2c_mux_alloc(parent, &pdev->dev, mux->data.n_values, - mux->data.n_gpios * sizeof(*mux->gpios), 0, + ngpios * sizeof(*mux->gpios), 0, i2c_mux_gpio_select, NULL); if (!muxc) { ret = -ENOMEM; @@ -194,7 +155,6 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) root = i2c_root_adapter(&parent->dev); muxc->mux_locked = true; - mux->gpio_base = gpio_base; if (mux->data.idle != I2C_MUX_GPIO_NO_IDLE) { initial_state = mux->data.idle; @@ -203,34 +163,28 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) initial_state = mux->data.values[0]; } - for (i = 0; i < mux->data.n_gpios; i++) { + for (i = 0; i < ngpios; i++) { struct device *gpio_dev; - struct gpio_desc *gpio_desc; - - ret = gpio_request(gpio_base + mux->data.gpios[i], "i2c-mux-gpio"); - if (ret) { - dev_err(&pdev->dev, "Failed to request GPIO %d\n", - mux->data.gpios[i]); - goto err_request_gpio; + struct gpio_desc *gpiod; + enum gpiod_flags flag; + + if (initial_state & BIT(i)) + flag = GPIOD_OUT_HIGH; + else + flag = GPIOD_OUT_LOW; + gpiod = devm_gpiod_get_index(&pdev->dev, "mux", i, flag); + if (IS_ERR(gpiod)) { + ret = PTR_ERR(gpiod); + goto alloc_failed; } - ret = gpio_direction_output(gpio_base + mux->data.gpios[i], - initial_state & (1 << i)); - if (ret) { - dev_err(&pdev->dev, - "Failed to set direction of GPIO %d to output\n", - mux->data.gpios[i]); - i++; /* gpio_request above succeeded, so must free */ - goto err_request_gpio; - } - - gpio_desc = gpio_to_desc(gpio_base + mux->data.gpios[i]); - mux->gpios[i] = gpio_desc; + mux->gpios[i] = gpiod; if (!muxc->mux_locked) continue; - gpio_dev = &gpio_desc->gdev->dev; + /* FIXME: find a proper way to access the GPIO device */ + gpio_dev = &gpiod->gdev->dev; muxc->mux_locked = i2c_root_adapter(gpio_dev) == root; } @@ -253,10 +207,6 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) add_adapter_failed: i2c_mux_del_adapters(muxc); - i = mux->data.n_gpios; -err_request_gpio: - for (; i > 0; i--) - gpio_free(gpio_base + mux->data.gpios[i - 1]); alloc_failed: i2c_put_adapter(parent); @@ -266,14 +216,8 @@ alloc_failed: static int i2c_mux_gpio_remove(struct platform_device *pdev) { struct i2c_mux_core *muxc = platform_get_drvdata(pdev); - struct gpiomux *mux = i2c_mux_priv(muxc); - int i; i2c_mux_del_adapters(muxc); - - for (i = 0; i < mux->data.n_gpios; i++) - gpio_free(mux->gpio_base + mux->data.gpios[i]); - i2c_put_adapter(muxc->parent); return 0; diff --git a/include/linux/platform_data/i2c-mux-gpio.h b/include/linux/platform_data/i2c-mux-gpio.h index 4406108201fe..28f288eed652 100644 --- a/include/linux/platform_data/i2c-mux-gpio.h +++ b/include/linux/platform_data/i2c-mux-gpio.h @@ -22,10 +22,6 @@ * position * @n_values: Number of multiplexer positions (busses to instantiate) * @classes: Optional I2C auto-detection classes - * @gpio_chip: Optional GPIO chip name; if set, GPIO pin numbers are given - * relative to the base GPIO number of that chip - * @gpios: Array of GPIO numbers used to control MUX - * @n_gpios: Number of GPIOs used to control MUX * @idle: Bitmask to write to 
MUX when idle or GPIO_I2CMUX_NO_IDLE if not used */ struct i2c_mux_gpio_platform_data { @@ -34,9 +30,6 @@ struct i2c_mux_gpio_platform_data { const unsigned *values; int n_values; const unsigned *classes; - char *gpio_chip; - const unsigned *gpios; - int n_gpios; unsigned idle; }; -- cgit v1.2.3-59-g8ed1b From 62de37da9f382455b983f2f92b10012109005278 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 20 Jun 2019 15:26:29 +0300 Subject: mtd: spi-nor: intel-spi: Convert to use SPDX identifier This gets rid of the license boilerplate duplicated in each file. No functional changes intended. Signed-off-by: Mika Westerberg Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/intel-spi-pci.c | 5 +---- drivers/mtd/spi-nor/intel-spi-platform.c | 5 +---- drivers/mtd/spi-nor/intel-spi.c | 5 +---- drivers/mtd/spi-nor/intel-spi.h | 5 +---- include/linux/platform_data/intel-spi.h | 5 +---- 5 files changed, 5 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/intel-spi-pci.c b/drivers/mtd/spi-nor/intel-spi-pci.c index 578f0c74e536..1b9c2d99ba38 100644 --- a/drivers/mtd/spi-nor/intel-spi-pci.c +++ b/drivers/mtd/spi-nor/intel-spi-pci.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel PCH/PCU SPI flash PCI driver. * * Copyright (C) 2016, Intel Corporation * Author: Mika Westerberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/drivers/mtd/spi-nor/intel-spi-platform.c b/drivers/mtd/spi-nor/intel-spi-platform.c index 5c943df9398f..25b18804e9bb 100644 --- a/drivers/mtd/spi-nor/intel-spi-platform.c +++ b/drivers/mtd/spi-nor/intel-spi-platform.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel PCH/PCU SPI flash platform driver. * * Copyright (C) 2016, Intel Corporation * Author: Mika Westerberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/drivers/mtd/spi-nor/intel-spi.c b/drivers/mtd/spi-nor/intel-spi.c index d60cbf23d9aa..021cef930f9f 100644 --- a/drivers/mtd/spi-nor/intel-spi.c +++ b/drivers/mtd/spi-nor/intel-spi.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel PCH/PCU SPI flash driver. * * Copyright (C) 2016, Intel Corporation * Author: Mika Westerberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/drivers/mtd/spi-nor/intel-spi.h b/drivers/mtd/spi-nor/intel-spi.h index 5ab7dc250050..b03bf296fda3 100644 --- a/drivers/mtd/spi-nor/intel-spi.h +++ b/drivers/mtd/spi-nor/intel-spi.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Intel PCH/PCU SPI flash driver. * * Copyright (C) 2016, Intel Corporation * Author: Mika Westerberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
 */
 
 #ifndef INTEL_SPI_H
diff --git a/include/linux/platform_data/intel-spi.h b/include/linux/platform_data/intel-spi.h
index 942b0c3f8f08..001f377fb5ef 100644
--- a/include/linux/platform_data/intel-spi.h
+++ b/include/linux/platform_data/intel-spi.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Intel PCH/PCU SPI flash driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef INTEL_SPI_PDATA_H
-- cgit v1.2.3-59-g8ed1b

From e67d4dfc9ff19dbe74b29617cf2592ccc50c3920 Mon Sep 17 00:00:00 2001
From: Andrey Smirnov
Date: Wed, 12 Jun 2019 01:44:04 -0700
Subject: power: supply: Add HWMON compatibility layer

Add code implementing a HWMON adapter/compatibility layer to allow
exposing various sensors present on power supply devices via the
HWMON subsystem. This is done in order to allow userspace to use a
single ABI/library (libsensors) to access/manipulate all of the
sensors of the system.

Signed-off-by: Andrey Smirnov
Reviewed-by: Guenter Roeck
Tested-by: Chris Healy
Cc: Chris Healy
Cc: Cory Tusar
Cc: Lucas Stach
Cc: Fabio Estevam
Cc: Guenter Roeck
Cc: Sebastian Reichel
Cc: linux-kernel@vger.kernel.org
Cc: linux-pm@vger.kernel.org
Signed-off-by: Sebastian Reichel
---
 drivers/power/supply/Kconfig              |  14 ++
 drivers/power/supply/Makefile             |   1 +
 drivers/power/supply/power_supply_core.c  |   7 +
 drivers/power/supply/power_supply_hwmon.c | 355 ++++++++++++++++++++++++++++++
 include/linux/power_supply.h              |  13 ++
 5 files changed, 390 insertions(+)
 create mode 100644 drivers/power/supply/power_supply_hwmon.c

(limited to 'include/linux')

diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index 26dacdab03cc..1f2252cb95fd 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -14,6 +14,20 @@ config POWER_SUPPLY_DEBUG
	  Say Y here to enable debugging messages for power supply class
	  and drivers.
 
+config POWER_SUPPLY_HWMON
+	bool
+	prompt "Expose power supply sensors as hwmon device"
+	depends on HWMON=y || HWMON=POWER_SUPPLY
+	default y
+	help
+	  This option enables an API that allows sensors found on a
+	  power supply device (current, voltage, temperature) to be
+	  exposed as a hwmon device.
+
+	  Say 'Y' here if you want power supplies to
+	  have a hwmon sysfs interface too.
+
+
 config PDA_POWER
	tristate "Generic PDA/phone power driver"
	depends on !S390
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index f208273f9686..c47e88ba16b9 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -6,6 +6,7 @@ power_supply-$(CONFIG_SYSFS)		+= power_supply_sysfs.o
 power_supply-$(CONFIG_LEDS_TRIGGERS)	+= power_supply_leds.o
 
 obj-$(CONFIG_POWER_SUPPLY)	+= power_supply.o
+obj-$(CONFIG_POWER_SUPPLY_HWMON) += power_supply_hwmon.o
 obj-$(CONFIG_GENERIC_ADC_BATTERY) += generic-adc-battery.o
 obj-$(CONFIG_PDA_POWER)		+= pda_power.o
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f7033ecf6d0b..35624193a346 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1072,6 +1072,10 @@ __power_supply_register(struct device *parent,
	if (rc)
		goto create_triggers_failed;
 
+	rc = power_supply_add_hwmon_sysfs(psy);
+	if (rc)
+		goto add_hwmon_sysfs_failed;
+
	/*
	 * Update use_cnt after any uevents (most notably from device_add()).
* We are here still during driver's probe but @@ -1090,6 +1094,8 @@ __power_supply_register(struct device *parent, return psy; +add_hwmon_sysfs_failed: + power_supply_remove_triggers(psy); create_triggers_failed: psy_unregister_cooler(psy); register_cooler_failed: @@ -1242,6 +1248,7 @@ void power_supply_unregister(struct power_supply *psy) cancel_work_sync(&psy->changed_work); cancel_delayed_work_sync(&psy->deferred_register_work); sysfs_remove_link(&psy->dev.kobj, "powers"); + power_supply_remove_hwmon_sysfs(psy); power_supply_remove_triggers(psy); psy_unregister_cooler(psy); psy_unregister_thermal(psy); diff --git a/drivers/power/supply/power_supply_hwmon.c b/drivers/power/supply/power_supply_hwmon.c new file mode 100644 index 000000000000..51fe60440d12 --- /dev/null +++ b/drivers/power/supply/power_supply_hwmon.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * power_supply_hwmon.c - power supply hwmon support. + */ + +#include +#include +#include +#include + +struct power_supply_hwmon { + struct power_supply *psy; + unsigned long *props; +}; + +static int power_supply_hwmon_in_to_property(u32 attr) +{ + switch (attr) { + case hwmon_in_average: + return POWER_SUPPLY_PROP_VOLTAGE_AVG; + case hwmon_in_min: + return POWER_SUPPLY_PROP_VOLTAGE_MIN; + case hwmon_in_max: + return POWER_SUPPLY_PROP_VOLTAGE_MAX; + case hwmon_in_input: + return POWER_SUPPLY_PROP_VOLTAGE_NOW; + default: + return -EINVAL; + } +} + +static int power_supply_hwmon_curr_to_property(u32 attr) +{ + switch (attr) { + case hwmon_curr_average: + return POWER_SUPPLY_PROP_CURRENT_AVG; + case hwmon_curr_max: + return POWER_SUPPLY_PROP_CURRENT_MAX; + case hwmon_curr_input: + return POWER_SUPPLY_PROP_CURRENT_NOW; + default: + return -EINVAL; + } +} + +static int power_supply_hwmon_temp_to_property(u32 attr, int channel) +{ + if (channel) { + switch (attr) { + case hwmon_temp_input: + return POWER_SUPPLY_PROP_TEMP_AMBIENT; + case hwmon_temp_min_alarm: + return POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN; + case hwmon_temp_max_alarm: + return POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX; + default: + break; + } + } else { + switch (attr) { + case hwmon_temp_input: + return POWER_SUPPLY_PROP_TEMP; + case hwmon_temp_max: + return POWER_SUPPLY_PROP_TEMP_MAX; + case hwmon_temp_min: + return POWER_SUPPLY_PROP_TEMP_MIN; + case hwmon_temp_min_alarm: + return POWER_SUPPLY_PROP_TEMP_ALERT_MIN; + case hwmon_temp_max_alarm: + return POWER_SUPPLY_PROP_TEMP_ALERT_MAX; + default: + break; + } + } + + return -EINVAL; +} + +static int +power_supply_hwmon_to_property(enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_in: + return power_supply_hwmon_in_to_property(attr); + case hwmon_curr: + return power_supply_hwmon_curr_to_property(attr); + case hwmon_temp: + return power_supply_hwmon_temp_to_property(attr, channel); + default: + return -EINVAL; + } +} + +static bool power_supply_hwmon_is_a_label(enum hwmon_sensor_types type, + u32 attr) +{ + return type == hwmon_temp && attr == hwmon_temp_label; +} + +static bool power_supply_hwmon_is_writable(enum hwmon_sensor_types type, + u32 attr) +{ + switch (type) { + case hwmon_in: + return attr == hwmon_in_min || + attr == hwmon_in_max; + case hwmon_curr: + return attr == hwmon_curr_max; + case hwmon_temp: + return attr == hwmon_temp_max || + attr == hwmon_temp_min || + attr == hwmon_temp_min_alarm || + attr == hwmon_temp_max_alarm; + default: + return false; + } +} + +static umode_t power_supply_hwmon_is_visible(const void *data, + enum hwmon_sensor_types 
type,
+					     u32 attr, int channel)
+{
+	const struct power_supply_hwmon *psyhw = data;
+	int prop;
+
+	if (power_supply_hwmon_is_a_label(type, attr))
+		return 0444;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0 || !test_bit(prop, psyhw->props))
+		return 0;
+
+	if (power_supply_property_is_writeable(psyhw->psy, prop) > 0 &&
+	    power_supply_hwmon_is_writable(type, attr))
+		return 0644;
+
+	return 0444;
+}
+
+static int power_supply_hwmon_read_string(struct device *dev,
+					  enum hwmon_sensor_types type,
+					  u32 attr, int channel,
+					  const char **str)
+{
+	*str = channel ? "temp ambient" : "temp";
+	return 0;
+}
+
+static int
+power_supply_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+			u32 attr, int channel, long *val)
+{
+	struct power_supply_hwmon *psyhw = dev_get_drvdata(dev);
+	struct power_supply *psy = psyhw->psy;
+	union power_supply_propval pspval;
+	int ret, prop;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0)
+		return prop;
+
+	ret = power_supply_get_property(psy, prop, &pspval);
+	if (ret)
+		return ret;
+
+	switch (type) {
+	/*
+	 * Both voltage and current are reported in units of
+	 * microvolts/microamps, so we need to adjust them to
+	 * millivolts/milliamps.
+	 */
+	case hwmon_curr:
+	case hwmon_in:
+		pspval.intval = DIV_ROUND_CLOSEST(pspval.intval, 1000);
+		break;
+	/*
+	 * Temp needs to be converted from 1/10 C to milli-C
+	 */
+	case hwmon_temp:
+		if (check_mul_overflow(pspval.intval, 100,
+				       &pspval.intval))
+			return -EOVERFLOW;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*val = pspval.intval;
+
+	return 0;
+}
+
+static int
+power_supply_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+			 u32 attr, int channel, long val)
+{
+	struct power_supply_hwmon *psyhw = dev_get_drvdata(dev);
+	struct power_supply *psy = psyhw->psy;
+	union power_supply_propval pspval;
+	int prop;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0)
+		return prop;
+
+	pspval.intval = val;
+
+	switch (type) {
+	/*
+	 * Both voltage and current are given in units of
+	 * millivolts/milliamps, so we need to adjust them to
+	 * microvolts/microamps.
+	 */
+	case hwmon_curr:
+	case hwmon_in:
+		if (check_mul_overflow(pspval.intval, 1000,
+				       &pspval.intval))
+			return -EOVERFLOW;
+		break;
+	/*
+	 * Temp needs to be converted from milli-C to 1/10 C
+	 */
+	case hwmon_temp:
+		pspval.intval = DIV_ROUND_CLOSEST(pspval.intval, 100);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return power_supply_set_property(psy, prop, &pspval);
+}
+
+static const struct hwmon_ops power_supply_hwmon_ops = {
+	.is_visible	= power_supply_hwmon_is_visible,
+	.read		= power_supply_hwmon_read,
+	.write		= power_supply_hwmon_write,
+	.read_string	= power_supply_hwmon_read_string,
+};
+
+static const struct hwmon_channel_info *power_supply_hwmon_info[] = {
+	HWMON_CHANNEL_INFO(temp,
+			   HWMON_T_LABEL     |
+			   HWMON_T_INPUT     |
+			   HWMON_T_MAX       |
+			   HWMON_T_MIN       |
+			   HWMON_T_MIN_ALARM |
+			   HWMON_T_MAX_ALARM,
+
+			   HWMON_T_LABEL |
+			   HWMON_T_INPUT |
+			   HWMON_T_MIN_ALARM |
+			   HWMON_T_MAX_ALARM),
+
+	HWMON_CHANNEL_INFO(curr,
+			   HWMON_C_AVERAGE |
+			   HWMON_C_MAX |
+			   HWMON_C_INPUT),
+
+	HWMON_CHANNEL_INFO(in,
+			   HWMON_I_AVERAGE |
+			   HWMON_I_MIN |
+			   HWMON_I_MAX |
+			   HWMON_I_INPUT),
+	NULL
+};
+
+static const struct hwmon_chip_info power_supply_hwmon_chip_info = {
+	.ops = &power_supply_hwmon_ops,
+	.info = power_supply_hwmon_info,
+};
+
+static void power_supply_hwmon_bitmap_free(void *data)
+{
+	bitmap_free(data);
+}
+
+int power_supply_add_hwmon_sysfs(struct
power_supply *psy) +{ + const struct power_supply_desc *desc = psy->desc; + struct power_supply_hwmon *psyhw; + struct device *dev = &psy->dev; + struct device *hwmon; + int ret, i; + + if (!devres_open_group(dev, power_supply_add_hwmon_sysfs, + GFP_KERNEL)) + return -ENOMEM; + + psyhw = devm_kzalloc(dev, sizeof(*psyhw), GFP_KERNEL); + if (!psyhw) { + ret = -ENOMEM; + goto error; + } + + psyhw->psy = psy; + psyhw->props = bitmap_zalloc(POWER_SUPPLY_PROP_TIME_TO_FULL_AVG + 1, + GFP_KERNEL); + if (!psyhw->props) { + ret = -ENOMEM; + goto error; + } + + ret = devm_add_action(dev, power_supply_hwmon_bitmap_free, + psyhw->props); + if (ret) + goto error; + + for (i = 0; i < desc->num_properties; i++) { + const enum power_supply_property prop = desc->properties[i]; + + switch (prop) { + case POWER_SUPPLY_PROP_CURRENT_AVG: + case POWER_SUPPLY_PROP_CURRENT_MAX: + case POWER_SUPPLY_PROP_CURRENT_NOW: + case POWER_SUPPLY_PROP_TEMP: + case POWER_SUPPLY_PROP_TEMP_MAX: + case POWER_SUPPLY_PROP_TEMP_MIN: + case POWER_SUPPLY_PROP_TEMP_ALERT_MIN: + case POWER_SUPPLY_PROP_TEMP_ALERT_MAX: + case POWER_SUPPLY_PROP_TEMP_AMBIENT: + case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN: + case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX: + case POWER_SUPPLY_PROP_VOLTAGE_AVG: + case POWER_SUPPLY_PROP_VOLTAGE_MIN: + case POWER_SUPPLY_PROP_VOLTAGE_MAX: + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + set_bit(prop, psyhw->props); + break; + default: + break; + } + } + + hwmon = devm_hwmon_device_register_with_info(dev, psy->desc->name, + psyhw, + &power_supply_hwmon_chip_info, + NULL); + ret = PTR_ERR_OR_ZERO(hwmon); + if (ret) + goto error; + + devres_close_group(dev, power_supply_add_hwmon_sysfs); + return 0; +error: + devres_release_group(dev, NULL); + return ret; +} + +void power_supply_remove_hwmon_sysfs(struct power_supply *psy) +{ + devres_release_group(&psy->dev, power_supply_add_hwmon_sysfs); +} diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index d9c0c094f8a0..d5b15e039f4f 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -481,4 +481,17 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp) return 0; } +#ifdef CONFIG_POWER_SUPPLY_HWMON +int power_supply_add_hwmon_sysfs(struct power_supply *psy); +void power_supply_remove_hwmon_sysfs(struct power_supply *psy); +#else +static inline int power_supply_add_hwmon_sysfs(struct power_supply *psy) +{ + return 0; +} + +static inline +void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {} +#endif + #endif /* __LINUX_POWER_SUPPLY_H__ */ -- cgit v1.2.3-59-g8ed1b From 38b37d631aec80da0c65ac03a7ef680b468c7857 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Fri, 7 Jun 2019 12:49:11 +0200 Subject: module: allow arch overrides for .exit section names Some archs like ARM store unwind information for .exit.text in sections with unusual names. As this unwind information refers to .exit.text, it must not be loaded when .exit.text is not loaded (when CONFIG_MODULE_UNLOAD is unset); otherwise, loading a module can fail due to relocation failures. 
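
For illustration (not part of this patch): the change below adds a __weak
default, so an architecture only has to supply a stronger definition of
module_exit_section(). A sketch of what such an override could look like,
modeled on the ARM case that motivated the change (the extra section names
here are illustrative assumptions):

	bool module_exit_section(const char *name)
	{
		/*
		 * ARM stores unwind data for .exit.text in separate
		 * sections; treat those as exit sections too, so they
		 * are dropped together with .exit.text itself when
		 * CONFIG_MODULE_UNLOAD is unset.
		 */
		return strstarts(name, ".exit") ||
		       strstarts(name, ".ARM.extab.exit") ||
		       strstarts(name, ".ARM.exidx.exit");
	}
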
Signed-off-by: Matthias Schiffer Signed-off-by: Jessica Yu --- include/linux/moduleloader.h | 5 +++++ kernel/module.c | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 31013c2effd3..5229c18025e9 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -29,6 +29,11 @@ void *module_alloc(unsigned long size); /* Free memory returned from module_alloc. */ void module_memfree(void *module_region); +/* Determines if the section name is an exit section (that is only used during + * module unloading) + */ +bool module_exit_section(const char *name); + /* * Apply the given relocation to the (simplified) ELF. Return -error * or 0. diff --git a/kernel/module.c b/kernel/module.c index 41258bab24f1..537c456ce3ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2748,6 +2748,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_exit_section(const char *name) +{ + return strstarts(name, ".exit"); +} + #ifdef CONFIG_DEBUG_KMEMLEAK static void kmemleak_load_module(const struct module *mod, const struct load_info *info) @@ -2937,7 +2942,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + if (module_exit_section(info->secstrings+shdr->sh_name)) shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; #endif } -- cgit v1.2.3-59-g8ed1b From 38ca87c6f1e514686d4a385246d1afe1e1f2e482 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:46 +0300 Subject: RDMA/mlx5: Introduce and implement new IB_WR_REG_MR_INTEGRITY work request This new WR will be used to perform PI (protection information) handover using the new API. Using the new API, the user will post a single WR that will internally perform all the needed actions to complete PI operation. This new WR will use a memory region that was allocated as IB_MR_TYPE_INTEGRITY and was mapped using ib_map_mr_sg_pi to perform the registration. In the old API, in order to perform a signature handover operation, each ULP should perform the following: 1. Map and register the data buffers. 2. Map and register the protection buffers. 3. Post a special reg WR to configure the signature handover operation layout. 4. Invalidate the signature memory key. 5. Invalidate protection buffers memory key. 6. Invalidate data buffers memory key. In the new API, the mapping of both data and protection buffers is performed using a single call to ib_map_mr_sg_pi function. Also the registration of the buffers and the configuration of the signature operation layout is done by a single new work request called IB_WR_REG_MR_INTEGRITY. This patch implements this operation for mlx5 devices that are capable to offload data integrity generation/validation while performing the actual buffer transfer. This patch will not remove the old signature API that is used by the iSER initiator and target drivers. This will be done in the future. In the internal implementation, for each IB_WR_REG_MR_INTEGRITY work request, we are using a single UMR operation to register both data and protection buffers using KLM's. Afterwards, another UMR operation will describe the strided block format. These will be followed by 2 SET_PSV operations to set the memory/wire domains initial signature parameters passed by the user. 
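
For illustration (not part of this patch), a ULP's registration path with
the new API could look roughly like the sketch below; mr (allocated as
IB_MR_TYPE_INTEGRITY with ib_alloc_mr_integrity()), qp and the data and
protection scatterlists are assumed context, and error handling is elided:

	struct ib_reg_wr wr = {};
	const struct ib_send_wr *bad_wr;
	int count, ret;

	/* one call maps both the data and the protection SG lists */
	count = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
				prot_sg, prot_nents, NULL, SZ_4K);
	if (count < 0)
		return count;

	/* a single WR registers the MR and configures the PI operation */
	wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
	wr.wr.send_flags = IB_SEND_SIGNALED;
	wr.mr = mr;
	wr.key = mr->rkey;
	wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
		    IB_ACCESS_REMOTE_WRITE;
	ret = ib_post_send(qp, &wr.wr, &bad_wr);
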
In the end of the whole transaction, only the signature memory key (the one that exposed for the RDMA operation) will be invalidated. Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 218 ++++++++++++++++++++++++++++++++++++---- include/linux/mlx5/qp.h | 3 +- include/rdma/ib_verbs.h | 1 + 3 files changed, 201 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ce8fccb04c3c..f6651b93e469 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4169,7 +4169,7 @@ static __be64 sig_mkey_mask(void) static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, struct mlx5_ib_mr *mr, u8 flags) { - int size = mr->ndescs * mr->desc_size; + int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; memset(umr, 0, sizeof(*umr)); @@ -4300,7 +4300,7 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, struct mlx5_ib_mr *mr, u32 key, int access) { - int ndescs = ALIGN(mr->ndescs, 8) >> 1; + int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1; memset(seg, 0, sizeof(*seg)); @@ -4351,7 +4351,7 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, struct mlx5_ib_mr *mr, struct mlx5_ib_pd *pd) { - int bcount = mr->desc_size * mr->ndescs; + int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs); dseg->addr = cpu_to_be64(mr->desc_map); dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); @@ -4544,23 +4544,52 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr, return 0; } -static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, - struct mlx5_ib_qp *qp, void **seg, - int *size, void **cur_edge) +static int set_sig_data_segment(const struct ib_send_wr *send_wr, + struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) { - struct ib_sig_attrs *sig_attrs = wr->sig_attrs; - struct ib_mr *sig_mr = wr->sig_mr; struct mlx5_bsf *bsf; - u32 data_len = wr->wr.sg_list->length; - u32 data_key = wr->wr.sg_list->lkey; - u64 data_va = wr->wr.sg_list->addr; + u32 data_len; + u32 data_key; + u64 data_va; + u32 prot_len = 0; + u32 prot_key = 0; + u64 prot_va = 0; + bool prot = false; int ret; int wqe_size; - if (!wr->prot || - (data_key == wr->prot->lkey && - data_va == wr->prot->addr && - data_len == wr->prot->length)) { + if (send_wr->opcode == IB_WR_REG_SIG_MR) { + const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); + + data_len = wr->wr.sg_list->length; + data_key = wr->wr.sg_list->lkey; + data_va = wr->wr.sg_list->addr; + if (wr->prot) { + prot_len = wr->prot->length; + prot_key = wr->prot->lkey; + prot_va = wr->prot->addr; + prot = true; + } + } else { + struct mlx5_ib_mr *mr = to_mmr(sig_mr); + struct mlx5_ib_mr *pi_mr = mr->pi_mr; + + data_len = pi_mr->data_length; + data_key = pi_mr->ibmr.lkey; + data_va = pi_mr->ibmr.iova; + if (pi_mr->meta_ndescs) { + prot_len = pi_mr->meta_length; + prot_key = pi_mr->ibmr.lkey; + prot_va = pi_mr->ibmr.iova + data_len; + prot = true; + } + } + + if (!prot || (data_key == prot_key && data_va == prot_va && + data_len == prot_len)) { /** * Source domain doesn't contain signature information * or data and protection are interleaved in memory. 
@@ -4594,8 +4623,6 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, struct mlx5_stride_block_ctrl_seg *sblock_ctrl; struct mlx5_stride_block_entry *data_sentry; struct mlx5_stride_block_entry *prot_sentry; - u32 prot_key = wr->prot->lkey; - u64 prot_va = wr->prot->addr; u16 block_size = sig_attrs->mem.sig.dif.pi_interval; int prot_size; @@ -4673,6 +4700,56 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, umr->mkey_mask = sig_mkey_mask(); } +static int set_pi_umr_wr(const struct ib_send_wr *send_wr, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + const struct ib_reg_wr *wr = reg_wr(send_wr); + struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr); + struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr; + struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs; + u32 pdn = get_pd(qp)->pdn; + u32 xlt_size; + int region_len, ret; + + if (unlikely(send_wr->num_sge != 0) || + unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = pi_mr->ibmr.length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE) + xlt_size = 0x30; + else + xlt_size = sizeof(struct mlx5_klm); + + set_sig_umr_segment(*seg, xlt_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len, + pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size, + cur_edge); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} static int set_sig_umr_wr(const struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, void **seg, int *size, @@ -4716,7 +4793,8 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr, *size += sizeof(struct mlx5_mkey_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - ret = set_sig_data_segment(wr, qp, seg, size, cur_edge); + ret = set_sig_data_segment(send_wr, wr->sig_mr, wr->sig_attrs, qp, seg, + size, cur_edge); if (ret) return ret; @@ -4758,7 +4836,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); - size_t mr_list_size = mr->ndescs * mr->desc_size; + int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; u8 flags = 0; @@ -4899,8 +4977,11 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; + struct ib_reg_wr reg_pi_wr; struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *pi_mr; + struct ib_sig_attrs *sig_attrs; struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf; void *cur_edge; @@ -4954,7 +5035,8 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, goto out; } - if (wr->opcode == IB_WR_REG_MR) { + if (wr->opcode == IB_WR_REG_MR || + wr->opcode == 
IB_WR_REG_MR_INTEGRITY) { fence = dev->umr_fence; next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; } else { @@ -5012,6 +5094,102 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, num_sge = 0; break; + case IB_WR_REG_MR_INTEGRITY: + memset(®_pi_wr, 0, sizeof(struct ib_reg_wr)); + + mr = to_mmr(reg_wr(wr)->mr); + pi_mr = mr->pi_mr; + + reg_pi_wr.mr = &pi_mr->ibmr; + reg_pi_wr.access = reg_wr(wr)->access; + reg_pi_wr.key = pi_mr->ibmr.rkey; + + qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY; + ctrl->imm = cpu_to_be32(reg_pi_wr.key); + /* UMR for data + protection registration */ + err = set_reg_wr(qp, ®_pi_wr, &seg, &size, + &cur_edge, false); + if (err) { + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, fence, + MLX5_OPCODE_UMR); + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + /* UMR for sig MR */ + err = set_pi_umr_wr(wr, qp, &seg, &size, + &cur_edge); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, fence, + MLX5_OPCODE_UMR); + + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + sig_attrs = mr->ibmr.sig_attrs; + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq, false, + true); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + err = set_psv_wr(&sig_attrs->mem, + mr->sig->psv_memory.psv_idx, + &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, next_fence, + MLX5_OPCODE_SET_PSV); + + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq, false, + true); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + err = set_psv_wr(&sig_attrs->wire, + mr->sig->psv_wire.psv_idx, + &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, next_fence, + MLX5_OPCODE_SET_PSV); + + qp->next_fence = + MLX5_FENCE_MODE_INITIATOR_SMALL; + num_sge = 0; + goto skip_psv; + case IB_WR_REG_SIG_MR: qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; mr = to_mmr(sig_handover_wr(wr)->sig_mr); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3ba4edbd17a6..08e43cd9e742 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -37,7 +37,8 @@ #include #define MLX5_INVALID_LKEY 0x100 -#define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 5) +/* UMR (3 WQE_BB's) + SIG (3 WQE_BB's) + PSV (mem) + PSV (wire) */ +#define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 8) #define MLX5_DIF_SIZE 8 #define MLX5_STRIDE_BLOCK_OP 0x400 #define MLX5_CPY_GRD_MASK 0xc0 diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9169e798334f..28db256cbdb9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1236,6 +1236,7 @@ enum ib_wr_opcode { /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, IB_WR_REG_SIG_MR, + IB_WR_REG_MR_INTEGRITY, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. 
-- cgit v1.2.3-59-g8ed1b From ec6516bfbaf72e7c81811162b6de96322e32a027 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Jun 2019 10:55:31 +0900 Subject: pinctrl: remove unneeded #ifdef around declarations What is the point in surrounding the whole of declarations with ifdef like this? #ifdef CONFIG_FOO int foo(void); #endif If CONFIG_FOO is not defined, all callers of foo() will fail with implicit declaration errors since the top Makefile adds -Werror-implicit-function-declaration to KBUILD_CFLAGS. This breaks the build earlier when you are doing something wrong. That's it. Anyway, it will fail to link since the definition of foo() is not compiled. In summary, these ifdef are unneeded. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 20 ++++++-------------- include/linux/pinctrl/pinconf.h | 4 ---- include/linux/pinctrl/pinctrl.h | 4 ---- include/linux/pinctrl/pinmux.h | 4 ---- 4 files changed, 6 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 72d06d6a3099..673828a52294 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -12,6 +12,12 @@ #ifndef __LINUX_PINCTRL_PINCONF_GENERIC_H #define __LINUX_PINCTRL_PINCONF_GENERIC_H +#include +#include + +struct pinctrl_dev; +struct pinctrl_map; + /** * enum pin_config_param - possible pin configuration parameters * @PIN_CONFIG_BIAS_BUS_HOLD: the pin will be set to weakly latch so that it @@ -159,9 +165,6 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param return PIN_CONF_PACKED(param, argument); } -#ifdef CONFIG_GENERIC_PINCONF - -#ifdef CONFIG_DEBUG_FS #define PCONFDUMP(a, b, c, d) { \ .param = a, .display = b, .format = c, .has_arg = d \ } @@ -172,14 +175,6 @@ struct pin_config_item { const char * const format; bool has_arg; }; -#endif /* CONFIG_DEBUG_FS */ - -#ifdef CONFIG_OF - -#include -#include -struct pinctrl_dev; -struct pinctrl_map; struct pinconf_generic_params { const char * const property; @@ -224,8 +219,5 @@ static inline int pinconf_generic_dt_node_to_map_all( return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps, PIN_MAP_TYPE_INVALID); } -#endif - -#endif /* CONFIG_GENERIC_PINCONF */ #endif /* __LINUX_PINCTRL_PINCONF_GENERIC_H */ diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h index 9bebc3554809..513883dcc5a9 100644 --- a/include/linux/pinctrl/pinconf.h +++ b/include/linux/pinctrl/pinconf.h @@ -12,8 +12,6 @@ #ifndef __LINUX_PINCTRL_PINCONF_H #define __LINUX_PINCTRL_PINCONF_H -#ifdef CONFIG_PINCONF - #include struct pinctrl_dev; @@ -67,6 +65,4 @@ struct pinconf_ops { unsigned long config); }; -#endif - #endif /* __LINUX_PINCTRL_PINCONF_H */ diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 36a79fe7b84f..27738164daa7 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -12,8 +12,6 @@ #ifndef __LINUX_PINCTRL_PINCTRL_H #define __LINUX_PINCTRL_PINCTRL_H -#ifdef CONFIG_PINCTRL - #include #include #include @@ -203,6 +201,4 @@ extern const char *pinctrl_dev_get_name(struct pinctrl_dev *pctldev); extern const char *pinctrl_dev_get_devname(struct pinctrl_dev *pctldev); extern void *pinctrl_dev_get_drvdata(struct pinctrl_dev *pctldev); -#endif /* !CONFIG_PINCTRL */ - #endif /* __LINUX_PINCTRL_PINCTRL_H */ diff --git a/include/linux/pinctrl/pinmux.h 
b/include/linux/pinctrl/pinmux.h
index ace60d775b20..566a5fe8eab5 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -16,8 +16,6 @@
 #include
 #include
 
-#ifdef CONFIG_PINMUX
-
 struct pinctrl_dev;
 
 /**
@@ -85,6 +83,4 @@ struct pinmux_ops {
	bool strict;
 };
 
-#endif /* CONFIG_PINMUX */
-
 #endif /* __LINUX_PINCTRL_PINMUX_H */
-- cgit v1.2.3-59-g8ed1b

From 2f25528e4edddc6eddd42c8d41c9c9e341c8b9da Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki
Date: Wed, 19 Jun 2019 11:39:25 +0200
Subject: clk: Add clk_bulk_get_optional() function

clk_bulk_get_optional() allows getting a group of clocks where one or
more are optional. For an unavailable clock, e.g. one not specified in
the clock consumer node in DT, its respective struct clk pointer will
be NULL. This allows for operating on a group of returned clocks
(struct clk_bulk_data array) with existing clk_bulk* APIs.

Signed-off-by: Sylwester Nawrocki
Signed-off-by: Stephen Boyd
---
 drivers/clk/clk-bulk.c | 23 ++++++++++++++++++++---
 include/linux/clk.h    | 19 +++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c
index 06499568cf07..524bf9a53098 100644
--- a/drivers/clk/clk-bulk.c
+++ b/drivers/clk/clk-bulk.c
@@ -75,8 +75,8 @@ void clk_bulk_put(int num_clks, struct clk_bulk_data *clks)
 }
 EXPORT_SYMBOL_GPL(clk_bulk_put);
 
-int __must_check clk_bulk_get(struct device *dev, int num_clks,
-			      struct clk_bulk_data *clks)
+static int __clk_bulk_get(struct device *dev, int num_clks,
+			  struct clk_bulk_data *clks, bool optional)
 {
	int ret;
	int i;
@@ -88,10 +88,14 @@ int __must_check clk_bulk_get(struct device *dev, int num_clks,
		clks[i].clk = clk_get(dev, clks[i].id);
		if (IS_ERR(clks[i].clk)) {
			ret = PTR_ERR(clks[i].clk);
+			clks[i].clk = NULL;
+
+			if (ret == -ENOENT && optional)
+				continue;
+
			if (ret != -EPROBE_DEFER)
				dev_err(dev, "Failed to get clk '%s': %d\n",
					clks[i].id, ret);
-			clks[i].clk = NULL;
			goto err;
		}
	}
@@ -103,8 +107,21 @@ err:
 
	return ret;
 }
+
+int __must_check clk_bulk_get(struct device *dev, int num_clks,
+			      struct clk_bulk_data *clks)
+{
+	return __clk_bulk_get(dev, num_clks, clks, false);
+}
 EXPORT_SYMBOL(clk_bulk_get);
 
+int __must_check clk_bulk_get_optional(struct device *dev, int num_clks,
+				       struct clk_bulk_data *clks)
+{
+	return __clk_bulk_get(dev, num_clks, clks, true);
+}
+EXPORT_SYMBOL_GPL(clk_bulk_get_optional);
+
 void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks)
 {
	if (IS_ERR_OR_NULL(clks))
diff --git a/include/linux/clk.h b/include/linux/clk.h
index f689fc58d7be..1b50e7d1675c 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -332,6 +332,19 @@ int __must_check clk_bulk_get(struct device *dev, int num_clks,
  */
 int __must_check clk_bulk_get_all(struct device *dev,
				   struct clk_bulk_data **clks);
+
+/**
+ * clk_bulk_get_optional - lookup and obtain a number of references to clock producer
+ * @dev: device for clock "consumer"
+ * @num_clks: the number of clk_bulk_data
+ * @clks: the clk_bulk_data table of consumer
+ *
+ * Behaves the same as clk_bulk_get() except where there is no clock producer.
+ * In this case, instead of returning -ENOENT, the function returns 0 and
+ * NULL for a clk for which a clock producer could not be determined.
+ */ +int __must_check clk_bulk_get_optional(struct device *dev, int num_clks, + struct clk_bulk_data *clks); /** * devm_clk_bulk_get - managed get multiple clk consumers * @dev: device for clock "consumer" @@ -718,6 +731,12 @@ static inline int __must_check clk_bulk_get(struct device *dev, int num_clks, return 0; } +static inline int __must_check clk_bulk_get_optional(struct device *dev, + int num_clks, struct clk_bulk_data *clks) +{ + return 0; +} + static inline int __must_check clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks) { -- cgit v1.2.3-59-g8ed1b From 9bd5ef0bd8743700d9adffb6fbb1baa346575457 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Wed, 19 Jun 2019 11:39:26 +0200 Subject: clk: Add devm_clk_bulk_get_optional() function Add managed version of the clk_bulk_get_optional() helper function. Signed-off-by: Sylwester Nawrocki [sboyd@kernel.org: Mark __devm_clk_bulk_get() static] Signed-off-by: Stephen Boyd --- drivers/clk/clk-devres.c | 22 +++++++++++++++++++--- include/linux/clk.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c index daa1fc8fba53..be160764911b 100644 --- a/drivers/clk/clk-devres.c +++ b/drivers/clk/clk-devres.c @@ -52,8 +52,8 @@ static void devm_clk_bulk_release(struct device *dev, void *res) clk_bulk_put(devres->num_clks, devres->clks); } -int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, - struct clk_bulk_data *clks) +static int __devm_clk_bulk_get(struct device *dev, int num_clks, + struct clk_bulk_data *clks, bool optional) { struct clk_bulk_devres *devres; int ret; @@ -63,7 +63,10 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, if (!devres) return -ENOMEM; - ret = clk_bulk_get(dev, num_clks, clks); + if (optional) + ret = clk_bulk_get_optional(dev, num_clks, clks); + else + ret = clk_bulk_get(dev, num_clks, clks); if (!ret) { devres->clks = clks; devres->num_clks = num_clks; @@ -74,8 +77,21 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, return ret; } + +int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, + struct clk_bulk_data *clks) +{ + return __devm_clk_bulk_get(dev, num_clks, clks, false); +} EXPORT_SYMBOL_GPL(devm_clk_bulk_get); +int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks, + struct clk_bulk_data *clks) +{ + return __devm_clk_bulk_get(dev, num_clks, clks, true); +} +EXPORT_SYMBOL_GPL(devm_clk_bulk_get_optional); + int __must_check devm_clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks) { diff --git a/include/linux/clk.h b/include/linux/clk.h index 1b50e7d1675c..5e7b2dd84965 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -359,6 +359,28 @@ int __must_check clk_bulk_get_optional(struct device *dev, int num_clks, */ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, struct clk_bulk_data *clks); +/** + * devm_clk_bulk_get_optional - managed get multiple optional consumer clocks + * @dev: device for clock "consumer" + * @clks: pointer to the clk_bulk_data table of consumer + * + * Behaves the same as devm_clk_bulk_get() except where there is no clock + * producer. In this case, instead of returning -ENOENT, the function returns + * NULL for given clk. It is assumed all clocks in clk_bulk_data are optional. 
+ * + * Returns 0 if all clocks specified in clk_bulk_data table are obtained + * successfully or for any clk there was no clk provider available, otherwise + * returns valid IS_ERR() condition containing errno. + * The implementation uses @dev and @clk_bulk_data.id to determine the + * clock consumer, and thereby the clock producer. + * The clock returned is stored in each @clk_bulk_data.clk field. + * + * Drivers must assume that the clock source is not enabled. + * + * clk_bulk_get should not be called from within interrupt context. + */ +int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks, + struct clk_bulk_data *clks); /** * devm_clk_bulk_get_all - managed get multiple clk consumers * @dev: device for clock "consumer" @@ -760,6 +782,12 @@ static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clk return 0; } +static inline int __must_check devm_clk_bulk_get_optional(struct device *dev, + int num_clks, struct clk_bulk_data *clks) +{ + return 0; +} + static inline int __must_check devm_clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks) { -- cgit v1.2.3-59-g8ed1b From 550113d4e9f5c7b62be760fc01178c9e0139c1f4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 24 Jun 2019 19:04:02 +0200 Subject: i2c: add newly exported functions to the header, too Nobody (including me) noticed that these functions were exported but not added to the header :/ Fixes: 7159dbdae3c5 ("i2c: core: improve return value handling of i2c_new_device and i2c_new_dummy") Signed-off-by: Wolfram Sang Reviewed-by: Bartosz Golaszewski Reviewed-by: Kieran Bingham Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 5 ++--- include/linux/i2c.h | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index e7d5ada40d48..f1949d1e2b54 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -729,7 +729,7 @@ static int i2c_dev_irq_from_resources(const struct resource *resources, * This returns the new i2c client, which may be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ -static struct i2c_client * +struct i2c_client * i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info) { struct i2c_client *client; @@ -895,8 +895,7 @@ static struct i2c_driver dummy_driver = { * This returns the new i2c client, which should be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ -static struct i2c_client * -i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address) +struct i2c_client *i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address) { struct i2c_board_info info = { I2C_BOARD_INFO("dummy", address), diff --git a/include/linux/i2c.h b/include/linux/i2c.h index d8f9060179d0..fa5552c2307b 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -442,6 +442,9 @@ struct i2c_board_info { extern struct i2c_client * i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info); +extern struct i2c_client * +i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info); + /* If you don't know the exact address of an I2C device, use this variant * instead, which can probe for device presence in a list of possible * addresses. The "probe" callback function is optional. 
If it is provided, @@ -463,6 +466,9 @@ extern int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short ad extern struct i2c_client * i2c_new_dummy(struct i2c_adapter *adap, u16 address); +extern struct i2c_client * +i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address); + extern struct i2c_client * devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address); -- cgit v1.2.3-59-g8ed1b From 2a6a7aacd4e557a4c7007f8858bcc9654b098fea Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 3 Jun 2019 10:24:32 +0300 Subject: mfd: regulator: clk: Split rohm-bd718x7.h Split the bd718x7.h to ROHM common and bd718x7 specific parts so that we do not need to add same things in every new ROHM PMIC header. Please note that this change requires changes also in bd718x7 sub-device drivers for regulators and clk. Signed-off-by: Matti Vaittinen Acked-by: Mark Brown Acked-by: Stephen Boyd Signed-off-by: Lee Jones --- drivers/clk/clk-bd718x7.c | 6 +++--- drivers/mfd/rohm-bd718x7.c | 23 ++++++++++++----------- drivers/regulator/bd718x7-regulator.c | 25 +++++++++++++------------ include/linux/mfd/rohm-bd718x7.h | 22 ++++++++-------------- include/linux/mfd/rohm-generic.h | 20 ++++++++++++++++++++ 5 files changed, 56 insertions(+), 40 deletions(-) create mode 100644 include/linux/mfd/rohm-generic.h (limited to 'include/linux') diff --git a/drivers/clk/clk-bd718x7.c b/drivers/clk/clk-bd718x7.c index 60422c72d142..461228ebf703 100644 --- a/drivers/clk/clk-bd718x7.c +++ b/drivers/clk/clk-bd718x7.c @@ -17,7 +17,7 @@ struct bd718xx_clk { u8 reg; u8 mask; struct platform_device *pdev; - struct bd718xx *mfd; + struct rohm_regmap_dev *mfd; }; static int bd71837_clk_set(struct clk_hw *hw, int status) @@ -68,7 +68,7 @@ static int bd71837_clk_probe(struct platform_device *pdev) int rval = -ENOMEM; const char *parent_clk; struct device *parent = pdev->dev.parent; - struct bd718xx *mfd = dev_get_drvdata(parent); + struct rohm_regmap_dev *mfd = dev_get_drvdata(parent); struct clk_init_data init = { .name = "bd718xx-32k-out", .ops = &bd71837_clk_ops, @@ -119,5 +119,5 @@ static struct platform_driver bd71837_clk = { module_platform_driver(bd71837_clk); MODULE_AUTHOR("Matti Vaittinen "); -MODULE_DESCRIPTION("BD71837 chip clk driver"); +MODULE_DESCRIPTION("BD71837/BD71847 chip clk driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/rohm-bd718x7.c b/drivers/mfd/rohm-bd718x7.c index a29d529a96f4..7beb444a57cb 100644 --- a/drivers/mfd/rohm-bd718x7.c +++ b/drivers/mfd/rohm-bd718x7.c @@ -98,18 +98,19 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, return -ENOMEM; bd718xx->chip_irq = i2c->irq; - bd718xx->chip_type = (unsigned int)(uintptr_t) + bd718xx->chip.chip_type = (unsigned int)(uintptr_t) of_device_get_match_data(&i2c->dev); - bd718xx->dev = &i2c->dev; + bd718xx->chip.dev = &i2c->dev; dev_set_drvdata(&i2c->dev, bd718xx); - bd718xx->regmap = devm_regmap_init_i2c(i2c, &bd718xx_regmap_config); - if (IS_ERR(bd718xx->regmap)) { + bd718xx->chip.regmap = devm_regmap_init_i2c(i2c, + &bd718xx_regmap_config); + if (IS_ERR(bd718xx->chip.regmap)) { dev_err(&i2c->dev, "regmap initialization failed\n"); - return PTR_ERR(bd718xx->regmap); + return PTR_ERR(bd718xx->chip.regmap); } - ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->regmap, + ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->chip.regmap, bd718xx->chip_irq, IRQF_ONESHOT, 0, &bd718xx_irq_chip, &bd718xx->irq_data); if (ret) { @@ -118,7 +119,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, } /* Configure short 
press to 10 milliseconds */ - ret = regmap_update_bits(bd718xx->regmap, + ret = regmap_update_bits(bd718xx->chip.regmap, BD718XX_REG_PWRONCONFIG0, BD718XX_PWRBTN_PRESS_DURATION_MASK, BD718XX_PWRBTN_SHORT_PRESS_10MS); @@ -129,7 +130,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, } /* Configure long press to 10 seconds */ - ret = regmap_update_bits(bd718xx->regmap, + ret = regmap_update_bits(bd718xx->chip.regmap, BD718XX_REG_PWRONCONFIG1, BD718XX_PWRBTN_PRESS_DURATION_MASK, BD718XX_PWRBTN_LONG_PRESS_10S); @@ -149,7 +150,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, button.irq = ret; - ret = devm_mfd_add_devices(bd718xx->dev, PLATFORM_DEVID_AUTO, + ret = devm_mfd_add_devices(bd718xx->chip.dev, PLATFORM_DEVID_AUTO, bd718xx_mfd_cells, ARRAY_SIZE(bd718xx_mfd_cells), NULL, 0, regmap_irq_get_domain(bd718xx->irq_data)); @@ -162,11 +163,11 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, static const struct of_device_id bd718xx_of_match[] = { { .compatible = "rohm,bd71837", - .data = (void *)BD718XX_TYPE_BD71837, + .data = (void *)ROHM_CHIP_TYPE_BD71837, }, { .compatible = "rohm,bd71847", - .data = (void *)BD718XX_TYPE_BD71847, + .data = (void *)ROHM_CHIP_TYPE_BD71847, }, { } }; diff --git a/drivers/regulator/bd718x7-regulator.c b/drivers/regulator/bd718x7-regulator.c index fde4264da6ff..ef2fc175a9ae 100644 --- a/drivers/regulator/bd718x7-regulator.c +++ b/drivers/regulator/bd718x7-regulator.c @@ -1152,12 +1152,12 @@ static int bd718xx_probe(struct platform_device *pdev) { struct bd718xx *mfd; struct regulator_config config = { 0 }; - struct bd718xx_pmic_inits pmic_regulators[] = { - [BD718XX_TYPE_BD71837] = { + struct bd718xx_pmic_inits pmic_regulators[ROHM_CHIP_TYPE_AMOUNT] = { + [ROHM_CHIP_TYPE_BD71837] = { .r_datas = bd71837_regulators, .r_amount = ARRAY_SIZE(bd71837_regulators), }, - [BD718XX_TYPE_BD71847] = { + [ROHM_CHIP_TYPE_BD71847] = { .r_datas = bd71847_regulators, .r_amount = ARRAY_SIZE(bd71847_regulators), }, @@ -1173,15 +1173,15 @@ static int bd718xx_probe(struct platform_device *pdev) goto err; } - if (mfd->chip_type >= BD718XX_TYPE_AMOUNT || - !pmic_regulators[mfd->chip_type].r_datas) { + if (mfd->chip.chip_type >= ROHM_CHIP_TYPE_AMOUNT || + !pmic_regulators[mfd->chip.chip_type].r_datas) { dev_err(&pdev->dev, "Unsupported chip type\n"); err = -EINVAL; goto err; } /* Register LOCK release */ - err = regmap_update_bits(mfd->regmap, BD718XX_REG_REGLOCK, + err = regmap_update_bits(mfd->chip.regmap, BD718XX_REG_REGLOCK, (REGLOCK_PWRSEQ | REGLOCK_VREG), 0); if (err) { dev_err(&pdev->dev, "Failed to unlock PMIC (%d)\n", err); @@ -1200,7 +1200,8 @@ static int bd718xx_probe(struct platform_device *pdev) * bit allowing HW defaults for power rails to be used */ if (!use_snvs) { - err = regmap_update_bits(mfd->regmap, BD718XX_REG_TRANS_COND1, + err = regmap_update_bits(mfd->chip.regmap, + BD718XX_REG_TRANS_COND1, BD718XX_ON_REQ_POWEROFF_MASK | BD718XX_SWRESET_POWEROFF_MASK | BD718XX_WDOG_POWEROFF_MASK | @@ -1215,17 +1216,17 @@ static int bd718xx_probe(struct platform_device *pdev) } } - for (i = 0; i < pmic_regulators[mfd->chip_type].r_amount; i++) { + for (i = 0; i < pmic_regulators[mfd->chip.chip_type].r_amount; i++) { const struct regulator_desc *desc; struct regulator_dev *rdev; const struct bd718xx_regulator_data *r; - r = &pmic_regulators[mfd->chip_type].r_datas[i]; + r = &pmic_regulators[mfd->chip.chip_type].r_datas[i]; desc = &r->desc; config.dev = pdev->dev.parent; - config.regmap = mfd->regmap; + config.regmap = mfd->chip.regmap; rdev = 
devm_regulator_register(&pdev->dev, desc, &config); if (IS_ERR(rdev)) { @@ -1254,7 +1255,7 @@ static int bd718xx_probe(struct platform_device *pdev) */ if (!use_snvs || !rdev->constraints->always_on || !rdev->constraints->boot_on) { - err = regmap_update_bits(mfd->regmap, r->init.reg, + err = regmap_update_bits(mfd->chip.regmap, r->init.reg, r->init.mask, r->init.val); if (err) { dev_err(&pdev->dev, @@ -1264,7 +1265,7 @@ static int bd718xx_probe(struct platform_device *pdev) } } for (j = 0; j < r->additional_init_amnt; j++) { - err = regmap_update_bits(mfd->regmap, + err = regmap_update_bits(mfd->chip.regmap, r->additional_inits[j].reg, r->additional_inits[j].mask, r->additional_inits[j].val); diff --git a/include/linux/mfd/rohm-bd718x7.h b/include/linux/mfd/rohm-bd718x7.h index fd194bfc836f..7f2dbde402a1 100644 --- a/include/linux/mfd/rohm-bd718x7.h +++ b/include/linux/mfd/rohm-bd718x7.h @@ -4,14 +4,9 @@ #ifndef __LINUX_MFD_BD718XX_H__ #define __LINUX_MFD_BD718XX_H__ +#include #include -enum { - BD718XX_TYPE_BD71837 = 0, - BD718XX_TYPE_BD71847, - BD718XX_TYPE_AMOUNT -}; - enum { BD718XX_BUCK1 = 0, BD718XX_BUCK2, @@ -321,18 +316,17 @@ enum { BD718XX_PWRBTN_LONG_PRESS_15S }; -struct bd718xx_clk; - struct bd718xx { - unsigned int chip_type; - struct device *dev; - struct regmap *regmap; - unsigned long int id; + /* + * Please keep this as the first member here as some + * drivers (clk) supporting more than one chip may only know this + * generic struct 'struct rohm_regmap_dev' and assume it is + * the first chunk of parent device's private data. + */ + struct rohm_regmap_dev chip; int chip_irq; struct regmap_irq_chip_data *irq_data; - - struct bd718xx_clk *clk; }; #endif /* __LINUX_MFD_BD718XX_H__ */ diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h new file mode 100644 index 000000000000..bff15ac26f2c --- /dev/null +++ b/include/linux/mfd/rohm-generic.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (C) 2018 ROHM Semiconductors */ + +#ifndef __LINUX_MFD_ROHM_H__ +#define __LINUX_MFD_ROHM_H__ + +enum { + ROHM_CHIP_TYPE_BD71837 = 0, + ROHM_CHIP_TYPE_BD71847, + ROHM_CHIP_TYPE_BD70528, + ROHM_CHIP_TYPE_AMOUNT +}; + +struct rohm_regmap_dev { + unsigned int chip_type; + struct device *dev; + struct regmap *regmap; +}; + +#endif -- cgit v1.2.3-59-g8ed1b From 21b7c58fc1943f3aa8c18a994ab9bed4ae5aa72d Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 3 Jun 2019 10:25:08 +0300 Subject: mfd: bd70528: Support ROHM bd70528 PMIC core ROHM BD70528MWV is an ultra-low quiescent current general purpose single-chip power management IC for battery-powered portable devices. Add MFD core which enables chip access for following subdevices: - regulators/LED drivers - battery-charger - gpios - 32.768kHz clk - RTC - watchdog Signed-off-by: Matti Vaittinen Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 17 ++ drivers/mfd/Makefile | 2 + drivers/mfd/rohm-bd70528.c | 316 ++++++++++++++++++++++++++++++ include/linux/mfd/rohm-bd70528.h | 408 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 743 insertions(+) create mode 100644 drivers/mfd/rohm-bd70528.c create mode 100644 include/linux/mfd/rohm-bd70528.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 294d9567cc71..11fc53d78c5f 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1890,6 +1890,23 @@ config MFD_ROHM_BD718XX NXP i.MX8. 
	  It contains 8 BUCK outputs and 7 LDOs, voltage monitoring and
	  emergency shut down as well as 32,768KHz clock output.
 
+config MFD_ROHM_BD70528
+	tristate "ROHM BD70528 Power Management IC"
+	depends on I2C=y
+	depends on OF
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	select MFD_CORE
+	help
+	  Select this option to get support for the ROHM BD70528 Power
+	  Management IC. BD70528 is a general purpose single-chip power
+	  management IC for battery-powered portable devices. It contains
+	  3 ultra-low current consumption buck converters, 3 LDOs and 2 LED
+	  drivers. Also included are 4 GPIOs, a real-time clock (RTC), a 32kHz
+	  crystal oscillator, high-accuracy VREF for use with an external ADC,
+	  a 10-bit SAR ADC for battery temperature monitoring and a 1S battery
+	  charger.
+
 config MFD_STM32_LPTIMER
	tristate "Support for STM32 Low-Power Timer"
	depends on (ARCH_STM32 && OF) || COMPILE_TEST
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 52b1a90ff515..643d65bcb6ea 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -247,5 +247,7 @@ obj-$(CONFIG_MFD_STM32_TIMERS) 	+= stm32-timers.o
 obj-$(CONFIG_MFD_MXS_LRADC)     += mxs-lradc.o
 obj-$(CONFIG_MFD_SC27XX_PMIC)	+= sprd-sc27xx-spi.o
 obj-$(CONFIG_RAVE_SP_CORE)	+= rave-sp.o
+obj-$(CONFIG_MFD_ROHM_BD70528)	+= rohm-bd70528.o
 obj-$(CONFIG_MFD_ROHM_BD718XX) += rohm-bd718x7.o
 obj-$(CONFIG_MFD_STMFX) 	+= stmfx.o
+
diff --git a/drivers/mfd/rohm-bd70528.c b/drivers/mfd/rohm-bd70528.c
new file mode 100644
index 000000000000..55599d5c5c86
--- /dev/null
+++ b/drivers/mfd/rohm-bd70528.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+//
+// Copyright (C) 2019 ROHM Semiconductors
+//
+// ROHM BD70528 PMIC driver
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define BD70528_NUM_OF_GPIOS 4
+
+static const struct resource rtc_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_RTC_ALARM, "bd70528-rtc-alm"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_ELPS_TIM, "bd70528-elapsed-timer"),
+};
+
+static const struct resource charger_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_OV_RES, "bd70528-bat-ov-res"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_OV_DET, "bd70528-bat-ov-det"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DBAT_DET, "bd70528-bat-dead"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_COLD_RES, "bd70528-bat-warmed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_COLD_DET, "bd70528-bat-cold"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_HOT_RES, "bd70528-bat-cooled"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_HOT_DET, "bd70528-bat-hot"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_CHG_TSD, "bd70528-chg-tshd"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_RMV, "bd70528-bat-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_DET, "bd70528-bat-detected"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_OV_RES, "bd70528-dcin2-ov-res"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_OV_DET, "bd70528-dcin2-ov-det"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_RMV, "bd70528-dcin2-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_DET, "bd70528-dcin2-detected"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN1_RMV, "bd70528-dcin1-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN1_DET, "bd70528-dcin1-detected"),
+};
+
+static struct mfd_cell bd70528_mfd_cells[] = {
+	{ .name = "bd70528-pmic", },
+	{ .name = "bd70528-gpio", },
+	/*
+	 * We use BD71837 driver to drive the clock block. Only differences to
+	 * BD70528 clock gate are the register address and mask.
+ */ + { .name = "bd718xx-clk", }, + { .name = "bd70528-wdt", }, + { + .name = "bd70528-power", + .resources = charger_irqs, + .num_resources = ARRAY_SIZE(charger_irqs), + }, { + .name = "bd70528-rtc", + .resources = rtc_irqs, + .num_resources = ARRAY_SIZE(rtc_irqs), + }, +}; + +static const struct regmap_range volatile_ranges[] = { + { + .range_min = BD70528_REG_INT_MAIN, + .range_max = BD70528_REG_INT_OP_FAIL, + }, { + .range_min = BD70528_REG_RTC_COUNT_H, + .range_max = BD70528_REG_RTC_ALM_REPEAT, + }, { + /* + * WDT control reg is special. Magic values must be written to + * it in order to change the control. Should not be cached. + */ + .range_min = BD70528_REG_WDT_CTRL, + .range_max = BD70528_REG_WDT_CTRL, + }, { + /* + * BD70528 also contains a few other registers which require + * magic sequences to be written in order to update the value. + * At least SHIPMODE, HWRESET, WARMRESET,and STANDBY + */ + .range_min = BD70528_REG_SHIPMODE, + .range_max = BD70528_REG_STANDBY, + }, +}; + +static const struct regmap_access_table volatile_regs = { + .yes_ranges = &volatile_ranges[0], + .n_yes_ranges = ARRAY_SIZE(volatile_ranges), +}; + +static struct regmap_config bd70528_regmap = { + .reg_bits = 8, + .val_bits = 8, + .volatile_table = &volatile_regs, + .max_register = BD70528_MAX_REGISTER, + .cache_type = REGCACHE_RBTREE, +}; + +/* + * Mapping of main IRQ register bits to sub-IRQ register offsets so that we can + * access corect sub-IRQ registers based on bits that are set in main IRQ + * register. + */ + +/* bit [0] - Shutdown register */ +unsigned int bit0_offsets[] = {0}; /* Shutdown register */ +unsigned int bit1_offsets[] = {1}; /* Power failure register */ +unsigned int bit2_offsets[] = {2}; /* VR FAULT register */ +unsigned int bit3_offsets[] = {3}; /* PMU register interrupts */ +unsigned int bit4_offsets[] = {4, 5}; /* Charger 1 and Charger 2 registers */ +unsigned int bit5_offsets[] = {6}; /* RTC register */ +unsigned int bit6_offsets[] = {7}; /* GPIO register */ +unsigned int bit7_offsets[] = {8}; /* Invalid operation register */ + +static struct regmap_irq_sub_irq_map bd70528_sub_irq_offsets[] = { + REGMAP_IRQ_MAIN_REG_OFFSET(bit0_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit1_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit2_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit3_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit4_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit5_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit6_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit7_offsets), +}; + +static struct regmap_irq bd70528_irqs[] = { + REGMAP_IRQ_REG(BD70528_INT_LONGPUSH, 0, BD70528_INT_LONGPUSH_MASK), + REGMAP_IRQ_REG(BD70528_INT_WDT, 0, BD70528_INT_WDT_MASK), + REGMAP_IRQ_REG(BD70528_INT_HWRESET, 0, BD70528_INT_HWRESET_MASK), + REGMAP_IRQ_REG(BD70528_INT_RSTB_FAULT, 0, BD70528_INT_RSTB_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_VBAT_UVLO, 0, BD70528_INT_VBAT_UVLO_MASK), + REGMAP_IRQ_REG(BD70528_INT_TSD, 0, BD70528_INT_TSD_MASK), + REGMAP_IRQ_REG(BD70528_INT_RSTIN, 0, BD70528_INT_RSTIN_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK1_FAULT, 1, + BD70528_INT_BUCK1_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK2_FAULT, 1, + BD70528_INT_BUCK2_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK3_FAULT, 1, + BD70528_INT_BUCK3_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_LDO1_FAULT, 1, BD70528_INT_LDO1_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_LDO2_FAULT, 1, BD70528_INT_LDO2_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_LDO3_FAULT, 1, BD70528_INT_LDO3_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED1_FAULT, 1, BD70528_INT_LED1_FAULT_MASK), + 
REGMAP_IRQ_REG(BD70528_INT_LED2_FAULT, 1, BD70528_INT_LED2_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK1_OCP, 2, BD70528_INT_BUCK1_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK2_OCP, 2, BD70528_INT_BUCK2_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK3_OCP, 2, BD70528_INT_BUCK3_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED1_OCP, 2, BD70528_INT_LED1_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED2_OCP, 2, BD70528_INT_LED2_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK1_FULLON, 2, + BD70528_INT_BUCK1_FULLON_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK2_FULLON, 2, + BD70528_INT_BUCK2_FULLON_MASK), + REGMAP_IRQ_REG(BD70528_INT_SHORTPUSH, 3, BD70528_INT_SHORTPUSH_MASK), + REGMAP_IRQ_REG(BD70528_INT_AUTO_WAKEUP, 3, + BD70528_INT_AUTO_WAKEUP_MASK), + REGMAP_IRQ_REG(BD70528_INT_STATE_CHANGE, 3, + BD70528_INT_STATE_CHANGE_MASK), + REGMAP_IRQ_REG(BD70528_INT_BAT_OV_RES, 4, BD70528_INT_BAT_OV_RES_MASK), + REGMAP_IRQ_REG(BD70528_INT_BAT_OV_DET, 4, BD70528_INT_BAT_OV_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_DBAT_DET, 4, BD70528_INT_DBAT_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_BATTSD_COLD_RES, 4, + BD70528_INT_BATTSD_COLD_RES_MASK), + REGMAP_IRQ_REG(BD70528_INT_BATTSD_COLD_DET, 4, + BD70528_INT_BATTSD_COLD_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_BATTSD_HOT_RES, 4, + BD70528_INT_BATTSD_HOT_RES_MASK), + REGMAP_IRQ_REG(BD70528_INT_BATTSD_HOT_DET, 4, + BD70528_INT_BATTSD_HOT_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_CHG_TSD, 4, BD70528_INT_CHG_TSD_MASK), + REGMAP_IRQ_REG(BD70528_INT_BAT_RMV, 5, BD70528_INT_BAT_RMV_MASK), + REGMAP_IRQ_REG(BD70528_INT_BAT_DET, 5, BD70528_INT_BAT_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN2_OV_RES, 5, + BD70528_INT_DCIN2_OV_RES_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN2_OV_DET, 5, + BD70528_INT_DCIN2_OV_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN2_RMV, 5, BD70528_INT_DCIN2_RMV_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN2_DET, 5, BD70528_INT_DCIN2_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN1_RMV, 5, BD70528_INT_DCIN1_RMV_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN1_DET, 5, BD70528_INT_DCIN1_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_RTC_ALARM, 6, BD70528_INT_RTC_ALARM_MASK), + REGMAP_IRQ_REG(BD70528_INT_ELPS_TIM, 6, BD70528_INT_ELPS_TIM_MASK), + REGMAP_IRQ_REG(BD70528_INT_GPIO0, 7, BD70528_INT_GPIO0_MASK), + REGMAP_IRQ_REG(BD70528_INT_GPIO1, 7, BD70528_INT_GPIO1_MASK), + REGMAP_IRQ_REG(BD70528_INT_GPIO2, 7, BD70528_INT_GPIO2_MASK), + REGMAP_IRQ_REG(BD70528_INT_GPIO3, 7, BD70528_INT_GPIO3_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK1_DVS_OPFAIL, 8, + BD70528_INT_BUCK1_DVS_OPFAIL_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK2_DVS_OPFAIL, 8, + BD70528_INT_BUCK2_DVS_OPFAIL_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK3_DVS_OPFAIL, 8, + BD70528_INT_BUCK3_DVS_OPFAIL_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED1_VOLT_OPFAIL, 8, + BD70528_INT_LED1_VOLT_OPFAIL_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED2_VOLT_OPFAIL, 8, + BD70528_INT_LED2_VOLT_OPFAIL_MASK), +}; + +static struct regmap_irq_chip bd70528_irq_chip = { + .name = "bd70528_irq", + .main_status = BD70528_REG_INT_MAIN, + .irqs = &bd70528_irqs[0], + .num_irqs = ARRAY_SIZE(bd70528_irqs), + .status_base = BD70528_REG_INT_SHDN, + .mask_base = BD70528_REG_INT_SHDN_MASK, + .ack_base = BD70528_REG_INT_SHDN, + .type_base = BD70528_REG_GPIO1_IN, + .init_ack_masked = true, + .num_regs = 9, + .num_main_regs = 1, + .num_type_reg = 4, + .sub_reg_offsets = &bd70528_sub_irq_offsets[0], + .num_main_status_bits = 8, + .irq_reg_stride = 1, +}; + +static int bd70528_i2c_probe(struct i2c_client *i2c, + const struct i2c_device_id *id) +{ + struct bd70528_data *bd70528; + struct 
regmap_irq_chip_data *irq_data; + int ret, i; + + if (!i2c->irq) { + dev_err(&i2c->dev, "No IRQ configured\n"); + return -EINVAL; + } + + bd70528 = devm_kzalloc(&i2c->dev, sizeof(*bd70528), GFP_KERNEL); + if (!bd70528) + return -ENOMEM; + + mutex_init(&bd70528->rtc_timer_lock); + + dev_set_drvdata(&i2c->dev, &bd70528->chip); + + bd70528->chip.chip_type = ROHM_CHIP_TYPE_BD70528; + bd70528->chip.regmap = devm_regmap_init_i2c(i2c, &bd70528_regmap); + if (IS_ERR(bd70528->chip.regmap)) { + dev_err(&i2c->dev, "Failed to initialize Regmap\n"); + return PTR_ERR(bd70528->chip.regmap); + } + + /* + * Disallow type setting for all IRQs by default as most of them do not + * support setting type. + */ + for (i = 0; i < ARRAY_SIZE(bd70528_irqs); i++) + bd70528_irqs[i].type.types_supported = 0; + + /* Set IRQ typesetting information for GPIO pins 0 - 3 */ + for (i = 0; i < BD70528_NUM_OF_GPIOS; i++) { + struct regmap_irq_type *type; + + type = &bd70528_irqs[BD70528_INT_GPIO0 + i].type; + type->type_reg_offset = 2 * i; + type->type_rising_val = 0x20; + type->type_falling_val = 0x10; + type->type_level_high_val = 0x40; + type->type_level_low_val = 0x50; + type->types_supported = (IRQ_TYPE_EDGE_BOTH | + IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW); + } + + ret = devm_regmap_add_irq_chip(&i2c->dev, bd70528->chip.regmap, + i2c->irq, IRQF_ONESHOT, 0, + &bd70528_irq_chip, &irq_data); + if (ret) { + dev_err(&i2c->dev, "Failed to add IRQ chip\n"); + return ret; + } + dev_dbg(&i2c->dev, "Registered %d IRQs for chip\n", + bd70528_irq_chip.num_irqs); + + /* + * BD70528 IRQ controller is not touching the main mask register. + * So enable the GPIO block interrupts at main level. We can just leave + * them enabled as the IRQ controller should disable IRQs from + * sub-registers when IRQ is disabled or freed. 
+ */ + ret = regmap_update_bits(bd70528->chip.regmap, + BD70528_REG_INT_MAIN_MASK, + BD70528_INT_GPIO_MASK, 0); + + ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO, + bd70528_mfd_cells, + ARRAY_SIZE(bd70528_mfd_cells), NULL, 0, + regmap_irq_get_domain(irq_data)); + if (ret) + dev_err(&i2c->dev, "Failed to create subdevices\n"); + + return ret; +} + +static const struct of_device_id bd70528_of_match[] = { + { .compatible = "rohm,bd70528", }, + { }, +}; +MODULE_DEVICE_TABLE(of, bd70528_of_match); + +static struct i2c_driver bd70528_drv = { + .driver = { + .name = "rohm-bd70528", + .of_match_table = bd70528_of_match, + }, + .probe = &bd70528_i2c_probe, +}; + +module_i2c_driver(bd70528_drv); + +MODULE_AUTHOR("Matti Vaittinen "); +MODULE_DESCRIPTION("ROHM BD70528 Power Management IC driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/rohm-bd70528.h b/include/linux/mfd/rohm-bd70528.h new file mode 100644 index 000000000000..1013e60c5b25 --- /dev/null +++ b/include/linux/mfd/rohm-bd70528.h @@ -0,0 +1,408 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (C) 2018 ROHM Semiconductors */ + +#ifndef __LINUX_MFD_BD70528_H__ +#define __LINUX_MFD_BD70528_H__ + +#include +#include +#include +#include + +enum { + BD70528_BUCK1, + BD70528_BUCK2, + BD70528_BUCK3, + BD70528_LDO1, + BD70528_LDO2, + BD70528_LDO3, + BD70528_LED1, + BD70528_LED2, +}; + +struct bd70528_data { + struct rohm_regmap_dev chip; + struct mutex rtc_timer_lock; +}; + +#define BD70528_BUCK_VOLTS 17 +#define BD70528_BUCK_VOLTS 17 +#define BD70528_BUCK_VOLTS 17 +#define BD70528_LDO_VOLTS 0x20 + +#define BD70528_REG_BUCK1_EN 0x0F +#define BD70528_REG_BUCK1_VOLT 0x15 +#define BD70528_REG_BUCK2_EN 0x10 +#define BD70528_REG_BUCK2_VOLT 0x16 +#define BD70528_REG_BUCK3_EN 0x11 +#define BD70528_REG_BUCK3_VOLT 0x17 +#define BD70528_REG_LDO1_EN 0x1b +#define BD70528_REG_LDO1_VOLT 0x1e +#define BD70528_REG_LDO2_EN 0x1c +#define BD70528_REG_LDO2_VOLT 0x1f +#define BD70528_REG_LDO3_EN 0x1d +#define BD70528_REG_LDO3_VOLT 0x20 +#define BD70528_REG_LED_CTRL 0x2b +#define BD70528_REG_LED_VOLT 0x29 +#define BD70528_REG_LED_EN 0x2a + +/* main irq registers */ +#define BD70528_REG_INT_MAIN 0x7E +#define BD70528_REG_INT_MAIN_MASK 0x74 + +/* 'sub irq' registers */ +#define BD70528_REG_INT_SHDN 0x7F +#define BD70528_REG_INT_PWR_FLT 0x80 +#define BD70528_REG_INT_VR_FLT 0x81 +#define BD70528_REG_INT_MISC 0x82 +#define BD70528_REG_INT_BAT1 0x83 +#define BD70528_REG_INT_BAT2 0x84 +#define BD70528_REG_INT_RTC 0x85 +#define BD70528_REG_INT_GPIO 0x86 +#define BD70528_REG_INT_OP_FAIL 0x87 + +#define BD70528_REG_INT_SHDN_MASK 0x75 +#define BD70528_REG_INT_PWR_FLT_MASK 0x76 +#define BD70528_REG_INT_VR_FLT_MASK 0x77 +#define BD70528_REG_INT_MISC_MASK 0x78 +#define BD70528_REG_INT_BAT1_MASK 0x79 +#define BD70528_REG_INT_BAT2_MASK 0x7a +#define BD70528_REG_INT_RTC_MASK 0x7b +#define BD70528_REG_INT_GPIO_MASK 0x7c +#define BD70528_REG_INT_OP_FAIL_MASK 0x7d + +/* Reset related 'magic' registers */ +#define BD70528_REG_SHIPMODE 0x03 +#define BD70528_REG_HWRESET 0x04 +#define BD70528_REG_WARMRESET 0x05 +#define BD70528_REG_STANDBY 0x06 + +/* GPIO registers */ +#define BD70528_REG_GPIO_STATE 0x8F + +#define BD70528_REG_GPIO1_IN 0x4d +#define BD70528_REG_GPIO2_IN 0x4f +#define BD70528_REG_GPIO3_IN 0x51 +#define BD70528_REG_GPIO4_IN 0x53 +#define BD70528_REG_GPIO1_OUT 0x4e +#define BD70528_REG_GPIO2_OUT 0x50 +#define BD70528_REG_GPIO3_OUT 0x52 +#define BD70528_REG_GPIO4_OUT 0x54 + +/* clk control */ + +#define BD70528_REG_CLK_OUT 0x2c + +/* RTC 
*/ + +#define BD70528_REG_RTC_COUNT_H 0x2d +#define BD70528_REG_RTC_COUNT_L 0x2e +#define BD70528_REG_RTC_SEC 0x2f +#define BD70528_REG_RTC_MINUTE 0x30 +#define BD70528_REG_RTC_HOUR 0x31 +#define BD70528_REG_RTC_WEEK 0x32 +#define BD70528_REG_RTC_DAY 0x33 +#define BD70528_REG_RTC_MONTH 0x34 +#define BD70528_REG_RTC_YEAR 0x35 + +#define BD70528_REG_RTC_ALM_SEC 0x36 +#define BD70528_REG_RTC_ALM_START BD70528_REG_RTC_ALM_SEC +#define BD70528_REG_RTC_ALM_MINUTE 0x37 +#define BD70528_REG_RTC_ALM_HOUR 0x38 +#define BD70528_REG_RTC_ALM_WEEK 0x39 +#define BD70528_REG_RTC_ALM_DAY 0x3a +#define BD70528_REG_RTC_ALM_MONTH 0x3b +#define BD70528_REG_RTC_ALM_YEAR 0x3c +#define BD70528_REG_RTC_ALM_MASK 0x3d +#define BD70528_REG_RTC_ALM_REPEAT 0x3e +#define BD70528_REG_RTC_START BD70528_REG_RTC_SEC + +#define BD70528_REG_RTC_WAKE_SEC 0x43 +#define BD70528_REG_RTC_WAKE_START BD70528_REG_RTC_WAKE_SEC +#define BD70528_REG_RTC_WAKE_MIN 0x44 +#define BD70528_REG_RTC_WAKE_HOUR 0x45 +#define BD70528_REG_RTC_WAKE_CTRL 0x46 + +#define BD70528_REG_ELAPSED_TIMER_EN 0x42 +#define BD70528_REG_WAKE_EN 0x46 + +/* WDT registers */ +#define BD70528_REG_WDT_CTRL 0x4A +#define BD70528_REG_WDT_HOUR 0x49 +#define BD70528_REG_WDT_MINUTE 0x48 +#define BD70528_REG_WDT_SEC 0x47 + +/* Charger / Battery */ +#define BD70528_REG_CHG_CURR_STAT 0x59 +#define BD70528_REG_CHG_BAT_STAT 0x57 +#define BD70528_REG_CHG_BAT_TEMP 0x58 +#define BD70528_REG_CHG_IN_STAT 0x56 +#define BD70528_REG_CHG_DCIN_ILIM 0x5d +#define BD70528_REG_CHG_CHG_CURR_WARM 0x61 +#define BD70528_REG_CHG_CHG_CURR_COLD 0x62 + +/* Masks for main IRQ register bits */ +enum { + BD70528_INT_SHDN, +#define BD70528_INT_SHDN_MASK BIT(BD70528_INT_SHDN) + BD70528_INT_PWR_FLT, +#define BD70528_INT_PWR_FLT_MASK BIT(BD70528_INT_PWR_FLT) + BD70528_INT_VR_FLT, +#define BD70528_INT_VR_FLT_MASK BIT(BD70528_INT_VR_FLT) + BD70528_INT_MISC, +#define BD70528_INT_MISC_MASK BIT(BD70528_INT_MISC) + BD70528_INT_BAT1, +#define BD70528_INT_BAT1_MASK BIT(BD70528_INT_BAT1) + BD70528_INT_RTC, +#define BD70528_INT_RTC_MASK BIT(BD70528_INT_RTC) + BD70528_INT_GPIO, +#define BD70528_INT_GPIO_MASK BIT(BD70528_INT_GPIO) + BD70528_INT_OP_FAIL, +#define BD70528_INT_OP_FAIL_MASK BIT(BD70528_INT_OP_FAIL) +}; + +/* IRQs */ +enum { + /* Shutdown register IRQs */ + BD70528_INT_LONGPUSH, + BD70528_INT_WDT, + BD70528_INT_HWRESET, + BD70528_INT_RSTB_FAULT, + BD70528_INT_VBAT_UVLO, + BD70528_INT_TSD, + BD70528_INT_RSTIN, + /* Power failure register IRQs */ + BD70528_INT_BUCK1_FAULT, + BD70528_INT_BUCK2_FAULT, + BD70528_INT_BUCK3_FAULT, + BD70528_INT_LDO1_FAULT, + BD70528_INT_LDO2_FAULT, + BD70528_INT_LDO3_FAULT, + BD70528_INT_LED1_FAULT, + BD70528_INT_LED2_FAULT, + /* VR FAULT register IRQs */ + BD70528_INT_BUCK1_OCP, + BD70528_INT_BUCK2_OCP, + BD70528_INT_BUCK3_OCP, + BD70528_INT_LED1_OCP, + BD70528_INT_LED2_OCP, + BD70528_INT_BUCK1_FULLON, + BD70528_INT_BUCK2_FULLON, + /* PMU register interrupts */ + BD70528_INT_SHORTPUSH, + BD70528_INT_AUTO_WAKEUP, + BD70528_INT_STATE_CHANGE, + /* Charger 1 register IRQs */ + BD70528_INT_BAT_OV_RES, + BD70528_INT_BAT_OV_DET, + BD70528_INT_DBAT_DET, + BD70528_INT_BATTSD_COLD_RES, + BD70528_INT_BATTSD_COLD_DET, + BD70528_INT_BATTSD_HOT_RES, + BD70528_INT_BATTSD_HOT_DET, + BD70528_INT_CHG_TSD, + /* Charger 2 register IRQs */ + BD70528_INT_BAT_RMV, + BD70528_INT_BAT_DET, + BD70528_INT_DCIN2_OV_RES, + BD70528_INT_DCIN2_OV_DET, + BD70528_INT_DCIN2_RMV, + BD70528_INT_DCIN2_DET, + BD70528_INT_DCIN1_RMV, + BD70528_INT_DCIN1_DET, + /* RTC register IRQs */ + BD70528_INT_RTC_ALARM, + 
BD70528_INT_ELPS_TIM, + /* GPIO register IRQs */ + BD70528_INT_GPIO0, + BD70528_INT_GPIO1, + BD70528_INT_GPIO2, + BD70528_INT_GPIO3, + /* Invalid operation register IRQs */ + BD70528_INT_BUCK1_DVS_OPFAIL, + BD70528_INT_BUCK2_DVS_OPFAIL, + BD70528_INT_BUCK3_DVS_OPFAIL, + BD70528_INT_LED1_VOLT_OPFAIL, + BD70528_INT_LED2_VOLT_OPFAIL, +}; + +/* Masks */ +#define BD70528_INT_LONGPUSH_MASK 0x1 +#define BD70528_INT_WDT_MASK 0x2 +#define BD70528_INT_HWRESET_MASK 0x4 +#define BD70528_INT_RSTB_FAULT_MASK 0x8 +#define BD70528_INT_VBAT_UVLO_MASK 0x10 +#define BD70528_INT_TSD_MASK 0x20 +#define BD70528_INT_RSTIN_MASK 0x40 + +#define BD70528_INT_BUCK1_FAULT_MASK 0x1 +#define BD70528_INT_BUCK2_FAULT_MASK 0x2 +#define BD70528_INT_BUCK3_FAULT_MASK 0x4 +#define BD70528_INT_LDO1_FAULT_MASK 0x8 +#define BD70528_INT_LDO2_FAULT_MASK 0x10 +#define BD70528_INT_LDO3_FAULT_MASK 0x20 +#define BD70528_INT_LED1_FAULT_MASK 0x40 +#define BD70528_INT_LED2_FAULT_MASK 0x80 + +#define BD70528_INT_BUCK1_OCP_MASK 0x1 +#define BD70528_INT_BUCK2_OCP_MASK 0x2 +#define BD70528_INT_BUCK3_OCP_MASK 0x4 +#define BD70528_INT_LED1_OCP_MASK 0x8 +#define BD70528_INT_LED2_OCP_MASK 0x10 +#define BD70528_INT_BUCK1_FULLON_MASK 0x20 +#define BD70528_INT_BUCK2_FULLON_MASK 0x40 + +#define BD70528_INT_SHORTPUSH_MASK 0x1 +#define BD70528_INT_AUTO_WAKEUP_MASK 0x2 +#define BD70528_INT_STATE_CHANGE_MASK 0x10 + +#define BD70528_INT_BAT_OV_RES_MASK 0x1 +#define BD70528_INT_BAT_OV_DET_MASK 0x2 +#define BD70528_INT_DBAT_DET_MASK 0x4 +#define BD70528_INT_BATTSD_COLD_RES_MASK 0x8 +#define BD70528_INT_BATTSD_COLD_DET_MASK 0x10 +#define BD70528_INT_BATTSD_HOT_RES_MASK 0x20 +#define BD70528_INT_BATTSD_HOT_DET_MASK 0x40 +#define BD70528_INT_CHG_TSD_MASK 0x80 + +#define BD70528_INT_BAT_RMV_MASK 0x1 +#define BD70528_INT_BAT_DET_MASK 0x2 +#define BD70528_INT_DCIN2_OV_RES_MASK 0x4 +#define BD70528_INT_DCIN2_OV_DET_MASK 0x8 +#define BD70528_INT_DCIN2_RMV_MASK 0x10 +#define BD70528_INT_DCIN2_DET_MASK 0x20 +#define BD70528_INT_DCIN1_RMV_MASK 0x40 +#define BD70528_INT_DCIN1_DET_MASK 0x80 + +#define BD70528_INT_RTC_ALARM_MASK 0x1 +#define BD70528_INT_ELPS_TIM_MASK 0x2 + +#define BD70528_INT_GPIO0_MASK 0x1 +#define BD70528_INT_GPIO1_MASK 0x2 +#define BD70528_INT_GPIO2_MASK 0x4 +#define BD70528_INT_GPIO3_MASK 0x8 + +#define BD70528_INT_BUCK1_DVS_OPFAIL_MASK 0x1 +#define BD70528_INT_BUCK2_DVS_OPFAIL_MASK 0x2 +#define BD70528_INT_BUCK3_DVS_OPFAIL_MASK 0x4 +#define BD70528_INT_LED1_VOLT_OPFAIL_MASK 0x10 +#define BD70528_INT_LED2_VOLT_OPFAIL_MASK 0x20 + +#define BD70528_DEBOUNCE_MASK 0x3 + +#define BD70528_DEBOUNCE_DISABLE 0 +#define BD70528_DEBOUNCE_15MS 1 +#define BD70528_DEBOUNCE_30MS 2 +#define BD70528_DEBOUNCE_50MS 3 + +#define BD70528_GPIO_DRIVE_MASK 0x2 +#define BD70528_GPIO_PUSH_PULL 0x0 +#define BD70528_GPIO_OPEN_DRAIN 0x2 + +#define BD70528_GPIO_OUT_EN_MASK 0x80 +#define BD70528_GPIO_OUT_ENABLE 0x80 +#define BD70528_GPIO_OUT_DISABLE 0x0 + +#define BD70528_GPIO_OUT_HI 0x1 +#define BD70528_GPIO_OUT_LO 0x0 +#define BD70528_GPIO_OUT_MASK 0x1 + +#define BD70528_GPIO_IN_STATE_BASE 1 + +#define BD70528_CLK_OUT_EN_MASK 0x1 + +/* RTC masks to mask out reserved bits */ + +#define BD70528_MASK_RTC_SEC 0x7f +#define BD70528_MASK_RTC_MINUTE 0x7f +#define BD70528_MASK_RTC_HOUR_24H 0x80 +#define BD70528_MASK_RTC_HOUR_PM 0x20 +#define BD70528_MASK_RTC_HOUR 0x1f +#define BD70528_MASK_RTC_DAY 0x3f +#define BD70528_MASK_RTC_WEEK 0x07 +#define BD70528_MASK_RTC_MONTH 0x1f +#define BD70528_MASK_RTC_YEAR 0xff +#define BD70528_MASK_RTC_COUNT_L 0x7f + +#define 
BD70528_MASK_ELAPSED_TIMER_EN 0x1 +/* Mask second, min and hour fields + * HW would support ALM irq for over 24h + * (by setting day, month and year too) + * but as we wish to keep this same as for + * wake-up we limit ALM to 24H and only + * unmask sec, min and hour + */ +#define BD70528_MASK_ALM_EN 0x7 +#define BD70528_MASK_WAKE_EN 0x1 + +/* WDT masks */ +#define BD70528_MASK_WDT_EN 0x1 +#define BD70528_MASK_WDT_HOUR 0x1 +#define BD70528_MASK_WDT_MINUTE 0x7f +#define BD70528_MASK_WDT_SEC 0x7f + +#define BD70528_WDT_STATE_BIT 0x1 +#define BD70528_ELAPSED_STATE_BIT 0x2 +#define BD70528_WAKE_STATE_BIT 0x4 + +/* Charger masks */ +#define BD70528_MASK_CHG_STAT 0x7f +#define BD70528_MASK_CHG_BAT_TIMER 0x20 +#define BD70528_MASK_CHG_BAT_OVERVOLT 0x10 +#define BD70528_MASK_CHG_BAT_DETECT 0x1 +#define BD70528_MASK_CHG_DCIN1_UVLO 0x1 +#define BD70528_MASK_CHG_DCIN_ILIM 0x3f +#define BD70528_MASK_CHG_CHG_CURR 0x1f +#define BD70528_MASK_CHG_TRICKLE_CURR 0x10 + +/* + * Note, external battery register is the lonely rider at + * address 0xc5. See how to stuff that in the regmap + */ +#define BD70528_MAX_REGISTER 0x94 + +/* Buck control masks */ +#define BD70528_MASK_RUN_EN 0x4 +#define BD70528_MASK_STBY_EN 0x2 +#define BD70528_MASK_IDLE_EN 0x1 +#define BD70528_MASK_LED1_EN 0x1 +#define BD70528_MASK_LED2_EN 0x10 + +#define BD70528_MASK_BUCK_VOLT 0xf +#define BD70528_MASK_LDO_VOLT 0x1f +#define BD70528_MASK_LED1_VOLT 0x1 +#define BD70528_MASK_LED2_VOLT 0x10 + +/* Misc irq masks */ +#define BD70528_INT_MASK_SHORT_PUSH 1 +#define BD70528_INT_MASK_AUTO_WAKE 2 +#define BD70528_INT_MASK_POWER_STATE 4 + +#define BD70528_MASK_BUCK_RAMP 0x10 +#define BD70528_SIFT_BUCK_RAMP 4 + +#if IS_ENABLED(CONFIG_BD70528_WATCHDOG) + +int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, int *old_state); +void bd70528_wdt_lock(struct rohm_regmap_dev *data); +void bd70528_wdt_unlock(struct rohm_regmap_dev *data); + +#else /* CONFIG_BD70528_WATCHDOG */ + +static inline int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, + int *old_state) +{ + return 0; +} + +static inline void bd70528_wdt_lock(struct rohm_regmap_dev *data) +{ +} + +static inline void bd70528_wdt_unlock(struct rohm_regmap_dev *data) +{ +} + +#endif /* CONFIG_BD70528_WATCHDOG */ + +#endif /* __LINUX_MFD_BD70528_H__ */ -- cgit v1.2.3-59-g8ed1b From 6bbe6f5732faeabb4bb583726ec2d7f9739532bd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 18:05:28 -0300 Subject: docs: thermal: convert to ReST Rename the thermal documentation files to ReST, add an index for them and adjust in order to produce a nice html output via the Sphinx build system. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. 
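Before moving on to the documentation series, the rohm_regmap_dev layout that
both ROHM patches above depend on is worth one illustration. The sketch below
is editorial, not part of any patch: bd70528_child_probe() is a hypothetical
sub-device probe, and it assumes only what the code above establishes, namely
that bd70528_i2c_probe() stores &bd70528->chip as the parent drvdata and that
'chip' is the first member of struct bd70528_data::

    /* Hedged sketch of a BD70528 sub-device probe (hypothetical). */
    #include <linux/mfd/rohm-bd70528.h>
    #include <linux/mutex.h>
    #include <linux/platform_device.h>
    #include <linux/regmap.h>

    static int bd70528_child_probe(struct platform_device *pdev)
    {
            /* bd70528_i2c_probe() stored &bd70528->chip as parent drvdata */
            struct rohm_regmap_dev *chip = dev_get_drvdata(pdev->dev.parent);
            struct bd70528_data *bd70528;

            if (!chip || chip->chip_type != ROHM_CHIP_TYPE_BD70528)
                    return -ENODEV;

            /*
             * A chip-agnostic child (the shared bd718xx-clk cell, say)
             * stops here and uses only chip->regmap. A chip-specific
             * child may recover the full private data; this container_of()
             * is valid only because 'chip' is the first member of
             * struct bd70528_data.
             */
            bd70528 = container_of(chip, struct bd70528_data, chip);

            mutex_lock(&bd70528->rtc_timer_lock);
            /* ... RTC/WDT coordination would go here ... */
            mutex_unlock(&bd70528->rtc_timer_lock);

            return 0;
    }

This is exactly why the comment in rohm-bd718x7.h insists the generic struct
stay first: casting the parent's private data to struct rohm_regmap_dev must
remain valid for children that know nothing about the concrete chip.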
Signed-off-by: Mauro Carvalho Chehab Acked-by: Zhang Rui Signed-off-by: Zhang Rui --- Documentation/thermal/cpu-cooling-api.rst | 107 +++ Documentation/thermal/cpu-cooling-api.txt | 92 --- Documentation/thermal/exynos_thermal | 77 -- Documentation/thermal/exynos_thermal.rst | 90 +++ Documentation/thermal/exynos_thermal_emulation | 53 -- Documentation/thermal/exynos_thermal_emulation.rst | 61 ++ Documentation/thermal/index.rst | 18 + Documentation/thermal/intel_powerclamp.rst | 320 +++++++++ Documentation/thermal/intel_powerclamp.txt | 317 -------- Documentation/thermal/nouveau_thermal | 82 --- Documentation/thermal/nouveau_thermal.rst | 96 +++ Documentation/thermal/power_allocator.rst | 271 +++++++ Documentation/thermal/power_allocator.txt | 247 ------- Documentation/thermal/sysfs-api.rst | 798 +++++++++++++++++++++ Documentation/thermal/sysfs-api.txt | 636 ---------------- Documentation/thermal/x86_pkg_temperature_thermal | 47 -- .../thermal/x86_pkg_temperature_thermal.rst | 55 ++ MAINTAINERS | 2 +- include/linux/thermal.h | 4 +- 19 files changed, 1819 insertions(+), 1554 deletions(-) create mode 100644 Documentation/thermal/cpu-cooling-api.rst delete mode 100644 Documentation/thermal/cpu-cooling-api.txt delete mode 100644 Documentation/thermal/exynos_thermal create mode 100644 Documentation/thermal/exynos_thermal.rst delete mode 100644 Documentation/thermal/exynos_thermal_emulation create mode 100644 Documentation/thermal/exynos_thermal_emulation.rst create mode 100644 Documentation/thermal/index.rst create mode 100644 Documentation/thermal/intel_powerclamp.rst delete mode 100644 Documentation/thermal/intel_powerclamp.txt delete mode 100644 Documentation/thermal/nouveau_thermal create mode 100644 Documentation/thermal/nouveau_thermal.rst create mode 100644 Documentation/thermal/power_allocator.rst delete mode 100644 Documentation/thermal/power_allocator.txt create mode 100644 Documentation/thermal/sysfs-api.rst delete mode 100644 Documentation/thermal/sysfs-api.txt delete mode 100644 Documentation/thermal/x86_pkg_temperature_thermal create mode 100644 Documentation/thermal/x86_pkg_temperature_thermal.rst (limited to 'include/linux') diff --git a/Documentation/thermal/cpu-cooling-api.rst b/Documentation/thermal/cpu-cooling-api.rst new file mode 100644 index 000000000000..645d914c45a6 --- /dev/null +++ b/Documentation/thermal/cpu-cooling-api.rst @@ -0,0 +1,107 @@ +======================= +CPU cooling APIs How To +======================= + +Written by Amit Daniel Kachhap + +Updated: 6 Jan 2015 + +Copyright (c) 2012 Samsung Electronics Co., Ltd(http://www.samsung.com) + +0. Introduction +=============== + +The generic cpu cooling(freq clipping) provides registration/unregistration APIs +to the caller. The binding of the cooling devices to the trip point is left for +the user. The registration APIs returns the cooling device pointer. + +1. cpu cooling APIs +=================== + +1.1 cpufreq registration/unregistration APIs +-------------------------------------------- + + :: + + struct thermal_cooling_device + *cpufreq_cooling_register(struct cpumask *clip_cpus) + + This interface function registers the cpufreq cooling device with the name + "thermal-cpufreq-%x". This api can support multiple instances of cpufreq + cooling devices. + + clip_cpus: + cpumask of cpus where the frequency constraints will happen. 
+ + :: + + struct thermal_cooling_device + *of_cpufreq_cooling_register(struct cpufreq_policy *policy) + + This interface function registers the cpufreq cooling device with + the name "thermal-cpufreq-%x" linking it with a device tree node, in + order to bind it via the thermal DT code. This api can support multiple + instances of cpufreq cooling devices. + + policy: + CPUFreq policy. + + + :: + + void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) + + This interface function unregisters the "thermal-cpufreq-%x" cooling device. + + cdev: Cooling device pointer which has to be unregistered. + +2. Power models +=============== + +The power API registration functions provide a simple power model for +CPUs. The current power is calculated as dynamic power (static power isn't +supported currently). This power model requires that the operating-points of +the CPUs are registered using the kernel's opp library and the +`cpufreq_frequency_table` is assigned to the `struct device` of the +cpu. If you are using CONFIG_CPUFREQ_DT then the +`cpufreq_frequency_table` should already be assigned to the cpu +device. + +The dynamic power consumption of a processor depends on many factors. +For a given processor implementation the primary factors are: + +- The time the processor spends running, consuming dynamic power, as + compared to the time in idle states where dynamic consumption is + negligible. Herein we refer to this as 'utilisation'. +- The voltage and frequency levels as a result of DVFS. The DVFS + level is a dominant factor governing power consumption. +- In running time the 'execution' behaviour (instruction types, memory + access patterns and so forth) causes, in most cases, a second order + variation. In pathological cases this variation can be significant, + but typically it is of a much lesser impact than the factors above. + +A high level dynamic power consumption model may then be represented as:: + + Pdyn = f(run) * Voltage^2 * Frequency * Utilisation + +f(run) here represents the described execution behaviour and its +result has a units of Watts/Hz/Volt^2 (this often expressed in +mW/MHz/uVolt^2) + +The detailed behaviour for f(run) could be modelled on-line. However, +in practice, such an on-line model has dependencies on a number of +implementation specific processor support and characterisation +factors. Therefore, in initial implementation that contribution is +represented as a constant coefficient. This is a simplification +consistent with the relative contribution to overall power variation. + +In this simplified representation our model becomes:: + + Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation + +Where `capacitance` is a constant that represents an indicative +running time dynamic power coefficient in fundamental units of +mW/MHz/uVolt^2. Typical values for mobile CPUs might lie in range +from 100 to 500. For reference, the approximate values for the SoC in +ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and +140 for the Cortex-A53 cluster. diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt deleted file mode 100644 index 7df567eaea1a..000000000000 --- a/Documentation/thermal/cpu-cooling-api.txt +++ /dev/null @@ -1,92 +0,0 @@ -CPU cooling APIs How To -=================================== - -Written by Amit Daniel Kachhap - -Updated: 6 Jan 2015 - -Copyright (c) 2012 Samsung Electronics Co., Ltd(http://www.samsung.com) - -0. 
Introduction - -The generic cpu cooling(freq clipping) provides registration/unregistration APIs -to the caller. The binding of the cooling devices to the trip point is left for -the user. The registration APIs returns the cooling device pointer. - -1. cpu cooling APIs - -1.1 cpufreq registration/unregistration APIs -1.1.1 struct thermal_cooling_device *cpufreq_cooling_register( - struct cpumask *clip_cpus) - - This interface function registers the cpufreq cooling device with the name - "thermal-cpufreq-%x". This api can support multiple instances of cpufreq - cooling devices. - - clip_cpus: cpumask of cpus where the frequency constraints will happen. - -1.1.2 struct thermal_cooling_device *of_cpufreq_cooling_register( - struct cpufreq_policy *policy) - - This interface function registers the cpufreq cooling device with - the name "thermal-cpufreq-%x" linking it with a device tree node, in - order to bind it via the thermal DT code. This api can support multiple - instances of cpufreq cooling devices. - - policy: CPUFreq policy. - -1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) - - This interface function unregisters the "thermal-cpufreq-%x" cooling device. - - cdev: Cooling device pointer which has to be unregistered. - -2. Power models - -The power API registration functions provide a simple power model for -CPUs. The current power is calculated as dynamic power (static power isn't -supported currently). This power model requires that the operating-points of -the CPUs are registered using the kernel's opp library and the -`cpufreq_frequency_table` is assigned to the `struct device` of the -cpu. If you are using CONFIG_CPUFREQ_DT then the -`cpufreq_frequency_table` should already be assigned to the cpu -device. - -The dynamic power consumption of a processor depends on many factors. -For a given processor implementation the primary factors are: - -- The time the processor spends running, consuming dynamic power, as - compared to the time in idle states where dynamic consumption is - negligible. Herein we refer to this as 'utilisation'. -- The voltage and frequency levels as a result of DVFS. The DVFS - level is a dominant factor governing power consumption. -- In running time the 'execution' behaviour (instruction types, memory - access patterns and so forth) causes, in most cases, a second order - variation. In pathological cases this variation can be significant, - but typically it is of a much lesser impact than the factors above. - -A high level dynamic power consumption model may then be represented as: - -Pdyn = f(run) * Voltage^2 * Frequency * Utilisation - -f(run) here represents the described execution behaviour and its -result has a units of Watts/Hz/Volt^2 (this often expressed in -mW/MHz/uVolt^2) - -The detailed behaviour for f(run) could be modelled on-line. However, -in practice, such an on-line model has dependencies on a number of -implementation specific processor support and characterisation -factors. Therefore, in initial implementation that contribution is -represented as a constant coefficient. This is a simplification -consistent with the relative contribution to overall power variation. - -In this simplified representation our model becomes: - -Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation - -Where `capacitance` is a constant that represents an indicative -running time dynamic power coefficient in fundamental units of -mW/MHz/uVolt^2. Typical values for mobile CPUs might lie in range -from 100 to 500. 
For reference, the approximate values for the SoC in -ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and -140 for the Cortex-A53 cluster. diff --git a/Documentation/thermal/exynos_thermal b/Documentation/thermal/exynos_thermal deleted file mode 100644 index 9010c4416967..000000000000 --- a/Documentation/thermal/exynos_thermal +++ /dev/null @@ -1,77 +0,0 @@ -Kernel driver exynos_tmu -================= - -Supported chips: -* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC - Datasheet: Not publicly available - -Authors: Donggeun Kim -Authors: Amit Daniel - -TMU controller Description: ---------------------------- - -This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC. - -The chip only exposes the measured 8-bit temperature code value -through a register. -Temperature can be taken from the temperature code. -There are three equations converting from temperature to temperature code. - -The three equations are: - 1. Two point trimming - Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1 - - 2. One point trimming - Tc = T + TI1 - 25 - - 3. No trimming - Tc = T + 50 - - Tc: Temperature code, T: Temperature, - TI1: Trimming info for 25 degree Celsius (stored at TRIMINFO register) - Temperature code measured at 25 degree Celsius which is unchanged - TI2: Trimming info for 85 degree Celsius (stored at TRIMINFO register) - Temperature code measured at 85 degree Celsius which is unchanged - -TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt -when temperature exceeds pre-defined levels. -The maximum number of configurable threshold is five. -The threshold levels are defined as follows: - Level_0: current temperature > trigger_level_0 + threshold - Level_1: current temperature > trigger_level_1 + threshold - Level_2: current temperature > trigger_level_2 + threshold - Level_3: current temperature > trigger_level_3 + threshold - - The threshold and each trigger_level are set - through the corresponding registers. - -When an interrupt occurs, this driver notify kernel thermal framework -with the function exynos_report_trigger. -Although an interrupt condition for level_0 can be set, -it can be used to synchronize the cooling action. - -TMU driver description: ------------------------ - -The exynos thermal driver is structured as, - - Kernel Core thermal framework - (thermal_core.c, step_wise.c, cpu_cooling.c) - ^ - | - | -TMU configuration data -------> TMU Driver <------> Exynos Core thermal wrapper -(exynos_tmu_data.c) (exynos_tmu.c) (exynos_thermal_common.c) -(exynos_tmu_data.h) (exynos_tmu.h) (exynos_thermal_common.h) - -a) TMU configuration data: This consist of TMU register offsets/bitfields - described through structure exynos_tmu_registers. Also several - other platform data (struct exynos_tmu_platform_data) members - are used to configure the TMU. -b) TMU driver: This component initialises the TMU controller and sets different - thresholds. It invokes core thermal implementation with the call - exynos_report_trigger. -c) Exynos Core thermal wrapper: This provides 3 wrapper function to use the - Kernel core thermal framework. They are exynos_unregister_thermal, - exynos_register_thermal and exynos_report_trigger. 
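An aside on the dynamic power model carried by both versions of
cpu-cooling-api above: the final formula is simple enough to sanity-check in
user space. This is a hedged sketch; the frequency, voltage and utilisation
figures are invented, the 530/140 coefficients are the Juno values quoted
above, and the unit scaling (coefficient x MHz x mV^2 / 10^9 yielding mW)
follows the convention used by the kernel's cpufreq cooling code rather than
anything the document itself fixes::

    /* Hedged sanity check of Pdyn = Capacitance * V^2 * f * utilisation. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t pdyn_mw(uint32_t coeff, uint32_t freq_mhz,
                            uint32_t volt_mv, double utilisation)
    {
            uint64_t p = (uint64_t)coeff * freq_mhz * volt_mv * volt_mv;

            /* coeff * MHz * mV^2 / 1e9 yields milliwatts */
            return (uint64_t)(utilisation * (double)p / 1e9);
    }

    int main(void)
    {
            /* Cortex-A57 cluster on Juno: coefficient 530 (from above) */
            printf("A57, 1100 MHz, 900 mV, 100%% busy: ~%llu mW\n",
                   (unsigned long long)pdyn_mw(530, 1100, 900, 1.0));
            /* Cortex-A53 cluster: coefficient 140 */
            printf("A53,  850 MHz, 800 mV,  50%% busy: ~%llu mW\n",
                   (unsigned long long)pdyn_mw(140, 850, 800, 0.5));
            return 0;
    }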
diff --git a/Documentation/thermal/exynos_thermal.rst b/Documentation/thermal/exynos_thermal.rst new file mode 100644 index 000000000000..5bd556566c70 --- /dev/null +++ b/Documentation/thermal/exynos_thermal.rst @@ -0,0 +1,90 @@ +======================== +Kernel driver exynos_tmu +======================== + +Supported chips: + +* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC + + Datasheet: Not publicly available + +Authors: Donggeun Kim +Authors: Amit Daniel + +TMU controller Description: +--------------------------- + +This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC. + +The chip only exposes the measured 8-bit temperature code value +through a register. +Temperature can be taken from the temperature code. +There are three equations converting from temperature to temperature code. + +The three equations are: + 1. Two point trimming:: + + Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1 + + 2. One point trimming:: + + Tc = T + TI1 - 25 + + 3. No trimming:: + + Tc = T + 50 + + Tc: + Temperature code, T: Temperature, + TI1: + Trimming info for 25 degree Celsius (stored at TRIMINFO register) + Temperature code measured at 25 degree Celsius which is unchanged + TI2: + Trimming info for 85 degree Celsius (stored at TRIMINFO register) + Temperature code measured at 85 degree Celsius which is unchanged + +TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt +when temperature exceeds pre-defined levels. +The maximum number of configurable threshold is five. +The threshold levels are defined as follows:: + + Level_0: current temperature > trigger_level_0 + threshold + Level_1: current temperature > trigger_level_1 + threshold + Level_2: current temperature > trigger_level_2 + threshold + Level_3: current temperature > trigger_level_3 + threshold + +The threshold and each trigger_level are set +through the corresponding registers. + +When an interrupt occurs, this driver notify kernel thermal framework +with the function exynos_report_trigger. +Although an interrupt condition for level_0 can be set, +it can be used to synchronize the cooling action. + +TMU driver description: +----------------------- + +The exynos thermal driver is structured as:: + + Kernel Core thermal framework + (thermal_core.c, step_wise.c, cpu_cooling.c) + ^ + | + | + TMU configuration data -----> TMU Driver <----> Exynos Core thermal wrapper + (exynos_tmu_data.c) (exynos_tmu.c) (exynos_thermal_common.c) + (exynos_tmu_data.h) (exynos_tmu.h) (exynos_thermal_common.h) + +a) TMU configuration data: + This consist of TMU register offsets/bitfields + described through structure exynos_tmu_registers. Also several + other platform data (struct exynos_tmu_platform_data) members + are used to configure the TMU. +b) TMU driver: + This component initialises the TMU controller and sets different + thresholds. It invokes core thermal implementation with the call + exynos_report_trigger. +c) Exynos Core thermal wrapper: + This provides 3 wrapper function to use the + Kernel core thermal framework. They are exynos_unregister_thermal, + exynos_register_thermal and exynos_report_trigger. 
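The trimming equations quoted in exynos_thermal.rst above convert between
temperature and the 8-bit temperature code in both directions. A small
stand-alone sketch follows; the TI1/TI2 trim values are invented for
illustration (real ones are fused into the TRIMINFO register)::

    /* Sketch of the TMU trimming equations; TI1/TI2 values are assumed. */
    #include <stdio.h>

    /* Two point trimming: Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1 */
    static int temp_to_code_two_point(int temp, int ti1, int ti2)
    {
            return (temp - 25) * (ti2 - ti1) / (85 - 25) + ti1;
    }

    /* ... and the inverse, used when reading the sensor back */
    static int code_to_temp_two_point(int code, int ti1, int ti2)
    {
            return (code - ti1) * (85 - 25) / (ti2 - ti1) + 25;
    }

    /* One point trimming: Tc = T + TI1 - 25 */
    static int temp_to_code_one_point(int temp, int ti1)
    {
            return temp + ti1 - 25;
    }

    int main(void)
    {
            int ti1 = 40, ti2 = 105; /* assumed trim codes at 25C / 85C */

            printf("60C -> code %d (two point)\n",
                   temp_to_code_two_point(60, ti1, ti2));
            printf("code 78 -> %dC (two point)\n",
                   code_to_temp_two_point(78, ti1, ti2));
            printf("60C -> code %d (one point)\n",
                   temp_to_code_one_point(60, ti1));
            return 0;
    }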
diff --git a/Documentation/thermal/exynos_thermal_emulation b/Documentation/thermal/exynos_thermal_emulation deleted file mode 100644 index b15efec6ca28..000000000000 --- a/Documentation/thermal/exynos_thermal_emulation +++ /dev/null @@ -1,53 +0,0 @@ -EXYNOS EMULATION MODE -======================== - -Copyright (C) 2012 Samsung Electronics - -Written by Jonghwa Lee - -Description ------------ - -Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal management unit. -Thermal emulation mode supports software debug for TMU's operation. User can set temperature -manually with software code and TMU will read current temperature from user value not from -sensor's value. - -Enabling CONFIG_THERMAL_EMULATION option will make this support available. -When it's enabled, sysfs node will be created as -/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp. - -The sysfs node, 'emul_node', will contain value 0 for the initial state. When you input any -temperature you want to update to sysfs node, it automatically enable emulation mode and -current temperature will be changed into it. -(Exynos also supports user changeable delay time which would be used to delay of - changing temperature. However, this node only uses same delay of real sensing time, 938us.) - -Exynos emulation mode requires synchronous of value changing and enabling. It means when you -want to update the any value of delay or next temperature, then you have to enable emulation -mode at the same time. (Or you have to keep the mode enabling.) If you don't, it fails to -change the value to updated one and just use last succeessful value repeatedly. That's why -this node gives users the right to change termerpature only. Just one interface makes it more -simply to use. - -Disabling emulation mode only requires writing value 0 to sysfs node. - - -TEMP 120 | - | - 100 | - | - 80 | - | +----------- - 60 | | | - | +-------------| | - 40 | | | | - | | | | - 20 | | | +---------- - | | | | | - 0 |______________|_____________|__________|__________|_________ - A A A A TIME - |<----->| |<----->| |<----->| | - | 938us | | | | | | -emulation : 0 50 | 70 | 20 | 0 -current temp : sensor 50 70 20 sensor diff --git a/Documentation/thermal/exynos_thermal_emulation.rst b/Documentation/thermal/exynos_thermal_emulation.rst new file mode 100644 index 000000000000..c21d10838bc5 --- /dev/null +++ b/Documentation/thermal/exynos_thermal_emulation.rst @@ -0,0 +1,61 @@ +===================== +Exynos Emulation Mode +===================== + +Copyright (C) 2012 Samsung Electronics + +Written by Jonghwa Lee + +Description +----------- + +Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal +management unit. Thermal emulation mode supports software debug for +TMU's operation. User can set temperature manually with software code +and TMU will read current temperature from user value not from sensor's +value. + +Enabling CONFIG_THERMAL_EMULATION option will make this support +available. When it's enabled, sysfs node will be created as +/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp. + +The sysfs node, 'emul_node', will contain value 0 for the initial state. +When you input any temperature you want to update to sysfs node, it +automatically enable emulation mode and current temperature will be +changed into it. + +(Exynos also supports user changeable delay time which would be used to +delay of changing temperature. However, this node only uses same delay +of real sensing time, 938us.) 
+
+Exynos emulation mode requires that value changes and mode enabling be
+synchronized. This means that when you want to update the delay or the
+next temperature, you have to enable emulation mode at the same time
+(or you have to keep the mode enabled). If you don't, the value fails
+to change to the updated one and the last successful value is used
+repeatedly. That's why this node only gives users the right to change
+the temperature; a single interface keeps it simple to use.
+
+Disabling emulation mode only requires writing value 0 to the sysfs node.
+
+::
+
+
+     TEMP 120 |
+              |
+          100 |
+              |
+           80 |
+              |                       +-----------
+           60 |                       |          |
+              |        +--------------|          |
+           40 |        |              |          |
+              |        |              |          |
+           20 |        |              |          +----------
+              |        |              |          |         |
+            0 |________|______________|__________|_________|_________
+                       A              A          A         A    TIME
+              |<----->|       |<----->|   |<----->|        |
+              | 938us |       |       |   |       |        |
+    emulation :   0       50  |  70       |  20   |    0
+    current temp: sensor    50          70      20       sensor
diff --git a/Documentation/thermal/index.rst b/Documentation/thermal/index.rst
new file mode 100644
index 000000000000..8c1c00146cad
--- /dev/null
+++ b/Documentation/thermal/index.rst
@@ -0,0 +1,18 @@
+:orphan:
+
+=======
+Thermal
+=======
+
+.. toctree::
+   :maxdepth: 1
+
+   cpu-cooling-api
+   sysfs-api
+   power_allocator
+
+   exynos_thermal
+   exynos_thermal_emulation
+   intel_powerclamp
+   nouveau_thermal
+   x86_pkg_temperature_thermal
diff --git a/Documentation/thermal/intel_powerclamp.rst b/Documentation/thermal/intel_powerclamp.rst
new file mode 100644
index 000000000000..3f6dfb0b3ea6
--- /dev/null
+++ b/Documentation/thermal/intel_powerclamp.rst
@@ -0,0 +1,320 @@
+=======================
+Intel Powerclamp Driver
+=======================
+
+By:
+  - Arjan van de Ven
+  - Jacob Pan
+
+.. Contents:
+
+   (*) Introduction
+       - Goals and Objectives
+
+   (*) Theory of Operation
+       - Idle Injection
+       - Calibration
+
+   (*) Performance Analysis
+       - Effectiveness and Limitations
+       - Power vs Performance
+       - Scalability
+       - Calibration
+       - Comparison with Alternative Techniques
+
+   (*) Usage and Interfaces
+       - Generic Thermal Layer (sysfs)
+       - Kernel APIs (TBD)
+
+INTRODUCTION
+============
+
+Consider the situation where a system’s power consumption must be
+reduced at runtime, due to power budget, thermal constraint, or noise
+level, and where active cooling is not preferred. Software-managed
+passive power reduction must be performed to prevent the hardware
+actions that are designed for catastrophic scenarios.
+
+Currently, P-states, T-states (clock modulation), and CPU offlining
+are used for CPU throttling.
+
+On Intel CPUs, C-states provide effective power reduction, but so far
+they’re only used opportunistically, based on workload. With the
+development of the intel_powerclamp driver, a method of synchronizing
+idle injection across all online CPU threads was introduced. The goal
+is to achieve forced and controllable C-state residency.
+
+Testing and analysis have been done in the areas of power, performance,
+scalability, and user experience. In many cases, clear advantage is
+shown over taking the CPU offline or modulating the CPU clock.
+
+
+THEORY OF OPERATION
+===================
+
+Idle Injection
+--------------
+
+On modern Intel processors (Nehalem or later), package level C-state
+residency is available in MSRs, thus also available to the kernel.
+ +These MSRs are:: + + #define MSR_PKG_C2_RESIDENCY 0x60D + #define MSR_PKG_C3_RESIDENCY 0x3F8 + #define MSR_PKG_C6_RESIDENCY 0x3F9 + #define MSR_PKG_C7_RESIDENCY 0x3FA + +If the kernel can also inject idle time to the system, then a +closed-loop control system can be established that manages package +level C-state. The intel_powerclamp driver is conceived as such a +control system, where the target set point is a user-selected idle +ratio (based on power reduction), and the error is the difference +between the actual package level C-state residency ratio and the target idle +ratio. + +Injection is controlled by high priority kernel threads, spawned for +each online CPU. + +These kernel threads, with SCHED_FIFO class, are created to perform +clamping actions of controlled duty ratio and duration. Each per-CPU +thread synchronizes its idle time and duration, based on the rounding +of jiffies, so accumulated errors can be prevented to avoid a jittery +effect. Threads are also bound to the CPU such that they cannot be +migrated, unless the CPU is taken offline. In this case, threads +belong to the offlined CPUs will be terminated immediately. + +Running as SCHED_FIFO and relatively high priority, also allows such +scheme to work for both preemptable and non-preemptable kernels. +Alignment of idle time around jiffies ensures scalability for HZ +values. This effect can be better visualized using a Perf timechart. +The following diagram shows the behavior of kernel thread +kidle_inject/cpu. During idle injection, it runs monitor/mwait idle +for a given "duration", then relinquishes the CPU to other tasks, +until the next time interval. + +The NOHZ schedule tick is disabled during idle time, but interrupts +are not masked. Tests show that the extra wakeups from scheduler tick +have a dramatic impact on the effectiveness of the powerclamp driver +on large scale systems (Westmere system with 80 processors). + +:: + + CPU0 + ____________ ____________ + kidle_inject/0 | sleep | mwait | sleep | + _________| |________| |_______ + duration + CPU1 + ____________ ____________ + kidle_inject/1 | sleep | mwait | sleep | + _________| |________| |_______ + ^ + | + | + roundup(jiffies, interval) + +Only one CPU is allowed to collect statistics and update global +control parameters. This CPU is referred to as the controlling CPU in +this document. The controlling CPU is elected at runtime, with a +policy that favors BSP, taking into account the possibility of a CPU +hot-plug. + +In terms of dynamics of the idle control system, package level idle +time is considered largely as a non-causal system where its behavior +cannot be based on the past or current input. Therefore, the +intel_powerclamp driver attempts to enforce the desired idle time +instantly as given input (target idle ratio). After injection, +powerclamp monitors the actual idle for a given time window and adjust +the next injection accordingly to avoid over/under correction. + +When used in a causal control system, such as a temperature control, +it is up to the user of this driver to implement algorithms where +past samples and outputs are included in the feedback. For example, a +PID-based thermal controller can use the powerclamp driver to +maintain a desired target temperature, based on integral and +derivative gains of the past samples. + + + +Calibration +----------- +During scalability testing, it is observed that synchronized actions +among CPUs become challenging as the number of cores grows. 
This is +also true for the ability of a system to enter package level C-states. + +To make sure the intel_powerclamp driver scales well, online +calibration is implemented. The goals for doing such a calibration +are: + +a) determine the effective range of idle injection ratio +b) determine the amount of compensation needed at each target ratio + +Compensation to each target ratio consists of two parts: + + a) steady state error compensation + This is to offset the error occurring when the system can + enter idle without extra wakeups (such as external interrupts). + + b) dynamic error compensation + When an excessive amount of wakeups occurs during idle, an + additional idle ratio can be added to quiet interrupts, by + slowing down CPU activities. + +A debugfs file is provided for the user to examine compensation +progress and results, such as on a Westmere system:: + + [jacob@nex01 ~]$ cat + /sys/kernel/debug/intel_powerclamp/powerclamp_calib + controlling cpu: 0 + pct confidence steady dynamic (compensation) + 0 0 0 0 + 1 1 0 0 + 2 1 1 0 + 3 3 1 0 + 4 3 1 0 + 5 3 1 0 + 6 3 1 0 + 7 3 1 0 + 8 3 1 0 + ... + 30 3 2 0 + 31 3 2 0 + 32 3 1 0 + 33 3 2 0 + 34 3 1 0 + 35 3 2 0 + 36 3 1 0 + 37 3 2 0 + 38 3 1 0 + 39 3 2 0 + 40 3 3 0 + 41 3 1 0 + 42 3 2 0 + 43 3 1 0 + 44 3 1 0 + 45 3 2 0 + 46 3 3 0 + 47 3 0 0 + 48 3 2 0 + 49 3 3 0 + +Calibration occurs during runtime. No offline method is available. +Steady state compensation is used only when confidence levels of all +adjacent ratios have reached satisfactory level. A confidence level +is accumulated based on clean data collected at runtime. Data +collected during a period without extra interrupts is considered +clean. + +To compensate for excessive amounts of wakeup during idle, additional +idle time is injected when such a condition is detected. Currently, +we have a simple algorithm to double the injection ratio. A possible +enhancement might be to throttle the offending IRQ, such as delaying +EOI for level triggered interrupts. But it is a challenge to be +non-intrusive to the scheduler or the IRQ core code. + + +CPU Online/Offline +------------------ +Per-CPU kernel threads are started/stopped upon receiving +notifications of CPU hotplug activities. The intel_powerclamp driver +keeps track of clamping kernel threads, even after they are migrated +to other CPUs, after a CPU offline event. + + +Performance Analysis +==================== +This section describes the general performance data collected on +multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P). + +Effectiveness and Limitations +----------------------------- +The maximum range that idle injection is allowed is capped at 50 +percent. As mentioned earlier, since interrupts are allowed during +forced idle time, excessive interrupts could result in less +effectiveness. The extreme case would be doing a ping -f to generated +flooded network interrupts without much CPU acknowledgement. In this +case, little can be done from the idle injection threads. In most +normal cases, such as scp a large file, applications can be throttled +by the powerclamp driver, since slowing down the CPU also slows down +network protocol processing, which in turn reduces interrupts. + +When control parameters change at runtime by the controlling CPU, it +may take an additional period for the rest of the CPUs to catch up +with the changes. During this time, idle injection is out of sync, +thus not able to enter package C- states at the expected ratio. 
But +this effect is minor, in that in most cases change to the target +ratio is updated much less frequently than the idle injection +frequency. + +Scalability +----------- +Tests also show a minor, but measurable, difference between the 4P/8P +Ivy Bridge system and the 80P Westmere server under 50% idle ratio. +More compensation is needed on Westmere for the same amount of +target idle ratio. The compensation also increases as the idle ratio +gets larger. The above reason constitutes the need for the +calibration code. + +On the IVB 8P system, compared to an offline CPU, powerclamp can +achieve up to 40% better performance per watt. (measured by a spin +counter summed over per CPU counting threads spawned for all running +CPUs). + +Usage and Interfaces +==================== +The powerclamp driver is registered to the generic thermal layer as a +cooling device. Currently, it’s not bound to any thermal zones:: + + jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . * + cur_state:0 + max_state:50 + type:intel_powerclamp + +cur_state allows user to set the desired idle percentage. Writing 0 to +cur_state will stop idle injection. Writing a value between 1 and +max_state will start the idle injection. Reading cur_state returns the +actual and current idle percentage. This may not be the same value +set by the user in that current idle percentage depends on workload +and includes natural idle. When idle injection is disabled, reading +cur_state returns value -1 instead of 0 which is to avoid confusing +100% busy state with the disabled state. + +Example usage: +- To inject 25% idle time:: + + $ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state + +If the system is not busy and has more than 25% idle time already, +then the powerclamp driver will not start idle injection. Using Top +will not show idle injection kernel threads. + +If the system is busy (spin test below) and has less than 25% natural +idle time, powerclamp kernel threads will do idle injection. Forced +idle time is accounted as normal idle in that common code path is +taken as the idle task. + +In this example, 24.1% idle is shown. This helps the system admin or +user determine the cause of slowdown, when a powerclamp driver is in action:: + + + Tasks: 197 total, 1 running, 196 sleeping, 0 stopped, 0 zombie + Cpu(s): 71.2%us, 4.7%sy, 0.0%ni, 24.1%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st + Mem: 3943228k total, 1689632k used, 2253596k free, 74960k buffers + Swap: 4087804k total, 0k used, 4087804k free, 945336k cached + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 3352 jacob 20 0 262m 644 428 S 286 0.0 0:17.16 spin + 3341 root -51 0 0 0 0 D 25 0.0 0:01.62 kidle_inject/0 + 3344 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/3 + 3342 root -51 0 0 0 0 D 25 0.0 0:01.61 kidle_inject/1 + 3343 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/2 + 2935 jacob 20 0 696m 125m 35m S 5 3.3 0:31.11 firefox + 1546 root 20 0 158m 20m 6640 S 3 0.5 0:26.97 Xorg + 2100 jacob 20 0 1223m 88m 30m S 3 2.3 0:23.68 compiz + +Tests have shown that by using the powerclamp driver as a cooling +device, a PID based userspace thermal controller can manage to +control CPU temperature effectively, when no other thermal influence +is added. For example, a UltraBook user can compile the kernel under +certain temperature (below most active trip points). 
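The closed-loop control described in intel_powerclamp.rst above feeds on the
package C-state residency MSRs it lists. A hedged user-space sketch of the
same measurement follows; it assumes the msr module is loaded (providing
/dev/cpu/0/msr, root only) and that the residency counters tick at the TSC
rate, as they do on the Nehalem-and-later parts the document targets::

    /* Hedged sketch: package C6 residency ratio over a one second window. */
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    static int rdmsr(int fd, uint32_t reg, uint64_t *val)
    {
            /* the msr device maps the register index to the file offset */
            return pread(fd, val, sizeof(*val), reg) == sizeof(*val) ? 0 : -1;
    }

    int main(void)
    {
            const uint32_t MSR_TSC = 0x10;
            const uint32_t MSR_PKG_C6_RESIDENCY = 0x3F9; /* from the list above */
            uint64_t c6a, c6b, tsca, tscb;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0)
                    return 1;
            if (rdmsr(fd, MSR_PKG_C6_RESIDENCY, &c6a) || rdmsr(fd, MSR_TSC, &tsca))
                    return 1;
            sleep(1);
            if (rdmsr(fd, MSR_PKG_C6_RESIDENCY, &c6b) || rdmsr(fd, MSR_TSC, &tscb))
                    return 1;
            close(fd);

            printf("pkg C6 residency: %.1f%%\n",
                   100.0 * (double)(c6b - c6a) / (double)(tscb - tsca));
            return 0;
    }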
diff --git a/Documentation/thermal/intel_powerclamp.txt b/Documentation/thermal/intel_powerclamp.txt deleted file mode 100644 index b5df21168fbc..000000000000 --- a/Documentation/thermal/intel_powerclamp.txt +++ /dev/null @@ -1,317 +0,0 @@ - ======================= - INTEL POWERCLAMP DRIVER - ======================= -By: Arjan van de Ven - Jacob Pan - -Contents: - (*) Introduction - - Goals and Objectives - - (*) Theory of Operation - - Idle Injection - - Calibration - - (*) Performance Analysis - - Effectiveness and Limitations - - Power vs Performance - - Scalability - - Calibration - - Comparison with Alternative Techniques - - (*) Usage and Interfaces - - Generic Thermal Layer (sysfs) - - Kernel APIs (TBD) - -============ -INTRODUCTION -============ - -Consider the situation where a system’s power consumption must be -reduced at runtime, due to power budget, thermal constraint, or noise -level, and where active cooling is not preferred. Software managed -passive power reduction must be performed to prevent the hardware -actions that are designed for catastrophic scenarios. - -Currently, P-states, T-states (clock modulation), and CPU offlining -are used for CPU throttling. - -On Intel CPUs, C-states provide effective power reduction, but so far -they’re only used opportunistically, based on workload. With the -development of intel_powerclamp driver, the method of synchronizing -idle injection across all online CPU threads was introduced. The goal -is to achieve forced and controllable C-state residency. - -Test/Analysis has been made in the areas of power, performance, -scalability, and user experience. In many cases, clear advantage is -shown over taking the CPU offline or modulating the CPU clock. - - -=================== -THEORY OF OPERATION -=================== - -Idle Injection --------------- - -On modern Intel processors (Nehalem or later), package level C-state -residency is available in MSRs, thus also available to the kernel. - -These MSRs are: - #define MSR_PKG_C2_RESIDENCY 0x60D - #define MSR_PKG_C3_RESIDENCY 0x3F8 - #define MSR_PKG_C6_RESIDENCY 0x3F9 - #define MSR_PKG_C7_RESIDENCY 0x3FA - -If the kernel can also inject idle time to the system, then a -closed-loop control system can be established that manages package -level C-state. The intel_powerclamp driver is conceived as such a -control system, where the target set point is a user-selected idle -ratio (based on power reduction), and the error is the difference -between the actual package level C-state residency ratio and the target idle -ratio. - -Injection is controlled by high priority kernel threads, spawned for -each online CPU. - -These kernel threads, with SCHED_FIFO class, are created to perform -clamping actions of controlled duty ratio and duration. Each per-CPU -thread synchronizes its idle time and duration, based on the rounding -of jiffies, so accumulated errors can be prevented to avoid a jittery -effect. Threads are also bound to the CPU such that they cannot be -migrated, unless the CPU is taken offline. In this case, threads -belong to the offlined CPUs will be terminated immediately. - -Running as SCHED_FIFO and relatively high priority, also allows such -scheme to work for both preemptable and non-preemptable kernels. -Alignment of idle time around jiffies ensures scalability for HZ -values. This effect can be better visualized using a Perf timechart. -The following diagram shows the behavior of kernel thread -kidle_inject/cpu. 
During idle injection, it runs monitor/mwait idle -for a given "duration", then relinquishes the CPU to other tasks, -until the next time interval. - -The NOHZ schedule tick is disabled during idle time, but interrupts -are not masked. Tests show that the extra wakeups from scheduler tick -have a dramatic impact on the effectiveness of the powerclamp driver -on large scale systems (Westmere system with 80 processors). - -CPU0 - ____________ ____________ -kidle_inject/0 | sleep | mwait | sleep | - _________| |________| |_______ - duration -CPU1 - ____________ ____________ -kidle_inject/1 | sleep | mwait | sleep | - _________| |________| |_______ - ^ - | - | - roundup(jiffies, interval) - -Only one CPU is allowed to collect statistics and update global -control parameters. This CPU is referred to as the controlling CPU in -this document. The controlling CPU is elected at runtime, with a -policy that favors BSP, taking into account the possibility of a CPU -hot-plug. - -In terms of dynamics of the idle control system, package level idle -time is considered largely as a non-causal system where its behavior -cannot be based on the past or current input. Therefore, the -intel_powerclamp driver attempts to enforce the desired idle time -instantly as given input (target idle ratio). After injection, -powerclamp monitors the actual idle for a given time window and adjust -the next injection accordingly to avoid over/under correction. - -When used in a causal control system, such as a temperature control, -it is up to the user of this driver to implement algorithms where -past samples and outputs are included in the feedback. For example, a -PID-based thermal controller can use the powerclamp driver to -maintain a desired target temperature, based on integral and -derivative gains of the past samples. - - - -Calibration ------------ -During scalability testing, it is observed that synchronized actions -among CPUs become challenging as the number of cores grows. This is -also true for the ability of a system to enter package level C-states. - -To make sure the intel_powerclamp driver scales well, online -calibration is implemented. The goals for doing such a calibration -are: - -a) determine the effective range of idle injection ratio -b) determine the amount of compensation needed at each target ratio - -Compensation to each target ratio consists of two parts: - - a) steady state error compensation - This is to offset the error occurring when the system can - enter idle without extra wakeups (such as external interrupts). - - b) dynamic error compensation - When an excessive amount of wakeups occurs during idle, an - additional idle ratio can be added to quiet interrupts, by - slowing down CPU activities. - -A debugfs file is provided for the user to examine compensation -progress and results, such as on a Westmere system. -[jacob@nex01 ~]$ cat -/sys/kernel/debug/intel_powerclamp/powerclamp_calib -controlling cpu: 0 -pct confidence steady dynamic (compensation) -0 0 0 0 -1 1 0 0 -2 1 1 0 -3 3 1 0 -4 3 1 0 -5 3 1 0 -6 3 1 0 -7 3 1 0 -8 3 1 0 -... -30 3 2 0 -31 3 2 0 -32 3 1 0 -33 3 2 0 -34 3 1 0 -35 3 2 0 -36 3 1 0 -37 3 2 0 -38 3 1 0 -39 3 2 0 -40 3 3 0 -41 3 1 0 -42 3 2 0 -43 3 1 0 -44 3 1 0 -45 3 2 0 -46 3 3 0 -47 3 0 0 -48 3 2 0 -49 3 3 0 - -Calibration occurs during runtime. No offline method is available. -Steady state compensation is used only when confidence levels of all -adjacent ratios have reached satisfactory level. 
A confidence level -is accumulated based on clean data collected at runtime. Data -collected during a period without extra interrupts is considered -clean. - -To compensate for excessive amounts of wakeup during idle, additional -idle time is injected when such a condition is detected. Currently, -we have a simple algorithm to double the injection ratio. A possible -enhancement might be to throttle the offending IRQ, such as delaying -EOI for level triggered interrupts. But it is a challenge to be -non-intrusive to the scheduler or the IRQ core code. - - -CPU Online/Offline ------------------- -Per-CPU kernel threads are started/stopped upon receiving -notifications of CPU hotplug activities. The intel_powerclamp driver -keeps track of clamping kernel threads, even after they are migrated -to other CPUs, after a CPU offline event. - - -===================== -Performance Analysis -===================== -This section describes the general performance data collected on -multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P). - -Effectiveness and Limitations ------------------------------ -The maximum range that idle injection is allowed is capped at 50 -percent. As mentioned earlier, since interrupts are allowed during -forced idle time, excessive interrupts could result in less -effectiveness. The extreme case would be doing a ping -f to generated -flooded network interrupts without much CPU acknowledgement. In this -case, little can be done from the idle injection threads. In most -normal cases, such as scp a large file, applications can be throttled -by the powerclamp driver, since slowing down the CPU also slows down -network protocol processing, which in turn reduces interrupts. - -When control parameters change at runtime by the controlling CPU, it -may take an additional period for the rest of the CPUs to catch up -with the changes. During this time, idle injection is out of sync, -thus not able to enter package C- states at the expected ratio. But -this effect is minor, in that in most cases change to the target -ratio is updated much less frequently than the idle injection -frequency. - -Scalability ------------ -Tests also show a minor, but measurable, difference between the 4P/8P -Ivy Bridge system and the 80P Westmere server under 50% idle ratio. -More compensation is needed on Westmere for the same amount of -target idle ratio. The compensation also increases as the idle ratio -gets larger. The above reason constitutes the need for the -calibration code. - -On the IVB 8P system, compared to an offline CPU, powerclamp can -achieve up to 40% better performance per watt. (measured by a spin -counter summed over per CPU counting threads spawned for all running -CPUs). - -==================== -Usage and Interfaces -==================== -The powerclamp driver is registered to the generic thermal layer as a -cooling device. Currently, it’s not bound to any thermal zones. - -jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . * -cur_state:0 -max_state:50 -type:intel_powerclamp - -cur_state allows user to set the desired idle percentage. Writing 0 to -cur_state will stop idle injection. Writing a value between 1 and -max_state will start the idle injection. Reading cur_state returns the -actual and current idle percentage. This may not be the same value -set by the user in that current idle percentage depends on workload -and includes natural idle. 
When idle injection is disabled, reading -cur_state returns value -1 instead of 0 which is to avoid confusing -100% busy state with the disabled state. - -Example usage: -- To inject 25% idle time -$ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state -" - -If the system is not busy and has more than 25% idle time already, -then the powerclamp driver will not start idle injection. Using Top -will not show idle injection kernel threads. - -If the system is busy (spin test below) and has less than 25% natural -idle time, powerclamp kernel threads will do idle injection. Forced -idle time is accounted as normal idle in that common code path is -taken as the idle task. - -In this example, 24.1% idle is shown. This helps the system admin or -user determine the cause of slowdown, when a powerclamp driver is in action. - - -Tasks: 197 total, 1 running, 196 sleeping, 0 stopped, 0 zombie -Cpu(s): 71.2%us, 4.7%sy, 0.0%ni, 24.1%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st -Mem: 3943228k total, 1689632k used, 2253596k free, 74960k buffers -Swap: 4087804k total, 0k used, 4087804k free, 945336k cached - - PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND - 3352 jacob 20 0 262m 644 428 S 286 0.0 0:17.16 spin - 3341 root -51 0 0 0 0 D 25 0.0 0:01.62 kidle_inject/0 - 3344 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/3 - 3342 root -51 0 0 0 0 D 25 0.0 0:01.61 kidle_inject/1 - 3343 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/2 - 2935 jacob 20 0 696m 125m 35m S 5 3.3 0:31.11 firefox - 1546 root 20 0 158m 20m 6640 S 3 0.5 0:26.97 Xorg - 2100 jacob 20 0 1223m 88m 30m S 3 2.3 0:23.68 compiz - -Tests have shown that by using the powerclamp driver as a cooling -device, a PID based userspace thermal controller can manage to -control CPU temperature effectively, when no other thermal influence -is added. For example, a UltraBook user can compile the kernel under -certain temperature (below most active trip points). diff --git a/Documentation/thermal/nouveau_thermal b/Documentation/thermal/nouveau_thermal deleted file mode 100644 index 6e17a11efcb0..000000000000 --- a/Documentation/thermal/nouveau_thermal +++ /dev/null @@ -1,82 +0,0 @@ -Kernel driver nouveau -=================== - -Supported chips: -* NV43+ - -Authors: Martin Peres (mupuf) - -Description ---------- - -This driver allows to read the GPU core temperature, drive the GPU fan and -set temperature alarms. - -Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau -cannot access any of the i2c external monitoring chips it may find. If you -have one of those, temperature and/or fan management through Nouveau's HWMON -interface is likely not to work. This document may then not cover your situation -entirely. - -Temperature management --------------------- - -Temperature is exposed under as a read-only HWMON attribute temp1_input. - -In order to protect the GPU from overheating, Nouveau supports 4 configurable -temperature thresholds: - - * Fan_boost: Fan speed is set to 100% when reaching this temperature; - * Downclock: The GPU will be downclocked to reduce its power dissipation; - * Critical: The GPU is put on hold to further lower power dissipation; - * Shutdown: Shut the computer down to protect your GPU. - -WARNING: Some of these thresholds may not be used by Nouveau depending -on your chipset. - -The default value for these thresholds comes from the GPU's vbios. 
These -thresholds can be configured thanks to the following HWMON attributes: - - * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst; - * Downclock: temp1_max and temp1_max_hyst; - * Critical: temp1_crit and temp1_crit_hyst; - * Shutdown: temp1_emergency and temp1_emergency_hyst. - -NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget -to multiply! - -Fan management ------------- - -Not all cards have a drivable fan. If you do, then the following HWMON -attributes should be available: - - * pwm1_enable: Current fan management mode (NONE, MANUAL or AUTO); - * pwm1: Current PWM value (power percentage); - * pwm1_min: The minimum PWM speed allowed; - * pwm1_max: The maximum PWM speed allowed (bypassed when hitting Fan_boost); - -You may also have the following attribute: - - * fan1_input: Speed in RPM of your fan. - -Your fan can be driven in different modes: - - * 0: The fan is left untouched; - * 1: The fan can be driven in manual (use pwm1 to change the speed); - * 2; The fan is driven automatically depending on the temperature. - -NOTE: Be sure to use the manual mode if you want to drive the fan speed manually - -NOTE2: When operating in manual mode outside the vbios-defined -[PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate -depending on your hardware. - -Bug reports ---------- - -Thermal management on Nouveau is new and may not work on all cards. If you have -inquiries, please ping mupuf on IRC (#nouveau, freenode). - -Bug reports should be filled on Freedesktop's bug tracker. Please follow -http://nouveau.freedesktop.org/wiki/Bugs diff --git a/Documentation/thermal/nouveau_thermal.rst b/Documentation/thermal/nouveau_thermal.rst new file mode 100644 index 000000000000..37255fd6735d --- /dev/null +++ b/Documentation/thermal/nouveau_thermal.rst @@ -0,0 +1,96 @@ +===================== +Kernel driver nouveau +===================== + +Supported chips: + +* NV43+ + +Authors: Martin Peres (mupuf) + +Description +----------- + +This driver allows to read the GPU core temperature, drive the GPU fan and +set temperature alarms. + +Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau +cannot access any of the i2c external monitoring chips it may find. If you +have one of those, temperature and/or fan management through Nouveau's HWMON +interface is likely not to work. This document may then not cover your situation +entirely. + +Temperature management +---------------------- + +Temperature is exposed under as a read-only HWMON attribute temp1_input. + +In order to protect the GPU from overheating, Nouveau supports 4 configurable +temperature thresholds: + + * Fan_boost: + Fan speed is set to 100% when reaching this temperature; + * Downclock: + The GPU will be downclocked to reduce its power dissipation; + * Critical: + The GPU is put on hold to further lower power dissipation; + * Shutdown: + Shut the computer down to protect your GPU. + +WARNING: + Some of these thresholds may not be used by Nouveau depending + on your chipset. + +The default value for these thresholds comes from the GPU's vbios. These +thresholds can be configured thanks to the following HWMON attributes: + + * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst; + * Downclock: temp1_max and temp1_max_hyst; + * Critical: temp1_crit and temp1_crit_hyst; + * Shutdown: temp1_emergency and temp1_emergency_hyst. + +NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget +to multiply! 
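The NOTE above about millidegree scaling is easy to get wrong when
scripting against these attributes. As a small illustration (the hwmon
device number is an assumption; locate the nouveau entry under
/sys/class/hwmon on your system), a C snippet that sets the Critical
threshold to 95 °C might look like::

  /*
   * Sketch: set the nouveau "Critical" threshold to 95 C, remembering
   * the millidegree scaling called out in the NOTE above. The hwmon
   * device number is an assumption.
   */
  #include <stdio.h>

  int main(void)
  {
          const int crit_celsius = 95;
          FILE *f = fopen("/sys/class/hwmon/hwmon1/temp1_crit", "w"); /* assumed node */

          if (!f) {
                  perror("temp1_crit");
                  return 1;
          }
          /* HWMON attributes are in millidegrees Celsius: multiply by 1000. */
          fprintf(f, "%d\n", crit_celsius * 1000);
          fclose(f);
          return 0;
  }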
+ +Fan management +-------------- + +Not all cards have a drivable fan. If you do, then the following HWMON +attributes should be available: + + * pwm1_enable: + Current fan management mode (NONE, MANUAL or AUTO); + * pwm1: + Current PWM value (power percentage); + * pwm1_min: + The minimum PWM speed allowed; + * pwm1_max: + The maximum PWM speed allowed (bypassed when hitting Fan_boost); + +You may also have the following attribute: + + * fan1_input: + Speed in RPM of your fan. + +Your fan can be driven in different modes: + + * 0: The fan is left untouched; + * 1: The fan can be driven in manual (use pwm1 to change the speed); + * 2; The fan is driven automatically depending on the temperature. + +NOTE: + Be sure to use the manual mode if you want to drive the fan speed manually + +NOTE2: + When operating in manual mode outside the vbios-defined + [PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate + depending on your hardware. + +Bug reports +----------- + +Thermal management on Nouveau is new and may not work on all cards. If you have +inquiries, please ping mupuf on IRC (#nouveau, freenode). + +Bug reports should be filled on Freedesktop's bug tracker. Please follow +http://nouveau.freedesktop.org/wiki/Bugs diff --git a/Documentation/thermal/power_allocator.rst b/Documentation/thermal/power_allocator.rst new file mode 100644 index 000000000000..67b6a3297238 --- /dev/null +++ b/Documentation/thermal/power_allocator.rst @@ -0,0 +1,271 @@ +================================= +Power allocator governor tunables +================================= + +Trip points +----------- + +The governor works optimally with the following two passive trip points: + +1. "switch on" trip point: temperature above which the governor + control loop starts operating. This is the first passive trip + point of the thermal zone. + +2. "desired temperature" trip point: it should be higher than the + "switch on" trip point. This the target temperature the governor + is controlling for. This is the last passive trip point of the + thermal zone. + +PID Controller +-------------- + +The power allocator governor implements a +Proportional-Integral-Derivative controller (PID controller) with +temperature as the control input and power as the controlled output: + + P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power + +where + - e = desired_temperature - current_temperature + - err_integral is the sum of previous errors + - diff_err = e - previous_error + +It is similar to the one depicted below:: + + k_d + | + current_temp | + | v + | +----------+ +---+ + | +----->| diff_err |-->| X |------+ + | | +----------+ +---+ | + | | | tdp actor + | | k_i | | get_requested_power() + | | | | | | | + | | | | | | | ... + v | v v v v v + +---+ | +-------+ +---+ +---+ +---+ +----------+ + | S |-----+----->| sum e |----->| X |--->| S |-->| S |-->|power | + +---+ | +-------+ +---+ +---+ +---+ |allocation| + ^ | ^ +----------+ + | | | | | + | | +---+ | | | + | +------->| X |-------------------+ v v + | +---+ granted performance + desired_temperature ^ + | + | + k_po/k_pu + +Sustainable power +----------------- + +An estimate of the sustainable dissipatable power (in mW) should be +provided while registering the thermal zone. This estimates the +sustained power that can be dissipated at the desired control +temperature. This is the maximum sustained power for allocation at +the desired maximum temperature. The actual sustained power can vary +for a number of reasons. 
The closed loop controller will take care of +variations such as environmental conditions, and some factors related +to the speed-grade of the silicon. `sustainable_power` is therefore +simply an estimate, and may be tuned to affect the aggressiveness of +the thermal ramp. For reference, the sustainable power of a 4" phone +is typically 2000mW, while on a 10" tablet is around 4500mW (may vary +depending on screen size). + +If you are using device tree, do add it as a property of the +thermal-zone. For example:: + + thermal-zones { + soc_thermal { + polling-delay = <1000>; + polling-delay-passive = <100>; + sustainable-power = <2500>; + ... + +Instead, if the thermal zone is registered from the platform code, pass a +`thermal_zone_params` that has a `sustainable_power`. If no +`thermal_zone_params` were being passed, then something like below +will suffice:: + + static const struct thermal_zone_params tz_params = { + .sustainable_power = 3500, + }; + +and then pass `tz_params` as the 5th parameter to +`thermal_zone_device_register()` + +k_po and k_pu +------------- + +The implementation of the PID controller in the power allocator +thermal governor allows the configuration of two proportional term +constants: `k_po` and `k_pu`. `k_po` is the proportional term +constant during temperature overshoot periods (current temperature is +above "desired temperature" trip point). Conversely, `k_pu` is the +proportional term constant during temperature undershoot periods +(current temperature below "desired temperature" trip point). + +These controls are intended as the primary mechanism for configuring +the permitted thermal "ramp" of the system. For instance, a lower +`k_pu` value will provide a slower ramp, at the cost of capping +available capacity at a low temperature. On the other hand, a high +value of `k_pu` will result in the governor granting very high power +while temperature is low, and may lead to temperature overshooting. + +The default value for `k_pu` is:: + + 2 * sustainable_power / (desired_temperature - switch_on_temp) + +This means that at `switch_on_temp` the output of the controller's +proportional term will be 2 * `sustainable_power`. The default value +for `k_po` is:: + + sustainable_power / (desired_temperature - switch_on_temp) + +Focusing on the proportional and feed forward values of the PID +controller equation we have:: + + P_max = k_p * e + sustainable_power + +The proportional term is proportional to the difference between the +desired temperature and the current one. When the current temperature +is the desired one, then the proportional component is zero and +`P_max` = `sustainable_power`. That is, the system should operate in +thermal equilibrium under constant load. `sustainable_power` is only +an estimate, which is the reason for closed-loop control such as this. + +Expanding `k_pu` we get:: + + P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) + + sustainable_power + +where: + + - T_set is the desired temperature + - T is the current temperature + - T_on is the switch on temperature + +When the current temperature is the switch_on temperature, the above +formula becomes:: + + P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) + + sustainable_power = 2 * sustainable_power + sustainable_power = + 3 * sustainable_power + +Therefore, the proportional term alone linearly decreases power from +3 * `sustainable_power` to `sustainable_power` as the temperature +rises from the switch on temperature to the desired temperature. 
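To make the arithmetic above concrete, here is a small standalone
sketch that evaluates the default `k_pu`/`k_po` and the
proportional-plus-feed-forward output for an assumed zone; the
sustainable power and trip temperatures are invented for
illustration::

  /*
   * Worked sketch of the default k_pu/k_po values and the proportional
   * term derived above. All numbers are illustrative assumptions.
   */
  #include <stdio.h>

  int main(void)
  {
          const int sustainable_power = 2500;  /* mW, as in the DT example */
          const int t_set = 75000;             /* desired temperature, millidegrees */
          const int t_on = 60000;              /* switch on temperature, millidegrees */

          /* Defaults from the text above, with temperatures in degrees. */
          int k_pu = 2 * sustainable_power / ((t_set - t_on) / 1000);
          int k_po = sustainable_power / ((t_set - t_on) / 1000);

          /* Proportional + feed-forward output for a current temperature. */
          int t_cur = 60000;                   /* at the switch-on point */
          int e = (t_set - t_cur) / 1000;      /* error in degrees */
          int k_p = e > 0 ? k_pu : k_po;       /* undershoot vs. overshoot */
          int p_max = k_p * e + sustainable_power;

          /* At T == T_on this prints roughly 3 * sustainable_power,
           * matching the expansion above (integer rounding aside). */
          printf("k_pu=%d k_po=%d P_max=%d mW\n", k_pu, k_po, p_max);
          return 0;
  }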
+
+k_i and integral_cutoff
+-----------------------
+
+`k_i` configures the PID loop's integral term constant. This term
+allows the PID controller to compensate for long term drift and for
+the quantized nature of the output control: cooling devices can't set
+the exact power that the governor requests. When the temperature
+error is below `integral_cutoff`, errors are accumulated in the
+integral term. This term is then multiplied by `k_i` and the result
+added to the output of the controller. Typically `k_i` is set low (1
+or 2) and `integral_cutoff` is 0.
+
+k_d
+---
+
+`k_d` configures the PID loop's derivative term constant. It's
+recommended to leave it as the default: 0.
+
+Cooling device power API
+========================
+
+Cooling devices controlled by this governor must supply the additional
+"power" API in their `cooling_device_ops`. It consists of three ops:
+
+1. ::
+
+    int get_requested_power(struct thermal_cooling_device *cdev,
+                            struct thermal_zone_device *tz, u32 *power);
+
+@cdev:
+    The `struct thermal_cooling_device` pointer
+@tz:
+    thermal zone in which we are currently operating
+@power:
+    pointer in which to store the calculated power
+
+`get_requested_power()` calculates the power requested by the device
+in milliwatts and stores it in @power. It should return 0 on
+success, -E* on failure. This is currently used by the power
+allocator governor to calculate how much power to give to each cooling
+device.
+
+2. ::
+
+    int state2power(struct thermal_cooling_device *cdev, struct
+                    thermal_zone_device *tz, unsigned long state,
+                    u32 *power);
+
+@cdev:
+    The `struct thermal_cooling_device` pointer
+@tz:
+    thermal zone in which we are currently operating
+@state:
+    A cooling device state
+@power:
+    pointer in which to store the equivalent power
+
+Convert cooling device state @state into power consumption in
+milliwatts and store it in @power. It should return 0 on success, -E*
+on failure. This is currently used by the thermal core to calculate the
+maximum power that an actor can consume.
+
+3. ::
+
+    int power2state(struct thermal_cooling_device *cdev, u32 power,
+                    unsigned long *state);
+
+@cdev:
+    The `struct thermal_cooling_device` pointer
+@power:
+    power in milliwatts
+@state:
+    pointer in which to store the resulting state
+
+Calculate a cooling device state that would make the device consume at
+most @power mW and store it in @state. It should return 0 on success,
+-E* on failure. This is currently used by the thermal core to convert
+a given power set by the power allocator governor to a state that the
+cooling device can set. It is a function because this conversion may
+depend on external factors that can change, so this function should
+return the best conversion given "current circumstances". (A sketch of
+these three ops follows the weights discussion below.)
+
+Cooling device weights
+----------------------
+
+Weights are a mechanism to bias the allocation among cooling
+devices. They express the relative power efficiency of different
+cooling devices. Higher weight can be used to express higher power
+efficiency. Weighting is relative: if every cooling device has a
+weight of one, they are all considered equal. This is particularly
+useful in heterogeneous systems where two cooling devices may perform
+the same kind of compute, but with different efficiency, for example
+a system with two different types of processors.
+
+If the thermal zone is registered using
+`thermal_zone_device_register()` (i.e., platform code), then weights
+are passed as part of the thermal zone's `thermal_bind_parameters`.
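As referenced above, here is a hedged sketch of the three power ops
for a made-up cooling device whose states map to a fixed power table;
the table values and the `demo_` names are assumptions, while the
callback signatures follow this documentation::

  /*
   * Sketch of the three "power" ops for a hypothetical cooling device
   * whose states map to a small power table (state 0 = no cooling).
   * The table values and the demo_ names are invented.
   */
  #include <linux/errno.h>
  #include <linux/kernel.h>
  #include <linux/thermal.h>

  static const u32 state_power_mw[] = { 4000, 3000, 2000, 1000 };
  #define NR_STATES ARRAY_SIZE(state_power_mw)

  static int demo_get_requested_power(struct thermal_cooling_device *cdev,
                                      struct thermal_zone_device *tz, u32 *power)
  {
          /* A real driver derives this from utilization/frequency. */
          *power = state_power_mw[0];
          return 0;
  }

  static int demo_state2power(struct thermal_cooling_device *cdev,
                              struct thermal_zone_device *tz,
                              unsigned long state, u32 *power)
  {
          if (state >= NR_STATES)
                  return -EINVAL;
          *power = state_power_mw[state];
          return 0;
  }

  static int demo_power2state(struct thermal_cooling_device *cdev,
                              u32 power, unsigned long *state)
  {
          unsigned long i;

          /* Pick the shallowest state that consumes at most @power mW. */
          for (i = 0; i < NR_STATES; i++) {
                  if (state_power_mw[i] <= power) {
                          *state = i;
                          return 0;
                  }
          }
          *state = NR_STATES - 1;  /* deepest state as a fallback */
          return 0;
  }

Picking the shallowest state that fits the granted budget is one
reasonable policy for `power2state()`; a real driver might instead
interpolate between frequency/voltage operating points.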
+If the platform is registered using device tree, then they are passed +as the `contribution` property of each map in the `cooling-maps` node. + +Limitations of the power allocator governor +=========================================== + +The power allocator governor's PID controller works best if there is a +periodic tick. If you have a driver that calls +`thermal_zone_device_update()` (or anything that ends up calling the +governor's `throttle()` function) repetitively, the governor response +won't be very good. Note that this is not particular to this +governor, step-wise will also misbehave if you call its throttle() +faster than the normal thermal framework tick (due to interrupts for +example) as it will overreact. diff --git a/Documentation/thermal/power_allocator.txt b/Documentation/thermal/power_allocator.txt deleted file mode 100644 index 9fb0ff06dca9..000000000000 --- a/Documentation/thermal/power_allocator.txt +++ /dev/null @@ -1,247 +0,0 @@ -Power allocator governor tunables -================================= - -Trip points ------------ - -The governor works optimally with the following two passive trip points: - -1. "switch on" trip point: temperature above which the governor - control loop starts operating. This is the first passive trip - point of the thermal zone. - -2. "desired temperature" trip point: it should be higher than the - "switch on" trip point. This the target temperature the governor - is controlling for. This is the last passive trip point of the - thermal zone. - -PID Controller --------------- - -The power allocator governor implements a -Proportional-Integral-Derivative controller (PID controller) with -temperature as the control input and power as the controlled output: - - P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power - -where - e = desired_temperature - current_temperature - err_integral is the sum of previous errors - diff_err = e - previous_error - -It is similar to the one depicted below: - - k_d - | -current_temp | - | v - | +----------+ +---+ - | +----->| diff_err |-->| X |------+ - | | +----------+ +---+ | - | | | tdp actor - | | k_i | | get_requested_power() - | | | | | | | - | | | | | | | ... - v | v v v v v - +---+ | +-------+ +---+ +---+ +---+ +----------+ - | S |-------+----->| sum e |----->| X |--->| S |-->| S |-->|power | - +---+ | +-------+ +---+ +---+ +---+ |allocation| - ^ | ^ +----------+ - | | | | | - | | +---+ | | | - | +------->| X |-------------------+ v v - | +---+ granted performance -desired_temperature ^ - | - | - k_po/k_pu - -Sustainable power ------------------ - -An estimate of the sustainable dissipatable power (in mW) should be -provided while registering the thermal zone. This estimates the -sustained power that can be dissipated at the desired control -temperature. This is the maximum sustained power for allocation at -the desired maximum temperature. The actual sustained power can vary -for a number of reasons. The closed loop controller will take care of -variations such as environmental conditions, and some factors related -to the speed-grade of the silicon. `sustainable_power` is therefore -simply an estimate, and may be tuned to affect the aggressiveness of -the thermal ramp. For reference, the sustainable power of a 4" phone -is typically 2000mW, while on a 10" tablet is around 4500mW (may vary -depending on screen size). - -If you are using device tree, do add it as a property of the -thermal-zone. 
For example: - - thermal-zones { - soc_thermal { - polling-delay = <1000>; - polling-delay-passive = <100>; - sustainable-power = <2500>; - ... - -Instead, if the thermal zone is registered from the platform code, pass a -`thermal_zone_params` that has a `sustainable_power`. If no -`thermal_zone_params` were being passed, then something like below -will suffice: - - static const struct thermal_zone_params tz_params = { - .sustainable_power = 3500, - }; - -and then pass `tz_params` as the 5th parameter to -`thermal_zone_device_register()` - -k_po and k_pu -------------- - -The implementation of the PID controller in the power allocator -thermal governor allows the configuration of two proportional term -constants: `k_po` and `k_pu`. `k_po` is the proportional term -constant during temperature overshoot periods (current temperature is -above "desired temperature" trip point). Conversely, `k_pu` is the -proportional term constant during temperature undershoot periods -(current temperature below "desired temperature" trip point). - -These controls are intended as the primary mechanism for configuring -the permitted thermal "ramp" of the system. For instance, a lower -`k_pu` value will provide a slower ramp, at the cost of capping -available capacity at a low temperature. On the other hand, a high -value of `k_pu` will result in the governor granting very high power -while temperature is low, and may lead to temperature overshooting. - -The default value for `k_pu` is: - - 2 * sustainable_power / (desired_temperature - switch_on_temp) - -This means that at `switch_on_temp` the output of the controller's -proportional term will be 2 * `sustainable_power`. The default value -for `k_po` is: - - sustainable_power / (desired_temperature - switch_on_temp) - -Focusing on the proportional and feed forward values of the PID -controller equation we have: - - P_max = k_p * e + sustainable_power - -The proportional term is proportional to the difference between the -desired temperature and the current one. When the current temperature -is the desired one, then the proportional component is zero and -`P_max` = `sustainable_power`. That is, the system should operate in -thermal equilibrium under constant load. `sustainable_power` is only -an estimate, which is the reason for closed-loop control such as this. - -Expanding `k_pu` we get: - P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) + - sustainable_power - -where - T_set is the desired temperature - T is the current temperature - T_on is the switch on temperature - -When the current temperature is the switch_on temperature, the above -formula becomes: - - P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) + - sustainable_power = 2 * sustainable_power + sustainable_power = - 3 * sustainable_power - -Therefore, the proportional term alone linearly decreases power from -3 * `sustainable_power` to `sustainable_power` as the temperature -rises from the switch on temperature to the desired temperature. - -k_i and integral_cutoff ------------------------ - -`k_i` configures the PID loop's integral term constant. This term -allows the PID controller to compensate for long term drift and for -the quantized nature of the output control: cooling devices can't set -the exact power that the governor requests. When the temperature -error is below `integral_cutoff`, errors are accumulated in the -integral term. This term is then multiplied by `k_i` and the result -added to the output of the controller. 
Typically `k_i` is set low (1 -or 2) and `integral_cutoff` is 0. - -k_d ---- - -`k_d` configures the PID loop's derivative term constant. It's -recommended to leave it as the default: 0. - -Cooling device power API -======================== - -Cooling devices controlled by this governor must supply the additional -"power" API in their `cooling_device_ops`. It consists on three ops: - -1. int get_requested_power(struct thermal_cooling_device *cdev, - struct thermal_zone_device *tz, u32 *power); -@cdev: The `struct thermal_cooling_device` pointer -@tz: thermal zone in which we are currently operating -@power: pointer in which to store the calculated power - -`get_requested_power()` calculates the power requested by the device -in milliwatts and stores it in @power . It should return 0 on -success, -E* on failure. This is currently used by the power -allocator governor to calculate how much power to give to each cooling -device. - -2. int state2power(struct thermal_cooling_device *cdev, struct - thermal_zone_device *tz, unsigned long state, u32 *power); -@cdev: The `struct thermal_cooling_device` pointer -@tz: thermal zone in which we are currently operating -@state: A cooling device state -@power: pointer in which to store the equivalent power - -Convert cooling device state @state into power consumption in -milliwatts and store it in @power. It should return 0 on success, -E* -on failure. This is currently used by thermal core to calculate the -maximum power that an actor can consume. - -3. int power2state(struct thermal_cooling_device *cdev, u32 power, - unsigned long *state); -@cdev: The `struct thermal_cooling_device` pointer -@power: power in milliwatts -@state: pointer in which to store the resulting state - -Calculate a cooling device state that would make the device consume at -most @power mW and store it in @state. It should return 0 on success, --E* on failure. This is currently used by the thermal core to convert -a given power set by the power allocator governor to a state that the -cooling device can set. It is a function because this conversion may -depend on external factors that may change so this function should the -best conversion given "current circumstances". - -Cooling device weights ----------------------- - -Weights are a mechanism to bias the allocation among cooling -devices. They express the relative power efficiency of different -cooling devices. Higher weight can be used to express higher power -efficiency. Weighting is relative such that if each cooling device -has a weight of one they are considered equal. This is particularly -useful in heterogeneous systems where two cooling devices may perform -the same kind of compute, but with different efficiency. For example, -a system with two different types of processors. - -If the thermal zone is registered using -`thermal_zone_device_register()` (i.e., platform code), then weights -are passed as part of the thermal zone's `thermal_bind_parameters`. -If the platform is registered using device tree, then they are passed -as the `contribution` property of each map in the `cooling-maps` node. - -Limitations of the power allocator governor -=========================================== - -The power allocator governor's PID controller works best if there is a -periodic tick. If you have a driver that calls -`thermal_zone_device_update()` (or anything that ends up calling the -governor's `throttle()` function) repetitively, the governor response -won't be very good. 
Note that this is not particular to this -governor, step-wise will also misbehave if you call its throttle() -faster than the normal thermal framework tick (due to interrupts for -example) as it will overreact. diff --git a/Documentation/thermal/sysfs-api.rst b/Documentation/thermal/sysfs-api.rst new file mode 100644 index 000000000000..e4930761d3e5 --- /dev/null +++ b/Documentation/thermal/sysfs-api.rst @@ -0,0 +1,798 @@ +=================================== +Generic Thermal Sysfs driver How To +=================================== + +Written by Sujith Thomas , Zhang Rui + +Updated: 2 January 2008 + +Copyright (c) 2008 Intel Corporation + + +0. Introduction +=============== + +The generic thermal sysfs provides a set of interfaces for thermal zone +devices (sensors) and thermal cooling devices (fan, processor...) to register +with the thermal management solution and to be a part of it. + +This how-to focuses on enabling new thermal zone and cooling devices to +participate in thermal management. +This solution is platform independent and any type of thermal zone devices +and cooling devices should be able to make use of the infrastructure. + +The main task of the thermal sysfs driver is to expose thermal zone attributes +as well as cooling device attributes to the user space. +An intelligent thermal management application can make decisions based on +inputs from thermal zone attributes (the current temperature and trip point +temperature) and throttle appropriate devices. + +- `[0-*]` denotes any positive number starting from 0 +- `[1-*]` denotes any positive number starting from 1 + +1. thermal sysfs driver interface functions +=========================================== + +1.1 thermal zone device interface +--------------------------------- + + :: + + struct thermal_zone_device + *thermal_zone_device_register(char *type, + int trips, int mask, void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + int passive_delay, int polling_delay)) + + This interface function adds a new thermal zone device (sensor) to + /sys/class/thermal folder as `thermal_zone[0-*]`. It tries to bind all the + thermal cooling devices registered at the same time. + + type: + the thermal zone type. + trips: + the total number of trip points this thermal zone supports. + mask: + Bit string: If 'n'th bit is set, then trip point 'n' is writeable. + devdata: + device private data + ops: + thermal zone device call-backs. + + .bind: + bind the thermal zone device with a thermal cooling device. + .unbind: + unbind the thermal zone device with a thermal cooling device. + .get_temp: + get the current temperature of the thermal zone. + .set_trips: + set the trip points window. Whenever the current temperature + is updated, the trip points immediately below and above the + current temperature are found. + .get_mode: + get the current mode (enabled/disabled) of the thermal zone. + + - "enabled" means the kernel thermal management is + enabled. + - "disabled" will prevent kernel thermal driver action + upon trip points so that user applications can take + charge of thermal management. + .set_mode: + set the mode (enabled/disabled) of the thermal zone. + .get_trip_type: + get the type of certain trip point. + .get_trip_temp: + get the temperature above which the certain trip point + will be fired. + .set_emul_temp: + set the emulation temperature which helps in debugging + different threshold temperature points. + tzp: + thermal zone platform parameters. 
+
+    passive_delay:
+	number of milliseconds to wait between polls when
+	performing passive cooling.
+    polling_delay:
+	number of milliseconds to wait between polls when checking
+	whether trip points have been crossed (0 for interrupt driven systems).
+
+    ::
+
+	void thermal_zone_device_unregister(struct thermal_zone_device *tz)
+
+    This interface function removes the thermal zone device.
+    It deletes the corresponding entry from the /sys/class/thermal folder
+    and unbinds all the thermal cooling devices it uses.
+
+    ::
+
+	struct thermal_zone_device
+	*thermal_zone_of_sensor_register(struct device *dev, int sensor_id,
+			void *data,
+			const struct thermal_zone_of_device_ops *ops)
+
+    This interface adds a new sensor to a DT thermal zone.
+    This function will search the list of thermal zones described in
+    the device tree and look for the zone that refers to the sensor
+    device pointed to by dev->of_node as a temperature provider. For the
+    zone pointing to the sensor node, the sensor will be added to the DT
+    thermal zone device.
+
+    The parameters for this interface are:
+
+    dev:
+	Device node of sensor containing valid node pointer in
+	dev->of_node.
+    sensor_id:
+	a sensor identifier, in case the sensor IP has more
+	than one sensor
+    data:
+	a private pointer (owned by the caller) that will be
+	passed back when a temperature reading is needed.
+    ops:
+	`struct thermal_zone_of_device_ops *`.
+
+	============== =======================================
+	get_temp       a pointer to a function that reads the
+	               sensor temperature. This is a mandatory
+	               callback provided by the sensor driver.
+	set_trips      a pointer to a function that sets a
+	               temperature window. When this window is
+	               left, the driver must inform the thermal
+	               core via thermal_zone_device_update.
+	get_trend      a pointer to a function that reads the
+	               sensor temperature trend.
+	set_emul_temp  a pointer to a function that sets the
+	               sensor emulated temperature.
+	============== =======================================
+
+    The thermal zone temperature is provided by the get_temp() function
+    pointer of thermal_zone_of_device_ops. When called, it is passed
+    back the private pointer @data.
+
+    It returns an error pointer on failure, otherwise a valid thermal
+    zone device handle. The caller should check the returned handle
+    with IS_ERR() to determine success or failure.
+
+    ::
+
+	void thermal_zone_of_sensor_unregister(struct device *dev,
+					       struct thermal_zone_device *tzd)
+
+    This interface unregisters a sensor from a DT thermal zone which was
+    successfully added by the interface thermal_zone_of_sensor_register().
+    This function removes the sensor callbacks and private data from the
+    thermal zone device registered with the
+    thermal_zone_of_sensor_register() interface. It also silences the
+    zone by removing the .get_temp() and get_trend() thermal zone device
+    callbacks.
+
+    ::
+
+	struct thermal_zone_device
+	*devm_thermal_zone_of_sensor_register(struct device *dev,
+			int sensor_id,
+			void *data,
+			const struct thermal_zone_of_device_ops *ops)
+
+    This interface is the resource-managed version of
+    thermal_zone_of_sensor_register().
+
+    All details of thermal_zone_of_sensor_register() described in
+    section 1.1.3 are applicable here.
+
+    The benefit of using this interface to register a sensor is that it
+    is not required to explicitly call thermal_zone_of_sensor_unregister()
+    in the error path or during driver unbinding, as this is done by the
+    driver resource manager.
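Putting the resource-managed registration above into context, a sketch
of a sensor driver's probe path might look as follows; the `demo_`
driver, its private structure, and the constant temperature read are
invented, while the calls and the `thermal_zone_of_device_ops` layout
follow this section::

  /*
   * Sketch of a sensor driver probe using the resource-managed
   * registration described above. The demo_ names are invented.
   */
  #include <linux/module.h>
  #include <linux/platform_device.h>
  #include <linux/slab.h>
  #include <linux/thermal.h>

  struct demo_sensor {
          struct device *dev;
          /* ... MMIO base, clocks, etc. (hypothetical) ... */
  };

  static int demo_get_temp(void *data, int *temp)
  {
          struct demo_sensor *s = data;  /* the private pointer passed back */

          dev_dbg(s->dev, "temperature read\n");
          *temp = 42000;  /* a real driver reads hardware; millidegrees Celsius */
          return 0;
  }

  static const struct thermal_zone_of_device_ops demo_ops = {
          .get_temp = demo_get_temp,  /* mandatory callback */
  };

  static int demo_probe(struct platform_device *pdev)
  {
          struct demo_sensor *s;
          struct thermal_zone_device *tzd;

          s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL);
          if (!s)
                  return -ENOMEM;
          s->dev = &pdev->dev;

          /* sensor_id 0: this hypothetical IP exposes a single sensor. */
          tzd = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, s,
                                                     &demo_ops);
          if (IS_ERR(tzd))
                  return PTR_ERR(tzd);  /* no explicit unregister needed */

          return 0;
  }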
+ + :: + + void devm_thermal_zone_of_sensor_unregister(struct device *dev, + struct thermal_zone_device *tzd) + + This interface is resource managed version of + thermal_zone_of_sensor_unregister(). + All details of thermal_zone_of_sensor_unregister() described in + section 1.1.4 is applicable here. + Normally this function will not need to be called and the resource + management code will ensure that the resource is freed. + + :: + + int thermal_zone_get_slope(struct thermal_zone_device *tz) + + This interface is used to read the slope attribute value + for the thermal zone device, which might be useful for platform + drivers for temperature calculations. + + :: + + int thermal_zone_get_offset(struct thermal_zone_device *tz) + + This interface is used to read the offset attribute value + for the thermal zone device, which might be useful for platform + drivers for temperature calculations. + +1.2 thermal cooling device interface +------------------------------------ + + + :: + + struct thermal_cooling_device + *thermal_cooling_device_register(char *name, + void *devdata, struct thermal_cooling_device_ops *) + + This interface function adds a new thermal cooling device (fan/processor/...) + to /sys/class/thermal/ folder as `cooling_device[0-*]`. It tries to bind itself + to all the thermal zone devices registered at the same time. + + name: + the cooling device name. + devdata: + device private data. + ops: + thermal cooling devices call-backs. + + .get_max_state: + get the Maximum throttle state of the cooling device. + .get_cur_state: + get the Currently requested throttle state of the + cooling device. + .set_cur_state: + set the Current throttle state of the cooling device. + + :: + + void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) + + This interface function removes the thermal cooling device. + It deletes the corresponding entry from /sys/class/thermal folder and + unbinds itself from all the thermal zone devices using it. + +1.3 interface for binding a thermal zone device with a thermal cooling device +----------------------------------------------------------------------------- + + :: + + int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, + int trip, struct thermal_cooling_device *cdev, + unsigned long upper, unsigned long lower, unsigned int weight); + + This interface function binds a thermal cooling device to a particular trip + point of a thermal zone device. + + This function is usually called in the thermal zone device .bind callback. + + tz: + the thermal zone device + cdev: + thermal cooling device + trip: + indicates which trip point in this thermal zone the cooling device + is associated with. + upper: + the Maximum cooling state for this trip point. + THERMAL_NO_LIMIT means no upper limit, + and the cooling device can be in max_state. + lower: + the Minimum cooling state can be used for this trip point. + THERMAL_NO_LIMIT means no lower limit, + and the cooling device can be in cooling state 0. + weight: + the influence of this cooling device in this thermal + zone. See 1.4.1 below for more information. + + :: + + int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz, + int trip, struct thermal_cooling_device *cdev); + + This interface function unbinds a thermal cooling device from a particular + trip point of a thermal zone device. This function is usually called in + the thermal zone device .unbind callback. 
+
+    tz:
+	the thermal zone device
+    cdev:
+	thermal cooling device
+    trip:
+	indicates which trip point in this thermal zone the cooling device
+	is associated with.
+
+1.4 Thermal Zone Parameters
+---------------------------
+
+    ::
+
+	struct thermal_bind_params
+
+    This structure defines the following parameters that are used to bind
+    a zone with a cooling device for a particular trip point.
+
+    .cdev:
+	The cooling device pointer
+    .weight:
+	The 'influence' of a particular cooling device on this
+	zone. This is relative to the rest of the cooling
+	devices. For example, if all cooling devices have a
+	weight of 1, then they all contribute the same. You can
+	use percentages if you want, but it's not mandatory. A
+	weight of 0 means that this cooling device doesn't
+	contribute to the cooling of this zone unless all cooling
+	devices have a weight of 0. If all weights are 0, then
+	they all contribute the same.
+    .trip_mask:
+	This is a bit mask that gives the binding relation between
+	this thermal zone and cdev, for a particular trip point.
+	If the nth bit is set, then the cdev and thermal zone are
+	bound for trip point n.
+    .binding_limits:
+	This is an array of cooling state limits. It must have
+	exactly 2 * thermal_zone.number_of_trip_points entries. It
+	is an array consisting of tuples <lower-state upper-state>
+	of state limits. Each trip will be associated with one
+	state limit tuple when binding. A NULL pointer means
+	<THERMAL_NO_LIMIT THERMAL_NO_LIMIT> on all trips.
+	These limits are used when binding a cdev to a trip point.
+    .match:
+	This callback returns success (0) if the 'tz and cdev' need to
+	be bound, as per platform data.
+
+    ::
+
+	struct thermal_zone_params
+
+    This structure defines the platform level parameters for a thermal zone.
+    This data for each thermal zone should come from the platform layer.
+    This is an optional feature where some platforms can choose not to
+    provide this data.
+
+    .governor_name:
+	Name of the thermal governor used for this zone
+    .no_hwmon:
+	a boolean to indicate if the thermal-to-hwmon sysfs interface
+	is required. When no_hwmon == false, a hwmon sysfs interface
+	will be created. When no_hwmon == true, nothing will be done.
+	In case the thermal_zone_params is NULL, the hwmon interface
+	will be created (for backward compatibility).
+    .num_tbps:
+	Number of thermal_bind_params entries for this zone
+    .tbp:
+	thermal_bind_params entries
+
+2. sysfs attributes structure
+=============================
+
+== ================
+RO read only value
+WO write only value
+RW read/write value
+== ================
+
+Thermal sysfs attributes will be represented under /sys/class/thermal.
+The hwmon sysfs I/F extension is also available under /sys/class/hwmon
+if hwmon is compiled in or built as a module.
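To tie sections 1.2 and 1.3 together, below is a hedged sketch of a
cooling device registration plus a `.bind` callback that attaches it
to trip point 0; the fan device and its state handling are invented,
while the ops and the binding call follow the signatures documented
above::

  /*
   * Sketch: register a cooling device and bind it to a trip point,
   * per sections 1.2 and 1.3 above. The demo_ fan is invented.
   */
  #include <linux/err.h>
  #include <linux/init.h>
  #include <linux/thermal.h>

  static unsigned long demo_fan_state;

  static int demo_get_max_state(struct thermal_cooling_device *cdev,
                                unsigned long *state)
  {
          *state = 3;  /* hypothetical fan: off, low, medium, high */
          return 0;
  }

  static int demo_get_cur_state(struct thermal_cooling_device *cdev,
                                unsigned long *state)
  {
          *state = demo_fan_state;
          return 0;
  }

  static int demo_set_cur_state(struct thermal_cooling_device *cdev,
                                unsigned long state)
  {
          demo_fan_state = state;  /* a real driver programs the fan here */
          return 0;
  }

  static const struct thermal_cooling_device_ops demo_fan_ops = {
          .get_max_state = demo_get_max_state,
          .get_cur_state = demo_get_cur_state,
          .set_cur_state = demo_set_cur_state,
  };

  /* Typically called from the thermal zone's .bind callback. */
  static int demo_bind(struct thermal_zone_device *tz,
                       struct thermal_cooling_device *cdev)
  {
          /* Trip 0, no state clamping, weight 1. */
          return thermal_zone_bind_cooling_device(tz, 0, cdev,
                                                  THERMAL_NO_LIMIT,
                                                  THERMAL_NO_LIMIT, 1);
  }

  static int __init demo_init(void)
  {
          struct thermal_cooling_device *cdev;

          cdev = thermal_cooling_device_register("demo-fan", NULL,
                                                 &demo_fan_ops);
          return PTR_ERR_OR_ZERO(cdev);
  }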
+
+Thermal zone device sys I/F, created once it's registered::
+
+  /sys/class/thermal/thermal_zone[0-*]:
+    |---type:                   Type of the thermal zone
+    |---temp:                   Current temperature
+    |---mode:                   Working mode of the thermal zone
+    |---policy:                 Thermal governor used for this zone
+    |---available_policies:     Available thermal governors for this zone
+    |---trip_point_[0-*]_temp:  Trip point temperature
+    |---trip_point_[0-*]_type:  Trip point type
+    |---trip_point_[0-*]_hyst:  Hysteresis value for this trip point
+    |---emul_temp:              Emulated temperature set node
+    |---sustainable_power:      Sustainable dissipatable power
+    |---k_po:                   Proportional term during temperature overshoot
+    |---k_pu:                   Proportional term during temperature undershoot
+    |---k_i:                    PID's integral term in the power allocator gov
+    |---k_d:                    PID's derivative term in the power allocator
+    |---integral_cutoff:        Offset above which errors are accumulated
+    |---slope:                  Slope constant applied as linear extrapolation
+    |---offset:                 Offset constant applied as linear extrapolation
+
+Thermal cooling device sys I/F, created once it's registered::
+
+  /sys/class/thermal/cooling_device[0-*]:
+    |---type:                   Type of the cooling device (processor/fan/...)
+    |---max_state:              Maximum cooling state of the cooling device
+    |---cur_state:              Current cooling state of the cooling device
+    |---stats:                  Directory containing cooling device's statistics
+    |---stats/reset:            Writing any value resets the statistics
+    |---stats/time_in_state_ms: Time (msec) spent in various cooling states
+    |---stats/total_trans:      Total number of times cooling state is changed
+    |---stats/trans_table:      Cooling state transition table
+
+The next two dynamic attributes are created/removed in pairs. They represent
+the relationship between a thermal zone and its associated cooling device.
+They are created/removed for each successful execution of
+thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device.
+
+::
+
+  /sys/class/thermal/thermal_zone[0-*]:
+    |---cdev[0-*]:              [0-*]th cooling device in current thermal zone
+    |---cdev[0-*]_trip_point:   Trip point that cdev[0-*] is associated with
+    |---cdev[0-*]_weight:       Influence of the cooling device in
+                                this thermal zone
+
+Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
+the generic thermal driver also creates a hwmon sysfs I/F for each _type_
+of thermal zone device. E.g. the generic thermal driver registers one hwmon
+class device and builds the associated hwmon sysfs I/F for all the registered
+ACPI thermal zones.
+
+::
+
+  /sys/class/hwmon/hwmon[0-*]:
+    |---name:                   The type of the thermal zone devices
+    |---temp[1-*]_input:        The current temperature of thermal zone [1-*]
+    |---temp[1-*]_critical:     The critical trip point of thermal zone [1-*]
+
+Please read Documentation/hwmon/sysfs-interface.rst for additional information.
+
+Thermal zone attributes
+-----------------------
+
+type
+	String which represents the thermal zone type.
+	This is given by the thermal zone driver as part of registration.
+	E.g: "acpitz" indicates it's an ACPI thermal device.
+	In order to keep it consistent with the hwmon sysfs attribute, this
+	should be a short, lowercase string, containing neither spaces nor
+	dashes.
+	RO, Required
+
+temp
+	Current temperature as reported by the thermal zone (sensor).
+	Unit: millidegree Celsius
+	RO, Required
+
+mode
+	One of the predefined values in [enabled, disabled].
+	This file gives information about the algorithm that is currently
+	managing the thermal zone. It can be either the default kernel-based
+	algorithm or a user space application.
+ + enabled + enable Kernel Thermal management. + disabled + Preventing kernel thermal zone driver actions upon + trip points so that user application can take full + charge of the thermal management. + + RW, Optional + +policy + One of the various thermal governors used for a particular zone. + + RW, Required + +available_policies + Available thermal governors which can be used for a particular zone. + + RO, Required + +`trip_point_[0-*]_temp` + The temperature above which trip point will be fired. + + Unit: millidegree Celsius + + RO, Optional + +`trip_point_[0-*]_type` + Strings which indicate the type of the trip point. + + E.g. it can be one of critical, hot, passive, `active[0-*]` for ACPI + thermal zone. + + RO, Optional + +`trip_point_[0-*]_hyst` + The hysteresis value for a trip point, represented as an integer + Unit: Celsius + RW, Optional + +`cdev[0-*]` + Sysfs link to the thermal cooling device node where the sys I/F + for cooling device throttling control represents. + + RO, Optional + +`cdev[0-*]_trip_point` + The trip point in this thermal zone which `cdev[0-*]` is associated + with; -1 means the cooling device is not associated with any trip + point. + + RO, Optional + +`cdev[0-*]_weight` + The influence of `cdev[0-*]` in this thermal zone. This value + is relative to the rest of cooling devices in the thermal + zone. For example, if a cooling device has a weight double + than that of other, it's twice as effective in cooling the + thermal zone. + + RW, Optional + +passive + Attribute is only present for zones in which the passive cooling + policy is not supported by native thermal driver. Default is zero + and can be set to a temperature (in millidegrees) to enable a + passive trip point for the zone. Activation is done by polling with + an interval of 1 second. + + Unit: millidegrees Celsius + + Valid values: 0 (disabled) or greater than 1000 + + RW, Optional + +emul_temp + Interface to set the emulated temperature method in thermal zone + (sensor). After setting this temperature, the thermal zone may pass + this temperature to platform emulation function if registered or + cache it locally. This is useful in debugging different temperature + threshold and its associated cooling action. This is write only node + and writing 0 on this node should disable emulation. + Unit: millidegree Celsius + + WO, Optional + + WARNING: + Be careful while enabling this option on production systems, + because userland can easily disable the thermal policy by simply + flooding this sysfs node with low temperature values. + +sustainable_power + An estimate of the sustained power that can be dissipated by + the thermal zone. Used by the power allocator governor. For + more information see Documentation/thermal/power_allocator.rst + + Unit: milliwatts + + RW, Optional + +k_po + The proportional term of the power allocator governor's PID + controller during temperature overshoot. Temperature overshoot + is when the current temperature is above the "desired + temperature" trip point. For more information see + Documentation/thermal/power_allocator.rst + + RW, Optional + +k_pu + The proportional term of the power allocator governor's PID + controller during temperature undershoot. Temperature undershoot + is when the current temperature is below the "desired + temperature" trip point. For more information see + Documentation/thermal/power_allocator.rst + + RW, Optional + +k_i + The integral term of the power allocator governor's PID + controller. 
This term allows the PID controller to compensate + for long term drift. For more information see + Documentation/thermal/power_allocator.rst + + RW, Optional + +k_d + The derivative term of the power allocator governor's PID + controller. For more information see + Documentation/thermal/power_allocator.rst + + RW, Optional + +integral_cutoff + Temperature offset from the desired temperature trip point + above which the integral term of the power allocator + governor's PID controller starts accumulating errors. For + example, if integral_cutoff is 0, then the integral term only + accumulates error when temperature is above the desired + temperature trip point. For more information see + Documentation/thermal/power_allocator.rst + + Unit: millidegree Celsius + + RW, Optional + +slope + The slope constant used in a linear extrapolation model + to determine a hotspot temperature based off the sensor's + raw readings. It is up to the device driver to determine + the usage of these values. + + RW, Optional + +offset + The offset constant used in a linear extrapolation model + to determine a hotspot temperature based off the sensor's + raw readings. It is up to the device driver to determine + the usage of these values. + + RW, Optional + +Cooling device attributes +------------------------- + +type + String which represents the type of device, e.g: + + - for generic ACPI: should be "Fan", "Processor" or "LCD" + - for memory controller device on intel_menlow platform: + should be "Memory controller". + + RO, Required + +max_state + The maximum permissible cooling state of this cooling device. + + RO, Required + +cur_state + The current cooling state of this cooling device. + The value can any integer numbers between 0 and max_state: + + - cur_state == 0 means no cooling + - cur_state == max_state means the maximum cooling. + + RW, Required + +stats/reset + Writing any value resets the cooling device's statistics. + WO, Required + +stats/time_in_state_ms: + The amount of time spent by the cooling device in various cooling + states. The output will have "