From 67b886d290052dbf2bcfc876a5ae41a5fe461edf Mon Sep 17 00:00:00 2001 From: "Andrew F. Davis" Date: Thu, 21 Mar 2019 15:09:56 -0500 Subject: dma-buf: Remove leftover [un]map_atomic comments The map_atomic/unmap_atomic callbacks have been removed; remove the related comments. Fixes: f664a5269542 ("dma-buf: remove kmap_atomic interface") Signed-off-by: Andrew F. Davis Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20190321200957.16938-1-afd@ti.com --- include/linux/dma-buf.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 58725f890b5b..e4a8dab2bc54 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -39,11 +39,6 @@ struct dma_buf_attachment; /** * struct dma_buf_ops - operations possible on struct dma_buf - * @map_atomic: [optional] maps a page from the buffer into kernel address - * space, users may not block until the subsequent unmap call. - * This callback must not sleep. - * @unmap_atomic: [optional] unmaps a atomically mapped page from the buffer. - * This Callback must not sleep. * @map: [optional] maps a page from the buffer into kernel address space. * @unmap: [optional] unmaps a page from the buffer. * @vmap: [optional] creates a virtual mapping for the buffer into kernel -- cgit v1.2.3-59-g8ed1b From d5ae7712b7ffbb435e8f3d98f2123eff4734c77f Mon Sep 17 00:00:00 2001 From: "Andrew F. Davis" Date: Thu, 21 Mar 2019 15:09:57 -0500 Subject: dma-buf: Update [un]map documentation to match the other functions Other functions have inline documentation; a couple still have theirs at the top of the structure. Update the docs and move them inline. Signed-off-by: Andrew F. Davis Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20190321200957.16938-2-afd@ti.com --- include/linux/dma-buf.h | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index e4a8dab2bc54..a0bd071466fc 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -39,8 +39,6 @@ struct dma_buf_attachment; /** * struct dma_buf_ops - operations possible on struct dma_buf - * @map: [optional] maps a page from the buffer into kernel address space. - * @unmap: [optional] unmaps a page from the buffer. * @vmap: [optional] creates a virtual mapping for the buffer into kernel * address space. Same restrictions as for vmap and friends apply. * @vunmap: [optional] unmaps a vmap from the buffer @@ -200,8 +198,6 @@ struct dma_buf_ops { * to be restarted. */ int (*end_cpu_access)(struct dma_buf *, enum dma_data_direction); - void *(*map)(struct dma_buf *, unsigned long); - void (*unmap)(struct dma_buf *, unsigned long, void *); /** * @mmap: * @@ -240,6 +236,31 @@ struct dma_buf_ops { */ int (*mmap)(struct dma_buf *, struct vm_area_struct *vma); + /** + * @map: + * + * Maps a page from the buffer into kernel address space. The page is + * specified by offset into the buffer in PAGE_SIZE units. + * + * This callback is optional. + * + * Returns: + * + * Virtual address pointer where requested page can be accessed. NULL + * on error or when this function is unimplemented by the exporter. + */ + void *(*map)(struct dma_buf *, unsigned long); + + /** + * @unmap: + * + * Unmaps a page from the buffer. Page offset and address pointer should + * be the same as the one passed to and returned by matching call to map. + * + * This callback is optional.
+ */ + void (*unmap)(struct dma_buf *, unsigned long, void *); + void *(*vmap)(struct dma_buf *); void (*vunmap)(struct dma_buf *, void *vaddr); }; -- cgit v1.2.3-59-g8ed1b From 09ed79d6d75f06cc963a78f25463251b0a758dc7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 7 May 2019 10:01:47 -0700 Subject: percpu_ref: introduce PERCPU_REF_ALLOW_REINIT flag In most cases percpu reference counters are not switched to the percpu mode after they reach the atomic mode. Some obvious exceptions are reference counters which are initialized into the atomic mode (using PERCPU_REF_INIT_ATOMIC and PERCPU_REF_INIT_DEAD flags), and there are a few other exceptions. But in most cases there is no way back, and once the reference counter is switched to the atomic mode, there is no reason to wait for percpu_ref_exit() to release the percpu memory. Of course, the size of a single counter is not so big, but because it can pin the whole percpu block in memory, the memory footprint can be noticeable (e.g. on my 32-CPU machine a percpu block is 8 MB large). To allow releasing the percpu memory as early as possible, let's introduce the PERCPU_REF_ALLOW_REINIT flag with the following semantics: it has to be set in order to switch a percpu reference counter to the percpu mode after the initialization. The PERCPU_REF_INIT_ATOMIC and PERCPU_REF_INIT_DEAD flags will implicitly assume PERCPU_REF_ALLOW_REINIT. This patch doesn't introduce any functional change, to avoid any regressions. That will be done later in the patchset, after adjusting all call sites that revive percpu counters. Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Signed-off-by: Dennis Zhou --- include/linux/percpu-refcount.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b297cd1cd4f1..0f0240af8520 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -75,14 +75,21 @@ enum { * operation using percpu_ref_switch_to_percpu(). If initialized * with this flag, the ref will stay in atomic mode until * percpu_ref_switch_to_percpu() is invoked on it. + * Implies ALLOW_REINIT. */ PERCPU_REF_INIT_ATOMIC = 1 << 0, /* * Start dead w/ ref == 0 in atomic mode. Must be revived with - * percpu_ref_reinit() before used. Implies INIT_ATOMIC. + * percpu_ref_reinit() before used. Implies INIT_ATOMIC and + * ALLOW_REINIT. */ PERCPU_REF_INIT_DEAD = 1 << 1, + + /* + * Allow switching from atomic mode to percpu mode. + */ + PERCPU_REF_ALLOW_REINIT = 1 << 2, }; struct percpu_ref { -- cgit v1.2.3-59-g8ed1b From 7d9ab9b6adffd9c474c1274acb5f6208f9a09cf3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 7 May 2019 10:01:50 -0700 Subject: percpu_ref: release percpu memory early without PERCPU_REF_ALLOW_REINIT Release the percpu memory after finishing the switch to the atomic mode, but only if PERCPU_REF_ALLOW_REINIT isn't set.
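A minimal usage sketch of what the flag is for (not taken from the tree; obj and example_release are hypothetical):

	#include <linux/percpu-refcount.h>

	static void example_release(struct percpu_ref *ref)
	{
		/* hypothetical: free the structure embedding @ref */
	}

	/*
	 * Opt in at init time if the ref may ever be revived; without
	 * ALLOW_REINIT the percpu memory is freed once the ref goes atomic.
	 */
	ret = percpu_ref_init(&obj->ref, example_release,
			      PERCPU_REF_ALLOW_REINIT, GFP_KERNEL);
	if (ret)
		return ret;

	percpu_ref_kill(&obj->ref);	/* switches the ref to atomic mode */
	percpu_ref_reinit(&obj->ref);	/* legal only with ALLOW_REINIT set */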
Signed-off-by: Roman Gushchin Acked-by: Tejun Heo Signed-off-by: Dennis Zhou --- include/linux/percpu-refcount.h | 1 + lib/percpu-refcount.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 0f0240af8520..7aef0abc194a 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -102,6 +102,7 @@ struct percpu_ref { percpu_ref_func_t *release; percpu_ref_func_t *confirm_switch; bool force_atomic:1; + bool allow_reinit:1; struct rcu_head rcu; }; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9877682e49c7..501b517bd3db 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -69,11 +69,14 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, return -ENOMEM; ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; + ref->allow_reinit = flags & PERCPU_REF_ALLOW_REINIT; - if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) + if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) { ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; - else + ref->allow_reinit = true; + } else { start_count += PERCPU_COUNT_BIAS; + } if (flags & PERCPU_REF_INIT_DEAD) ref->percpu_count_ptr |= __PERCPU_REF_DEAD; @@ -119,6 +122,9 @@ static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) ref->confirm_switch = NULL; wake_up_all(&percpu_ref_switch_waitq); + if (!ref->allow_reinit) + percpu_ref_exit(ref); + /* drop ref from percpu_ref_switch_to_atomic() */ percpu_ref_put(ref); } @@ -194,6 +200,9 @@ static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) return; + if (WARN_ON_ONCE(!ref->allow_reinit)) + return; + atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); /* -- cgit v1.2.3-59-g8ed1b From ec9964b4803300fb86f8e8fd9b421e59f7a71dc5 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:34 +0200 Subject: Platform: OLPC: Move EC-specific functionality out from x86 Move the olpc-ec driver away from the X86 OLPC platform so that it can be used by the ARM based laptops too. Notably, the driver for the OLPC battery, which is also used on the ARM models, builds on this driver's interface. It is actually platform independent: the OLPC EC commands with their arguments and responses are mostly the same, even though the delivery mechanism is different.
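A sketch of a caller of that interface, assuming the usual big-endian EC response for EC_BAT_VOLTAGE (0x10, per olpc_battery.c); the wrapper itself is hypothetical:

	#include <linux/olpc-ec.h>

	static int example_read_battery_voltage(u16 *mv)
	{
		__be16 word;
		int ret;

		/* Same call on x86 and ARM; only the registered EC driver
		 * that delivers the command differs. */
		ret = olpc_ec_cmd(0x10, NULL, 0, (u8 *)&word, 2);
		if (ret == 0)
			*mv = be16_to_cpu(word);
		return ret;
	}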
Signed-off-by: Lubomir Rintel Acked-by: Pavel Machek Signed-off-by: Andy Shevchenko --- arch/x86/include/asm/olpc.h | 31 ---------- arch/x86/platform/olpc/olpc.c | 119 ++++++------------------------------ drivers/platform/olpc/olpc-ec.c | 99 +++++++++++++++++++++++++++++- drivers/power/supply/olpc_battery.c | 1 - include/linux/olpc-ec.h | 32 +++++++++- 5 files changed, 145 insertions(+), 137 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index c2bf1de5d901..6fe76282aceb 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -9,12 +9,10 @@ struct olpc_platform_t { int flags; uint32_t boardrev; - int ecver; }; #define OLPC_F_PRESENT 0x01 #define OLPC_F_DCON 0x02 -#define OLPC_F_EC_WIDE_SCI 0x04 #ifdef CONFIG_OLPC @@ -64,13 +62,6 @@ static inline int olpc_board_at_least(uint32_t rev) return olpc_platform_info.boardrev >= rev; } -extern void olpc_ec_wakeup_set(u16 value); -extern void olpc_ec_wakeup_clear(u16 value); -extern bool olpc_ec_wakeup_available(void); - -extern int olpc_ec_mask_write(u16 bits); -extern int olpc_ec_sci_query(u16 *sci_value); - #else static inline int machine_is_olpc(void) @@ -83,14 +74,6 @@ static inline int olpc_has_dcon(void) return 0; } -static inline void olpc_ec_wakeup_set(u16 value) { } -static inline void olpc_ec_wakeup_clear(u16 value) { } - -static inline bool olpc_ec_wakeup_available(void) -{ - return false; -} - #endif #ifdef CONFIG_OLPC_XO1_PM @@ -101,20 +84,6 @@ extern void olpc_xo1_pm_wakeup_clear(u16 value); extern int pci_olpc_init(void); -/* SCI source values */ - -#define EC_SCI_SRC_EMPTY 0x00 -#define EC_SCI_SRC_GAME 0x01 -#define EC_SCI_SRC_BATTERY 0x02 -#define EC_SCI_SRC_BATSOC 0x04 -#define EC_SCI_SRC_BATERR 0x08 -#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */ -#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */ -#define EC_SCI_SRC_ACPWR 0x40 -#define EC_SCI_SRC_BATCRIT 0x80 -#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */ -#define EC_SCI_SRC_ALL 0x1FF - /* GPIO assignments */ #define OLPC_GPIO_MIC_AC 1 diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index f0e920fb98ad..c6c62b4f251f 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -30,9 +30,6 @@ struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); -/* EC event mask to be applied during suspend (defining wakeup sources). */ -static u16 ec_wakeup_mask; - /* what the timeout *should* be (in ms) */ #define EC_BASE_TIMEOUT 20 @@ -186,83 +183,6 @@ err: return ret; } -void olpc_ec_wakeup_set(u16 value) -{ - ec_wakeup_mask |= value; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set); - -void olpc_ec_wakeup_clear(u16 value) -{ - ec_wakeup_mask &= ~value; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear); - -/* - * Returns true if the compile and runtime configurations allow for EC events - * to wake the system. 
- */ -bool olpc_ec_wakeup_available(void) -{ - if (!machine_is_olpc()) - return false; - - /* - * XO-1 EC wakeups are available when olpc-xo1-sci driver is - * compiled in - */ -#ifdef CONFIG_OLPC_XO1_SCI - if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */ - return true; -#endif - - /* - * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is - * compiled in - */ -#ifdef CONFIG_OLPC_XO15_SCI - if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */ - return true; -#endif - - return false; -} -EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available); - -int olpc_ec_mask_write(u16 bits) -{ - if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) { - __be16 ec_word = cpu_to_be16(bits); - return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2, - NULL, 0); - } else { - unsigned char ec_byte = bits & 0xff; - return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0); - } -} -EXPORT_SYMBOL_GPL(olpc_ec_mask_write); - -int olpc_ec_sci_query(u16 *sci_value) -{ - int ret; - - if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) { - __be16 ec_word; - ret = olpc_ec_cmd(EC_EXT_SCI_QUERY, - NULL, 0, (void *) &ec_word, 2); - if (ret == 0) - *sci_value = be16_to_cpu(ec_word); - } else { - unsigned char ec_byte; - ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1); - if (ret == 0) - *sci_value = ec_byte; - } - - return ret; -} -EXPORT_SYMBOL_GPL(olpc_ec_sci_query); - static bool __init check_ofw_architecture(struct device_node *root) { const char *olpc_arch; @@ -296,6 +216,10 @@ static bool __init platform_detect(void) if (success) { olpc_platform_info.boardrev = get_board_revision(root); olpc_platform_info.flags |= OLPC_F_PRESENT; + + pr_info("OLPC board revision %s%X\n", + ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", + olpc_platform_info.boardrev >> 4); } of_node_put(root); @@ -315,27 +239,8 @@ static int __init add_xo1_platform_devices(void) return PTR_ERR_OR_ZERO(pdev); } -static int olpc_xo1_ec_probe(struct platform_device *pdev) -{ - /* get the EC revision */ - olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, - (unsigned char *) &olpc_platform_info.ecver, 1); - - /* EC version 0x5f adds support for wide SCI mask */ - if (olpc_platform_info.ecver >= 0x5f) - olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI; - - pr_info("OLPC board revision %s%X (EC=%x)\n", - ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", - olpc_platform_info.boardrev >> 4, - olpc_platform_info.ecver); - - return 0; -} static int olpc_xo1_ec_suspend(struct platform_device *pdev) { - olpc_ec_mask_write(ec_wakeup_mask); - /* * Squelch SCIs while suspended. This is a fix for * . 
@@ -359,15 +264,27 @@ static int olpc_xo1_ec_resume(struct platform_device *pdev) } static struct olpc_ec_driver ec_xo1_driver = { - .probe = olpc_xo1_ec_probe, .suspend = olpc_xo1_ec_suspend, .resume = olpc_xo1_ec_resume, .ec_cmd = olpc_xo1_ec_cmd, +#ifdef CONFIG_OLPC_XO1_SCI + /* + * XO-1 EC wakeups are available when olpc-xo1-sci driver is + * compiled in + */ + .wakeup_available = true, +#endif }; static struct olpc_ec_driver ec_xo1_5_driver = { - .probe = olpc_xo1_ec_probe, .ec_cmd = olpc_xo1_ec_cmd, +#ifdef CONFIG_OLPC_XO1_5_SCI + /* + * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is + * compiled in + */ + .wakeup_available = true, +#endif }; static int __init olpc_init(void) diff --git a/drivers/platform/olpc/olpc-ec.c b/drivers/platform/olpc/olpc-ec.c index 981955dce926..2a647455a368 100644 --- a/drivers/platform/olpc/olpc-ec.c +++ b/drivers/platform/olpc/olpc-ec.c @@ -32,6 +32,7 @@ struct ec_cmd_desc { struct olpc_ec_priv { struct olpc_ec_driver *drv; + u8 version; struct work_struct worker; struct mutex cmd_lock; @@ -41,6 +42,12 @@ struct olpc_ec_priv { struct dentry *dbgfs_dir; + /* + * EC event mask to be applied during suspend (defining wakeup + * sources). + */ + u16 ec_wakeup_mask; + /* * Running an EC command while suspending means we don't always finish * the command before the machine suspends. This means that the EC @@ -149,6 +156,88 @@ int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen) } EXPORT_SYMBOL_GPL(olpc_ec_cmd); +void olpc_ec_wakeup_set(u16 value) +{ + struct olpc_ec_priv *ec = ec_priv; + + if (WARN_ON(!ec)) + return; + + ec->ec_wakeup_mask |= value; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set); + +void olpc_ec_wakeup_clear(u16 value) +{ + struct olpc_ec_priv *ec = ec_priv; + + if (WARN_ON(!ec)) + return; + + ec->ec_wakeup_mask &= ~value; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear); + +int olpc_ec_mask_write(u16 bits) +{ + struct olpc_ec_priv *ec = ec_priv; + + if (WARN_ON(!ec)) + return -ENODEV; + + /* EC version 0x5f adds support for wide SCI mask */ + if (ec->version >= 0x5f) { + __be16 ec_word = cpu_to_be16(bits); + + return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *)&ec_word, 2, NULL, 0); + } else { + u8 ec_byte = bits & 0xff; + + return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0); + } +} +EXPORT_SYMBOL_GPL(olpc_ec_mask_write); + +/* + * Returns true if the compile and runtime configurations allow for EC events + * to wake the system. + */ +bool olpc_ec_wakeup_available(void) +{ + if (WARN_ON(!ec_driver)) + return false; + + return ec_driver->wakeup_available; +} +EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available); + +int olpc_ec_sci_query(u16 *sci_value) +{ + struct olpc_ec_priv *ec = ec_priv; + int ret; + + if (WARN_ON(!ec)) + return -ENODEV; + + /* EC version 0x5f adds support for wide SCI mask */ + if (ec->version >= 0x5f) { + __be16 ec_word; + + ret = olpc_ec_cmd(EC_EXT_SCI_QUERY, NULL, 0, (void *)&ec_word, 2); + if (ret == 0) + *sci_value = be16_to_cpu(ec_word); + } else { + u8 ec_byte; + + ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1); + if (ret == 0) + *sci_value = ec_byte; + } + + return ret; +} +EXPORT_SYMBOL_GPL(olpc_ec_sci_query); + #ifdef CONFIG_DEBUG_FS /* @@ -276,14 +365,16 @@ static int olpc_ec_probe(struct platform_device *pdev) ec_priv = ec; platform_set_drvdata(pdev, ec); - err = ec_driver->probe ? 
ec_driver->probe(pdev) : 0; + /* get the EC revision */ + err = olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, &ec->version, 1); if (err) { ec_priv = NULL; kfree(ec); - } else { - ec->dbgfs_dir = olpc_ec_setup_debugfs(); + return err; } + ec->dbgfs_dir = olpc_ec_setup_debugfs(); + return err; } @@ -293,6 +384,8 @@ static int olpc_ec_suspend(struct device *dev) struct olpc_ec_priv *ec = platform_get_drvdata(pdev); int err = 0; + olpc_ec_mask_write(ec->ec_wakeup_mask); + if (ec_driver->suspend) err = ec_driver->suspend(pdev); if (!err) diff --git a/drivers/power/supply/olpc_battery.c b/drivers/power/supply/olpc_battery.c index 7720e4c2ac0b..066ec9a11153 100644 --- a/drivers/power/supply/olpc_battery.c +++ b/drivers/power/supply/olpc_battery.c @@ -20,7 +20,6 @@ #include #include #include -#include #define EC_BAT_VOLTAGE 0x10 /* uint16_t, *9.76/32, mV */ diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h index 79bdc6328c52..7fa3d27f7fee 100644 --- a/include/linux/olpc-ec.h +++ b/include/linux/olpc-ec.h @@ -16,14 +16,28 @@ #define EC_SCI_QUERY 0x84 #define EC_EXT_SCI_QUERY 0x85 +/* SCI source values */ +#define EC_SCI_SRC_EMPTY 0x00 +#define EC_SCI_SRC_GAME 0x01 +#define EC_SCI_SRC_BATTERY 0x02 +#define EC_SCI_SRC_BATSOC 0x04 +#define EC_SCI_SRC_BATERR 0x08 +#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */ +#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */ +#define EC_SCI_SRC_ACPWR 0x40 +#define EC_SCI_SRC_BATCRIT 0x80 +#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */ +#define EC_SCI_SRC_ALL 0x1FF + struct platform_device; struct olpc_ec_driver { - int (*probe)(struct platform_device *); int (*suspend)(struct platform_device *); int (*resume)(struct platform_device *); int (*ec_cmd)(u8, u8 *, size_t, u8 *, size_t, void *); + + bool wakeup_available; }; #ifdef CONFIG_OLPC @@ -33,11 +47,27 @@ extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg); extern int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen); +extern void olpc_ec_wakeup_set(u16 value); +extern void olpc_ec_wakeup_clear(u16 value); + +extern int olpc_ec_mask_write(u16 bits); +extern int olpc_ec_sci_query(u16 *sci_value); + +extern bool olpc_ec_wakeup_available(void); + #else static inline int olpc_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *outbuf, size_t outlen) { return -ENODEV; } +static inline void olpc_ec_wakeup_set(u16 value) { } +static inline void olpc_ec_wakeup_clear(u16 value) { } + +static inline bool olpc_ec_wakeup_available(void) +{ + return false; +} + #endif /* CONFIG_OLPC */ #endif /* _LINUX_OLPC_EC_H */ -- cgit v1.2.3-59-g8ed1b From 8097548f3af9ec990169574ad9d874052b78bff8 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:36 +0200 Subject: Platform: OLPC: Use BIT() and GENMASK() for event masks Just a cosmetic tidy-up. 
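The numerical values are unchanged (the constants merely become unsigned long), since BIT(n) expands to (1UL << (n)) and GENMASK(h, l) sets bits h through l inclusive; for example:

	EC_SCI_SRC_EBOOK == BIT(4)        == 0x10
	EC_SCI_SRC_ALL   == GENMASK(8, 0) == 0x1FF

The only casualty is the zero-valued EC_SCI_SRC_EMPTY, which has no bit of its own and is dropped.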
Signed-off-by: Lubomir Rintel Reviewed-by: Andy Shevchenko Signed-off-by: Andy Shevchenko --- include/linux/olpc-ec.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h index 7fa3d27f7fee..f7b6a7eda232 100644 --- a/include/linux/olpc-ec.h +++ b/include/linux/olpc-ec.h @@ -2,6 +2,8 @@ #ifndef _LINUX_OLPC_EC_H #define _LINUX_OLPC_EC_H +#include + /* XO-1 EC commands */ #define EC_FIRMWARE_REV 0x08 #define EC_WRITE_SCI_MASK 0x1b @@ -17,17 +19,16 @@ #define EC_EXT_SCI_QUERY 0x85 /* SCI source values */ -#define EC_SCI_SRC_EMPTY 0x00 -#define EC_SCI_SRC_GAME 0x01 -#define EC_SCI_SRC_BATTERY 0x02 -#define EC_SCI_SRC_BATSOC 0x04 -#define EC_SCI_SRC_BATERR 0x08 -#define EC_SCI_SRC_EBOOK 0x10 /* XO-1 only */ -#define EC_SCI_SRC_WLAN 0x20 /* XO-1 only */ -#define EC_SCI_SRC_ACPWR 0x40 -#define EC_SCI_SRC_BATCRIT 0x80 -#define EC_SCI_SRC_GPWAKE 0x100 /* XO-1.5 only */ -#define EC_SCI_SRC_ALL 0x1FF +#define EC_SCI_SRC_GAME BIT(0) +#define EC_SCI_SRC_BATTERY BIT(1) +#define EC_SCI_SRC_BATSOC BIT(2) +#define EC_SCI_SRC_BATERR BIT(3) +#define EC_SCI_SRC_EBOOK BIT(4) /* XO-1 only */ +#define EC_SCI_SRC_WLAN BIT(5) /* XO-1 only */ +#define EC_SCI_SRC_ACPWR BIT(6) +#define EC_SCI_SRC_BATCRIT BIT(7) +#define EC_SCI_SRC_GPWAKE BIT(8) /* XO-1.5 only */ +#define EC_SCI_SRC_ALL GENMASK(8, 0) struct platform_device; -- cgit v1.2.3-59-g8ed1b From 0c3d931b3ab9efeea4948b5373c62095449d0101 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Mon, 13 May 2019 09:56:37 +0200 Subject: Platform: OLPC: Add XO-1.75 EC driver It's based off the driver from the OLPC kernel sources. Somewhat modernized and cleaned up, for better or worse. Modified to plug into the olpc-ec driver infrastructure (so that battery interface and debugfs could be reused) and the SPI slave framework. Signed-off-by: Lubomir Rintel Reviewed-by: Andy Shevchenko Signed-off-by: Andy Shevchenko --- arch/x86/Kconfig | 1 + drivers/platform/Kconfig | 2 + drivers/platform/Makefile | 2 +- drivers/platform/olpc/Kconfig | 14 + drivers/platform/olpc/Makefile | 3 +- drivers/platform/olpc/olpc-xo175-ec.c | 752 ++++++++++++++++++++++++++++++++++ include/linux/olpc-ec.h | 4 +- 7 files changed, 774 insertions(+), 4 deletions(-) create mode 100644 drivers/platform/olpc/Kconfig create mode 100644 drivers/platform/olpc/olpc-xo175-ec.c (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bbbd4d1ba31..cb1c073b3c7e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2698,6 +2698,7 @@ config OLPC select OF select OF_PROMTREE select IRQ_DOMAIN + select OLPC_EC ---help--- Add support for detecting the unique features of the OLPC XO hardware. 
diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig index d4c2e424a700..4313d73d3618 100644 --- a/drivers/platform/Kconfig +++ b/drivers/platform/Kconfig @@ -10,3 +10,5 @@ source "drivers/platform/goldfish/Kconfig" source "drivers/platform/chrome/Kconfig" source "drivers/platform/mellanox/Kconfig" + +source "drivers/platform/olpc/Kconfig" diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile index 4b2ce58bcd9c..6fda58c021ca 100644 --- a/drivers/platform/Makefile +++ b/drivers/platform/Makefile @@ -6,6 +6,6 @@ obj-$(CONFIG_X86) += x86/ obj-$(CONFIG_MELLANOX_PLATFORM) += mellanox/ obj-$(CONFIG_MIPS) += mips/ -obj-$(CONFIG_OLPC) += olpc/ +obj-$(CONFIG_OLPC_EC) += olpc/ obj-$(CONFIG_GOLDFISH) += goldfish/ obj-$(CONFIG_CHROME_PLATFORMS) += chrome/ diff --git a/drivers/platform/olpc/Kconfig b/drivers/platform/olpc/Kconfig new file mode 100644 index 000000000000..559f843199d7 --- /dev/null +++ b/drivers/platform/olpc/Kconfig @@ -0,0 +1,14 @@ +config OLPC_EC + bool + +config OLPC_XO175_EC + tristate "OLPC XO 1.75 Embedded Controller" + depends on ARCH_MMP || COMPILE_TEST + select SPI_SLAVE + select OLPC_EC + help + Include support for the OLPC XO Embedded Controller (EC). The EC + provides various platform services, including support for the power, + button, restart, shutdown and battery charging status. + + Unless you have an OLPC XO laptop, you will want to say N. diff --git a/drivers/platform/olpc/Makefile b/drivers/platform/olpc/Makefile index dc8b26bc7209..01fe6ba01665 100644 --- a/drivers/platform/olpc/Makefile +++ b/drivers/platform/olpc/Makefile @@ -1,4 +1,5 @@ # # OLPC XO platform-specific drivers # -obj-$(CONFIG_OLPC) += olpc-ec.o +obj-$(CONFIG_OLPC_EC) += olpc-ec.o +obj-$(CONFIG_OLPC_XO175_EC) += olpc-xo175-ec.o diff --git a/drivers/platform/olpc/olpc-xo175-ec.c b/drivers/platform/olpc/olpc-xo175-ec.c new file mode 100644 index 000000000000..344d14f3da54 --- /dev/null +++ b/drivers/platform/olpc/olpc-xo175-ec.c @@ -0,0 +1,752 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Driver for the OLPC XO-1.75 Embedded Controller. + * + * The EC protocol is documented at: + * http://wiki.laptop.org/go/XO_1.75_HOST_to_EC_Protocol + * + * Copyright (C) 2010 One Laptop per Child Foundation. 
+ * Copyright (C) 2018 Lubomir Rintel + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ec_cmd_t { + u8 cmd; + u8 bytes_returned; +}; + +enum ec_chan_t { + CHAN_NONE = 0, + CHAN_SWITCH, + CHAN_CMD_RESP, + CHAN_KEYBOARD, + CHAN_TOUCHPAD, + CHAN_EVENT, + CHAN_DEBUG, + CHAN_CMD_ERROR, +}; + +/* + * EC events + */ +#define EVENT_AC_CHANGE 1 /* AC plugged/unplugged */ +#define EVENT_BATTERY_STATUS 2 /* Battery low/full/error/gone */ +#define EVENT_BATTERY_CRITICAL 3 /* Battery critical voltage */ +#define EVENT_BATTERY_SOC_CHANGE 4 /* 1% SOC Change */ +#define EVENT_BATTERY_ERROR 5 /* Abnormal error, query for cause */ +#define EVENT_POWER_PRESSED 6 /* Power button was pressed */ +#define EVENT_POWER_PRESS_WAKE 7 /* Woken up with a power button */ +#define EVENT_TIMED_HOST_WAKE 8 /* Host wake timer */ +#define EVENT_OLS_HIGH_LIMIT 9 /* OLS crossed dark threshold */ +#define EVENT_OLS_LOW_LIMIT 10 /* OLS crossed light threshold */ + +/* + * EC commands + * (from http://dev.laptop.org/git/users/rsmith/ec-1.75/tree/ec_cmd.h) + */ +#define CMD_GET_API_VERSION 0x08 /* out: u8 */ +#define CMD_READ_VOLTAGE 0x10 /* out: u16, *9.76/32, mV */ +#define CMD_READ_CURRENT 0x11 /* out: s16, *15.625/120, mA */ +#define CMD_READ_ACR 0x12 /* out: s16, *6250/15, uAh */ +#define CMD_READ_BATT_TEMPERATURE 0x13 /* out: u16, *100/256, deg C */ +#define CMD_READ_AMBIENT_TEMPERATURE 0x14 /* unimplemented, no hardware */ +#define CMD_READ_BATTERY_STATUS 0x15 /* out: u8, bitmask */ +#define CMD_READ_SOC 0x16 /* out: u8, percentage */ +#define CMD_READ_GAUGE_ID 0x17 /* out: u8 * 8 */ +#define CMD_READ_GAUGE_DATA 0x18 /* in: u8 addr, out: u8 data */ +#define CMD_READ_BOARD_ID 0x19 /* out: u16 (platform id) */ +#define CMD_READ_BATT_ERR_CODE 0x1f /* out: u8, error bitmask */ +#define CMD_SET_DCON_POWER 0x26 /* in: u8 */ +#define CMD_RESET_EC 0x28 /* none */ +#define CMD_READ_BATTERY_TYPE 0x2c /* out: u8 */ +#define CMD_SET_AUTOWAK 0x33 /* out: u8 */ +#define CMD_SET_EC_WAKEUP_TIMER 0x36 /* in: u32, out: ? */ +#define CMD_READ_EXT_SCI_MASK 0x37 /* ? */ +#define CMD_WRITE_EXT_SCI_MASK 0x38 /* ? */ +#define CMD_CLEAR_EC_WAKEUP_TIMER 0x39 /* none */ +#define CMD_ENABLE_RUNIN_DISCHARGE 0x3B /* none */ +#define CMD_DISABLE_RUNIN_DISCHARGE 0x3C /* none */ +#define CMD_READ_MPPT_ACTIVE 0x3d /* out: u8 */ +#define CMD_READ_MPPT_LIMIT 0x3e /* out: u8 */ +#define CMD_SET_MPPT_LIMIT 0x3f /* in: u8 */ +#define CMD_DISABLE_MPPT 0x40 /* none */ +#define CMD_ENABLE_MPPT 0x41 /* none */ +#define CMD_READ_VIN 0x42 /* out: u16 */ +#define CMD_EXT_SCI_QUERY 0x43 /* ? */ +#define RSP_KEYBOARD_DATA 0x48 /* ? */ +#define RSP_TOUCHPAD_DATA 0x49 /* ? */ +#define CMD_GET_FW_VERSION 0x4a /* out: u8 * 16 */ +#define CMD_POWER_CYCLE 0x4b /* none */ +#define CMD_POWER_OFF 0x4c /* none */ +#define CMD_RESET_EC_SOFT 0x4d /* none */ +#define CMD_READ_GAUGE_U16 0x4e /* ? */ +#define CMD_ENABLE_MOUSE 0x4f /* ? 
*/ +#define CMD_ECHO 0x52 /* in: u8 * 5, out: u8 * 5 */ +#define CMD_GET_FW_DATE 0x53 /* out: u8 * 16 */ +#define CMD_GET_FW_USER 0x54 /* out: u8 * 16 */ +#define CMD_TURN_OFF_POWER 0x55 /* none (same as 0x4c) */ +#define CMD_READ_OLS 0x56 /* out: u16 */ +#define CMD_OLS_SMT_LEDON 0x57 /* none */ +#define CMD_OLS_SMT_LEDOFF 0x58 /* none */ +#define CMD_START_OLS_ASSY 0x59 /* none */ +#define CMD_STOP_OLS_ASSY 0x5a /* none */ +#define CMD_OLS_SMTTEST_STOP 0x5b /* none */ +#define CMD_READ_VIN_SCALED 0x5c /* out: u16 */ +#define CMD_READ_BAT_MIN_W 0x5d /* out: u16 */ +#define CMD_READ_BAR_MAX_W 0x5e /* out: u16 */ +#define CMD_RESET_BAT_MINMAX_W 0x5f /* none */ +#define CMD_READ_LOCATION 0x60 /* in: u16 addr, out: u8 data */ +#define CMD_WRITE_LOCATION 0x61 /* in: u16 addr, u8 data */ +#define CMD_KEYBOARD_CMD 0x62 /* in: u8, out: ? */ +#define CMD_TOUCHPAD_CMD 0x63 /* in: u8, out: ? */ +#define CMD_GET_FW_HASH 0x64 /* out: u8 * 16 */ +#define CMD_SUSPEND_HINT 0x65 /* in: u8 */ +#define CMD_ENABLE_WAKE_TIMER 0x66 /* in: u8 */ +#define CMD_SET_WAKE_TIMER 0x67 /* in: 32 */ +#define CMD_ENABLE_WAKE_AUTORESET 0x68 /* in: u8 */ +#define CMD_OLS_SET_LIMITS 0x69 /* in: u16, u16 */ +#define CMD_OLS_GET_LIMITS 0x6a /* out: u16, u16 */ +#define CMD_OLS_SET_CEILING 0x6b /* in: u16 */ +#define CMD_OLS_GET_CEILING 0x6c /* out: u16 */ + +/* + * Accepted EC commands, and how many bytes they return. There are plenty + * of EC commands that are no longer implemented, or are implemented only on + * certain older boards. + */ +static const struct ec_cmd_t olpc_xo175_ec_cmds[] = { + { CMD_GET_API_VERSION, 1 }, + { CMD_READ_VOLTAGE, 2 }, + { CMD_READ_CURRENT, 2 }, + { CMD_READ_ACR, 2 }, + { CMD_READ_BATT_TEMPERATURE, 2 }, + { CMD_READ_BATTERY_STATUS, 1 }, + { CMD_READ_SOC, 1 }, + { CMD_READ_GAUGE_ID, 8 }, + { CMD_READ_GAUGE_DATA, 1 }, + { CMD_READ_BOARD_ID, 2 }, + { CMD_READ_BATT_ERR_CODE, 1 }, + { CMD_SET_DCON_POWER, 0 }, + { CMD_RESET_EC, 0 }, + { CMD_READ_BATTERY_TYPE, 1 }, + { CMD_ENABLE_RUNIN_DISCHARGE, 0 }, + { CMD_DISABLE_RUNIN_DISCHARGE, 0 }, + { CMD_READ_MPPT_ACTIVE, 1 }, + { CMD_READ_MPPT_LIMIT, 1 }, + { CMD_SET_MPPT_LIMIT, 0 }, + { CMD_DISABLE_MPPT, 0 }, + { CMD_ENABLE_MPPT, 0 }, + { CMD_READ_VIN, 2 }, + { CMD_GET_FW_VERSION, 16 }, + { CMD_POWER_CYCLE, 0 }, + { CMD_POWER_OFF, 0 }, + { CMD_RESET_EC_SOFT, 0 }, + { CMD_ECHO, 5 }, + { CMD_GET_FW_DATE, 16 }, + { CMD_GET_FW_USER, 16 }, + { CMD_TURN_OFF_POWER, 0 }, + { CMD_READ_OLS, 2 }, + { CMD_OLS_SMT_LEDON, 0 }, + { CMD_OLS_SMT_LEDOFF, 0 }, + { CMD_START_OLS_ASSY, 0 }, + { CMD_STOP_OLS_ASSY, 0 }, + { CMD_OLS_SMTTEST_STOP, 0 }, + { CMD_READ_VIN_SCALED, 2 }, + { CMD_READ_BAT_MIN_W, 2 }, + { CMD_READ_BAR_MAX_W, 2 }, + { CMD_RESET_BAT_MINMAX_W, 0 }, + { CMD_READ_LOCATION, 1 }, + { CMD_WRITE_LOCATION, 0 }, + { CMD_GET_FW_HASH, 16 }, + { CMD_SUSPEND_HINT, 0 }, + { CMD_ENABLE_WAKE_TIMER, 0 }, + { CMD_SET_WAKE_TIMER, 0 }, + { CMD_ENABLE_WAKE_AUTORESET, 0 }, + { CMD_OLS_SET_LIMITS, 0 }, + { CMD_OLS_GET_LIMITS, 4 }, + { CMD_OLS_SET_CEILING, 0 }, + { CMD_OLS_GET_CEILING, 2 }, + { CMD_READ_EXT_SCI_MASK, 2 }, + { CMD_WRITE_EXT_SCI_MASK, 0 }, + + { } +}; + +#define EC_MAX_CMD_DATA_LEN 5 +#define EC_MAX_RESP_LEN 16 + +#define LOG_BUF_SIZE 128 + +#define PM_WAKEUP_TIME 1000 + +#define EC_ALL_EVENTS GENMASK(15, 0) + +enum ec_state_t { + CMD_STATE_IDLE = 0, + CMD_STATE_WAITING_FOR_SWITCH, + CMD_STATE_CMD_IN_TX_FIFO, + CMD_STATE_CMD_SENT, + CMD_STATE_RESP_RECEIVED, + CMD_STATE_ERROR_RECEIVED, +}; + +struct olpc_xo175_ec_cmd { + u8 command; + u8 nr_args; + u8 data_len; + u8 
args[EC_MAX_CMD_DATA_LEN]; +}; + +struct olpc_xo175_ec_resp { + u8 channel; + u8 byte; +}; + +struct olpc_xo175_ec { + bool suspended; + + /* SPI related stuff. */ + struct spi_device *spi; + struct spi_transfer xfer; + struct spi_message msg; + union { + struct olpc_xo175_ec_cmd cmd; + struct olpc_xo175_ec_resp resp; + } tx_buf, rx_buf; + + /* GPIO for the CMD signals. */ + struct gpio_desc *gpio_cmd; + + /* Command handling related state. */ + spinlock_t cmd_state_lock; + int cmd_state; + bool cmd_running; + struct completion cmd_done; + struct olpc_xo175_ec_cmd cmd; + u8 resp_data[EC_MAX_RESP_LEN]; + int expected_resp_len; + int resp_len; + + /* Power button. */ + struct input_dev *pwrbtn; + + /* Debug handling. */ + char logbuf[LOG_BUF_SIZE]; + int logbuf_len; +}; + +static struct platform_device *olpc_ec; + +static int olpc_xo175_ec_resp_len(u8 cmd) +{ + const struct ec_cmd_t *p; + + for (p = olpc_xo175_ec_cmds; p->cmd; p++) { + if (p->cmd == cmd) + return p->bytes_returned; + } + + return -EINVAL; +} + +static void olpc_xo175_ec_flush_logbuf(struct olpc_xo175_ec *priv) +{ + dev_dbg(&priv->spi->dev, "got debug string [%*pE]\n", + priv->logbuf_len, priv->logbuf); + priv->logbuf_len = 0; +} + +static void olpc_xo175_ec_complete(void *arg); + +static void olpc_xo175_ec_send_command(struct olpc_xo175_ec *priv, void *cmd, + size_t cmdlen) +{ + int ret; + + memcpy(&priv->tx_buf, cmd, cmdlen); + priv->xfer.len = cmdlen; + + spi_message_init_with_transfers(&priv->msg, &priv->xfer, 1); + + priv->msg.complete = olpc_xo175_ec_complete; + priv->msg.context = priv; + + ret = spi_async(priv->spi, &priv->msg); + if (ret) + dev_err(&priv->spi->dev, "spi_async() failed %d\n", ret); +} + +static void olpc_xo175_ec_read_packet(struct olpc_xo175_ec *priv) +{ + u8 nonce[] = {0xA5, 0x5A}; + + olpc_xo175_ec_send_command(priv, nonce, sizeof(nonce)); +} + +static void olpc_xo175_ec_complete(void *arg) +{ + struct olpc_xo175_ec *priv = arg; + struct device *dev = &priv->spi->dev; + struct power_supply *psy; + unsigned long flags; + u8 channel; + u8 byte; + int ret; + + ret = priv->msg.status; + if (ret) { + dev_err(dev, "SPI transfer failed: %d\n", ret); + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + if (priv->cmd_running) { + priv->resp_len = 0; + priv->cmd_state = CMD_STATE_ERROR_RECEIVED; + complete(&priv->cmd_done); + } + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + if (ret != -EINTR) + olpc_xo175_ec_read_packet(priv); + + return; + } + + channel = priv->rx_buf.resp.channel; + byte = priv->rx_buf.resp.byte; + + switch (channel) { + case CHAN_NONE: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + /* We can safely ignore these */ + dev_err(dev, "spurious FIFO read packet\n"); + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + return; + } + + priv->cmd_state = CMD_STATE_CMD_SENT; + if (!priv->expected_resp_len) + complete(&priv->cmd_done); + olpc_xo175_ec_read_packet(priv); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + return; + + case CHAN_SWITCH: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + /* Just go with the flow */ + dev_err(dev, "spurious SWITCH packet\n"); + memset(&priv->cmd, 0, sizeof(priv->cmd)); + priv->cmd.command = CMD_ECHO; + } + + priv->cmd_state = CMD_STATE_CMD_IN_TX_FIFO; + + /* Throw command into TxFIFO */ + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + olpc_xo175_ec_send_command(priv, &priv->cmd, sizeof(priv->cmd)); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + 
return; + + case CHAN_CMD_RESP: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + dev_err(dev, "spurious response packet\n"); + } else if (priv->resp_len >= priv->expected_resp_len) { + dev_err(dev, "too many response packets\n"); + } else { + priv->resp_data[priv->resp_len++] = byte; + if (priv->resp_len == priv->expected_resp_len) { + priv->cmd_state = CMD_STATE_RESP_RECEIVED; + complete(&priv->cmd_done); + } + } + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + break; + + case CHAN_CMD_ERROR: + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + if (!priv->cmd_running) { + dev_err(dev, "spurious cmd error packet\n"); + } else { + priv->resp_data[0] = byte; + priv->resp_len = 1; + priv->cmd_state = CMD_STATE_ERROR_RECEIVED; + complete(&priv->cmd_done); + } + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + break; + + case CHAN_KEYBOARD: + dev_warn(dev, "keyboard is not supported\n"); + break; + + case CHAN_TOUCHPAD: + dev_warn(dev, "touchpad is not supported\n"); + break; + + case CHAN_EVENT: + dev_dbg(dev, "got event %.2x\n", byte); + switch (byte) { + case EVENT_AC_CHANGE: + psy = power_supply_get_by_name("olpc-ac"); + if (psy) { + power_supply_changed(psy); + power_supply_put(psy); + } + break; + case EVENT_BATTERY_STATUS: + case EVENT_BATTERY_CRITICAL: + case EVENT_BATTERY_SOC_CHANGE: + case EVENT_BATTERY_ERROR: + psy = power_supply_get_by_name("olpc-battery"); + if (psy) { + power_supply_changed(psy); + power_supply_put(psy); + } + break; + case EVENT_POWER_PRESSED: + input_report_key(priv->pwrbtn, KEY_POWER, 1); + input_sync(priv->pwrbtn); + input_report_key(priv->pwrbtn, KEY_POWER, 0); + input_sync(priv->pwrbtn); + /* fall through */ + case EVENT_POWER_PRESS_WAKE: + case EVENT_TIMED_HOST_WAKE: + pm_wakeup_event(priv->pwrbtn->dev.parent, + PM_WAKEUP_TIME); + break; + default: + dev_dbg(dev, "ignored unknown event %.2x\n", byte); + break; + } + break; + + case CHAN_DEBUG: + if (byte == '\n') { + olpc_xo175_ec_flush_logbuf(priv); + } else if (isprint(byte)) { + priv->logbuf[priv->logbuf_len++] = byte; + if (priv->logbuf_len == LOG_BUF_SIZE) + olpc_xo175_ec_flush_logbuf(priv); + } + break; + + default: + dev_warn(dev, "unknown channel: %d, %.2x\n", channel, byte); + break; + } + + /* Most non-command packets get the TxFIFO refilled and an ACK. */ + olpc_xo175_ec_read_packet(priv); +} + +/* + * This function is protected with a mutex. We can safely assume that + * there will be only one instance of this function running at a time. + * One of the ways in which we enforce this is by waiting until we get + * all response bytes back from the EC, rather than just the number that + * the caller requests (otherwise, we might start a new command while an + * old command's response bytes are still incoming). + */ +static int olpc_xo175_ec_cmd(u8 cmd, u8 *inbuf, size_t inlen, u8 *resp, + size_t resp_len, void *ec_cb_arg) +{ + struct olpc_xo175_ec *priv = ec_cb_arg; + struct device *dev = &priv->spi->dev; + unsigned long flags; + size_t nr_bytes; + int ret = 0; + + dev_dbg(dev, "CMD %x, %zd bytes expected\n", cmd, resp_len); + + if (inlen > 5) { + dev_err(dev, "command len %zd too big!\n", resp_len); + return -EOVERFLOW; + } + + /* Suspending in the middle of an EC command hoses things badly! 
*/ + if (WARN_ON(priv->suspended)) + return -EBUSY; + + /* Ensure a valid command and return bytes */ + ret = olpc_xo175_ec_resp_len(cmd); + if (ret < 0) { + dev_err_ratelimited(dev, "unknown command 0x%x\n", cmd); + + /* + * Assume the best in our callers, and allow unknown commands + * through. I'm not the charitable type, but it was beaten + * into me. Just maintain a minimum standard of sanity. + */ + if (resp_len > sizeof(priv->resp_data)) { + dev_err(dev, "response too big: %zd!\n", resp_len); + return -EOVERFLOW; + } + nr_bytes = resp_len; + } else { + nr_bytes = (size_t)ret; + } + resp_len = min(resp_len, nr_bytes); + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + /* Initialize the state machine */ + init_completion(&priv->cmd_done); + priv->cmd_running = true; + priv->cmd_state = CMD_STATE_WAITING_FOR_SWITCH; + memset(&priv->cmd, 0, sizeof(priv->cmd)); + priv->cmd.command = cmd; + priv->cmd.nr_args = inlen; + priv->cmd.data_len = 0; + memcpy(priv->cmd.args, inbuf, inlen); + priv->expected_resp_len = nr_bytes; + priv->resp_len = 0; + + /* Tickle the cmd gpio to get things started */ + gpiod_set_value_cansleep(priv->gpio_cmd, 1); + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + /* The irq handler should do the rest */ + if (!wait_for_completion_timeout(&priv->cmd_done, + msecs_to_jiffies(4000))) { + dev_err(dev, "EC cmd error: timeout in STATE %d\n", + priv->cmd_state); + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + spi_slave_abort(priv->spi); + olpc_xo175_ec_read_packet(priv); + return -ETIMEDOUT; + } + + spin_lock_irqsave(&priv->cmd_state_lock, flags); + + /* Deal with the results. */ + if (priv->cmd_state == CMD_STATE_ERROR_RECEIVED) { + /* EC-provided error is in the single response byte */ + dev_err(dev, "command 0x%x returned error 0x%x\n", + cmd, priv->resp_data[0]); + ret = -EREMOTEIO; + } else if (priv->resp_len != nr_bytes) { + dev_err(dev, "command 0x%x returned %d bytes, expected %zd bytes\n", + cmd, priv->resp_len, nr_bytes); + ret = -EREMOTEIO; + } else { + /* + * We may have 8 bytes in priv->resp, but we only care about + * what we've been asked for. If the caller asked for only 2 + * bytes, give them that. We've guaranteed that + * resp_len <= priv->resp_len and priv->resp_len == nr_bytes. + */ + memcpy(resp, priv->resp_data, resp_len); + } + + /* This should already be low, but just in case. */ + gpiod_set_value_cansleep(priv->gpio_cmd, 0); + priv->cmd_running = false; + + spin_unlock_irqrestore(&priv->cmd_state_lock, flags); + + return ret; +} + +static int olpc_xo175_ec_set_event_mask(unsigned int mask) +{ + u8 args[2]; + + args[0] = mask >> 0; + args[1] = mask >> 8; + return olpc_ec_cmd(CMD_WRITE_EXT_SCI_MASK, args, 2, NULL, 0); +} + +static void olpc_xo175_ec_power_off(void) +{ + while (1) { + olpc_ec_cmd(CMD_POWER_OFF, NULL, 0, NULL, 0); + mdelay(1000); + } +} + +static int __maybe_unused olpc_xo175_ec_suspend(struct device *dev) +{ + struct olpc_xo175_ec *priv = dev_get_drvdata(dev); + static struct { + u8 suspend; + u32 suspend_count; + } __packed hintargs; + static unsigned int suspend_count; + + /* + * SOC_SLEEP is not wired to the EC on B3 and earlier boards. + * This command lets the EC know instead. The suspend count doesn't seem + * to be used anywhere but in the EC debug output. 
+ */ + hintargs.suspend = 1; + hintargs.suspend_count = suspend_count++; + olpc_ec_cmd(CMD_SUSPEND_HINT, (void *)&hintargs, sizeof(hintargs), + NULL, 0); + + /* + * After we've sent the suspend hint, don't allow further EC commands + * to be run until we've resumed. Userspace tasks should be frozen, + * but kernel threads and interrupts could still schedule EC commands. + */ + priv->suspended = true; + + return 0; +} + +static int __maybe_unused olpc_xo175_ec_resume_noirq(struct device *dev) +{ + struct olpc_xo175_ec *priv = dev_get_drvdata(dev); + + priv->suspended = false; + + return 0; +} + +static int __maybe_unused olpc_xo175_ec_resume(struct device *dev) +{ + u8 x = 0; + + /* + * The resume hint is only needed if no other commands are + * being sent during resume. all it does is tell the EC + * the SoC is definitely awake. + */ + olpc_ec_cmd(CMD_SUSPEND_HINT, &x, 1, NULL, 0); + + /* Enable all EC events while we're awake */ + olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS); + + return 0; +} + +static struct olpc_ec_driver olpc_xo175_ec_driver = { + .ec_cmd = olpc_xo175_ec_cmd, +}; + +static int olpc_xo175_ec_remove(struct spi_device *spi) +{ + if (pm_power_off == olpc_xo175_ec_power_off) + pm_power_off = NULL; + + spi_slave_abort(spi); + + platform_device_unregister(olpc_ec); + olpc_ec = NULL; + + return 0; +} + +static int olpc_xo175_ec_probe(struct spi_device *spi) +{ + struct olpc_xo175_ec *priv; + int ret; + + if (olpc_ec) { + dev_err(&spi->dev, "OLPC EC already registered.\n"); + return -EBUSY; + } + + priv = devm_kzalloc(&spi->dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->gpio_cmd = devm_gpiod_get(&spi->dev, "cmd", GPIOD_OUT_LOW); + if (IS_ERR(priv->gpio_cmd)) { + dev_err(&spi->dev, "failed to get cmd gpio: %ld\n", + PTR_ERR(priv->gpio_cmd)); + return PTR_ERR(priv->gpio_cmd); + } + + priv->spi = spi; + + spin_lock_init(&priv->cmd_state_lock); + priv->cmd_state = CMD_STATE_IDLE; + init_completion(&priv->cmd_done); + + priv->logbuf_len = 0; + + /* Set up power button input device */ + priv->pwrbtn = devm_input_allocate_device(&spi->dev); + if (!priv->pwrbtn) + return -ENOMEM; + priv->pwrbtn->name = "Power Button"; + priv->pwrbtn->dev.parent = &spi->dev; + input_set_capability(priv->pwrbtn, EV_KEY, KEY_POWER); + ret = input_register_device(priv->pwrbtn); + if (ret) { + dev_err(&spi->dev, "error registering input device: %d\n", ret); + return ret; + } + + spi_set_drvdata(spi, priv); + + priv->xfer.rx_buf = &priv->rx_buf; + priv->xfer.tx_buf = &priv->tx_buf; + + olpc_xo175_ec_read_packet(priv); + + olpc_ec_driver_register(&olpc_xo175_ec_driver, priv); + olpc_ec = platform_device_register_resndata(&spi->dev, "olpc-ec", -1, + NULL, 0, NULL, 0); + + /* Enable all EC events while we're awake */ + olpc_xo175_ec_set_event_mask(EC_ALL_EVENTS); + + if (pm_power_off == NULL) + pm_power_off = olpc_xo175_ec_power_off; + + dev_info(&spi->dev, "OLPC XO-1.75 Embedded Controller driver\n"); + + return 0; +} + +static const struct dev_pm_ops olpc_xo175_ec_pm_ops = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, olpc_xo175_ec_resume_noirq) + SET_RUNTIME_PM_OPS(olpc_xo175_ec_suspend, olpc_xo175_ec_resume, NULL) +}; + +static const struct of_device_id olpc_xo175_ec_of_match[] = { + { .compatible = "olpc,xo1.75-ec" }, + { } +}; +MODULE_DEVICE_TABLE(of, olpc_xo175_ec_of_match); + +static struct spi_driver olpc_xo175_ec_spi_driver = { + .driver = { + .name = "olpc-xo175-ec", + .of_match_table = olpc_xo175_ec_of_match, + .pm = &olpc_xo175_ec_pm_ops, + }, + .probe = olpc_xo175_ec_probe, + 
.remove = olpc_xo175_ec_remove, +}; +module_spi_driver(olpc_xo175_ec_spi_driver); + +MODULE_DESCRIPTION("OLPC XO-1.75 Embedded Controller driver"); +MODULE_AUTHOR("Lennert Buytenhek "); /* Functionality */ +MODULE_AUTHOR("Lubomir Rintel "); /* Bugs */ +MODULE_LICENSE("GPL"); diff --git a/include/linux/olpc-ec.h b/include/linux/olpc-ec.h index f7b6a7eda232..c4602364e909 100644 --- a/include/linux/olpc-ec.h +++ b/include/linux/olpc-ec.h @@ -41,7 +41,7 @@ struct olpc_ec_driver { bool wakeup_available; }; -#ifdef CONFIG_OLPC +#ifdef CONFIG_OLPC_EC extern void olpc_ec_driver_register(struct olpc_ec_driver *drv, void *arg); @@ -69,6 +69,6 @@ static inline bool olpc_ec_wakeup_available(void) return false; } -#endif /* CONFIG_OLPC */ +#endif /* CONFIG_OLPC_EC */ #endif /* _LINUX_OLPC_EC_H */ -- cgit v1.2.3-59-g8ed1b From 9a0f780958bbcb85604636fa340e2a1efaa4f432 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Mon, 13 May 2019 13:39:51 +0200 Subject: dmaengine: sudmac: remove unused driver SUDMAC driver was introduced in v3.10 but was never integrated for use by any platform. As it is unused remove it. Signed-off-by: Simon Horman Acked-by: Yoshihiro Shimoda Signed-off-by: Vinod Koul --- drivers/dma/sh/Kconfig | 6 - drivers/dma/sh/Makefile | 1 - drivers/dma/sh/sudmac.c | 414 ------------------------------------------------ include/linux/sudmac.h | 52 ------ 4 files changed, 473 deletions(-) delete mode 100644 drivers/dma/sh/sudmac.c delete mode 100644 include/linux/sudmac.h (limited to 'include/linux') diff --git a/drivers/dma/sh/Kconfig b/drivers/dma/sh/Kconfig index 4d6b02b3b1f1..54d5d0369d3c 100644 --- a/drivers/dma/sh/Kconfig +++ b/drivers/dma/sh/Kconfig @@ -47,9 +47,3 @@ config RENESAS_USB_DMAC help This driver supports the USB-DMA controller found in the Renesas SoCs. - -config SUDMAC - tristate "Renesas SUDMAC support" - depends on SH_DMAE_BASE - help - Enable support for the Renesas SUDMAC controllers. diff --git a/drivers/dma/sh/Makefile b/drivers/dma/sh/Makefile index 42110dd57a56..112fbd22bb3f 100644 --- a/drivers/dma/sh/Makefile +++ b/drivers/dma/sh/Makefile @@ -15,4 +15,3 @@ obj-$(CONFIG_SH_DMAE) += shdma.o obj-$(CONFIG_RCAR_DMAC) += rcar-dmac.o obj-$(CONFIG_RENESAS_USB_DMAC) += usb-dmac.o -obj-$(CONFIG_SUDMAC) += sudmac.o diff --git a/drivers/dma/sh/sudmac.c b/drivers/dma/sh/sudmac.c deleted file mode 100644 index 30cc3553cb8b..000000000000 --- a/drivers/dma/sh/sudmac.c +++ /dev/null @@ -1,414 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Renesas SUDMAC support - * - * Copyright (C) 2013 Renesas Solutions Corp. - * - * based on drivers/dma/sh/shdma.c: - * Copyright (C) 2011-2012 Guennadi Liakhovetski - * Copyright (C) 2009 Nobuhiro Iwamatsu - * Copyright (C) 2009 Renesas Solutions, Inc. All rights reserved. - * Copyright (C) 2007 Freescale Semiconductor, Inc. All rights reserved. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include - -struct sudmac_chan { - struct shdma_chan shdma_chan; - void __iomem *base; - char dev_id[16]; /* unique name per DMAC of channel */ - - u32 offset; /* for CFG, BA, BBC, CA, CBC, DEN */ - u32 cfg; - u32 dint_end_bit; -}; - -struct sudmac_device { - struct shdma_dev shdma_dev; - struct sudmac_pdata *pdata; - void __iomem *chan_reg; -}; - -struct sudmac_regs { - u32 base_addr; - u32 base_byte_count; -}; - -struct sudmac_desc { - struct sudmac_regs hw; - struct shdma_desc shdma_desc; -}; - -#define to_chan(schan) container_of(schan, struct sudmac_chan, shdma_chan) -#define to_desc(sdesc) container_of(sdesc, struct sudmac_desc, shdma_desc) -#define to_sdev(sc) container_of(sc->shdma_chan.dma_chan.device, \ - struct sudmac_device, shdma_dev.dma_dev) - -/* SUDMAC register */ -#define SUDMAC_CH0CFG 0x00 -#define SUDMAC_CH0BA 0x10 -#define SUDMAC_CH0BBC 0x18 -#define SUDMAC_CH0CA 0x20 -#define SUDMAC_CH0CBC 0x28 -#define SUDMAC_CH0DEN 0x30 -#define SUDMAC_DSTSCLR 0x38 -#define SUDMAC_DBUFCTRL 0x3C -#define SUDMAC_DINTCTRL 0x40 -#define SUDMAC_DINTSTS 0x44 -#define SUDMAC_DINTSTSCLR 0x48 -#define SUDMAC_CH0SHCTRL 0x50 - -/* Definitions for the sudmac_channel.config */ -#define SUDMAC_SENDBUFM 0x1000 /* b12: Transmit Buffer Mode */ -#define SUDMAC_RCVENDM 0x0100 /* b8: Receive Data Transfer End Mode */ -#define SUDMAC_LBA_WAIT 0x0030 /* b5-4: Local Bus Access Wait */ - -/* Definitions for the sudmac_channel.dint_end_bit */ -#define SUDMAC_CH1ENDE 0x0002 /* b1: Ch1 DMA Transfer End Int Enable */ -#define SUDMAC_CH0ENDE 0x0001 /* b0: Ch0 DMA Transfer End Int Enable */ - -#define SUDMAC_DRV_NAME "sudmac" - -static void sudmac_writel(struct sudmac_chan *sc, u32 data, u32 reg) -{ - iowrite32(data, sc->base + reg); -} - -static u32 sudmac_readl(struct sudmac_chan *sc, u32 reg) -{ - return ioread32(sc->base + reg); -} - -static bool sudmac_is_busy(struct sudmac_chan *sc) -{ - u32 den = sudmac_readl(sc, SUDMAC_CH0DEN + sc->offset); - - if (den) - return true; /* working */ - - return false; /* waiting */ -} - -static void sudmac_set_reg(struct sudmac_chan *sc, struct sudmac_regs *hw, - struct shdma_desc *sdesc) -{ - sudmac_writel(sc, sc->cfg, SUDMAC_CH0CFG + sc->offset); - sudmac_writel(sc, hw->base_addr, SUDMAC_CH0BA + sc->offset); - sudmac_writel(sc, hw->base_byte_count, SUDMAC_CH0BBC + sc->offset); -} - -static void sudmac_start(struct sudmac_chan *sc) -{ - u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL); - - sudmac_writel(sc, dintctrl | sc->dint_end_bit, SUDMAC_DINTCTRL); - sudmac_writel(sc, 1, SUDMAC_CH0DEN + sc->offset); -} - -static void sudmac_start_xfer(struct shdma_chan *schan, - struct shdma_desc *sdesc) -{ - struct sudmac_chan *sc = to_chan(schan); - struct sudmac_desc *sd = to_desc(sdesc); - - sudmac_set_reg(sc, &sd->hw, sdesc); - sudmac_start(sc); -} - -static bool sudmac_channel_busy(struct shdma_chan *schan) -{ - struct sudmac_chan *sc = to_chan(schan); - - return sudmac_is_busy(sc); -} - -static void sudmac_setup_xfer(struct shdma_chan *schan, int slave_id) -{ -} - -static const struct sudmac_slave_config *sudmac_find_slave( - struct sudmac_chan *sc, int slave_id) -{ - struct sudmac_device *sdev = to_sdev(sc); - struct sudmac_pdata *pdata = sdev->pdata; - const struct sudmac_slave_config *cfg; - int i; - - for (i = 0, cfg = pdata->slave; i < pdata->slave_num; i++, cfg++) - if (cfg->slave_id == slave_id) - return cfg; - - return NULL; -} - -static int sudmac_set_slave(struct shdma_chan *schan, int 
slave_id, - dma_addr_t slave_addr, bool try) -{ - struct sudmac_chan *sc = to_chan(schan); - const struct sudmac_slave_config *cfg = sudmac_find_slave(sc, slave_id); - - if (!cfg) - return -ENODEV; - - return 0; -} - -static inline void sudmac_dma_halt(struct sudmac_chan *sc) -{ - u32 dintctrl = sudmac_readl(sc, SUDMAC_DINTCTRL); - - sudmac_writel(sc, 0, SUDMAC_CH0DEN + sc->offset); - sudmac_writel(sc, dintctrl & ~sc->dint_end_bit, SUDMAC_DINTCTRL); - sudmac_writel(sc, sc->dint_end_bit, SUDMAC_DINTSTSCLR); -} - -static int sudmac_desc_setup(struct shdma_chan *schan, - struct shdma_desc *sdesc, - dma_addr_t src, dma_addr_t dst, size_t *len) -{ - struct sudmac_chan *sc = to_chan(schan); - struct sudmac_desc *sd = to_desc(sdesc); - - dev_dbg(sc->shdma_chan.dev, "%s: src=%pad, dst=%pad, len=%zu\n", - __func__, &src, &dst, *len); - - if (*len > schan->max_xfer_len) - *len = schan->max_xfer_len; - - if (dst) - sd->hw.base_addr = dst; - else if (src) - sd->hw.base_addr = src; - sd->hw.base_byte_count = *len; - - return 0; -} - -static void sudmac_halt(struct shdma_chan *schan) -{ - struct sudmac_chan *sc = to_chan(schan); - - sudmac_dma_halt(sc); -} - -static bool sudmac_chan_irq(struct shdma_chan *schan, int irq) -{ - struct sudmac_chan *sc = to_chan(schan); - u32 dintsts = sudmac_readl(sc, SUDMAC_DINTSTS); - - if (!(dintsts & sc->dint_end_bit)) - return false; - - /* DMA stop */ - sudmac_dma_halt(sc); - - return true; -} - -static size_t sudmac_get_partial(struct shdma_chan *schan, - struct shdma_desc *sdesc) -{ - struct sudmac_chan *sc = to_chan(schan); - struct sudmac_desc *sd = to_desc(sdesc); - u32 current_byte_count = sudmac_readl(sc, SUDMAC_CH0CBC + sc->offset); - - return sd->hw.base_byte_count - current_byte_count; -} - -static bool sudmac_desc_completed(struct shdma_chan *schan, - struct shdma_desc *sdesc) -{ - struct sudmac_chan *sc = to_chan(schan); - struct sudmac_desc *sd = to_desc(sdesc); - u32 current_addr = sudmac_readl(sc, SUDMAC_CH0CA + sc->offset); - - return sd->hw.base_addr + sd->hw.base_byte_count == current_addr; -} - -static int sudmac_chan_probe(struct sudmac_device *su_dev, int id, int irq, - unsigned long flags) -{ - struct shdma_dev *sdev = &su_dev->shdma_dev; - struct platform_device *pdev = to_platform_device(sdev->dma_dev.dev); - struct sudmac_chan *sc; - struct shdma_chan *schan; - int err; - - sc = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_chan), GFP_KERNEL); - if (!sc) - return -ENOMEM; - - schan = &sc->shdma_chan; - schan->max_xfer_len = 64 * 1024 * 1024 - 1; - - shdma_chan_probe(sdev, schan, id); - - sc->base = su_dev->chan_reg; - - /* get platform_data */ - sc->offset = su_dev->pdata->channel->offset; - if (su_dev->pdata->channel->config & SUDMAC_TX_BUFFER_MODE) - sc->cfg |= SUDMAC_SENDBUFM; - if (su_dev->pdata->channel->config & SUDMAC_RX_END_MODE) - sc->cfg |= SUDMAC_RCVENDM; - sc->cfg |= (su_dev->pdata->channel->wait << 4) & SUDMAC_LBA_WAIT; - - if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH0) - sc->dint_end_bit |= SUDMAC_CH0ENDE; - if (su_dev->pdata->channel->dint_end_bit & SUDMAC_DMA_BIT_CH1) - sc->dint_end_bit |= SUDMAC_CH1ENDE; - - /* set up channel irq */ - if (pdev->id >= 0) - snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d.%d", - pdev->id, id); - else - snprintf(sc->dev_id, sizeof(sc->dev_id), "sudmac%d", id); - - err = shdma_request_irq(schan, irq, flags, sc->dev_id); - if (err) { - dev_err(sdev->dma_dev.dev, - "DMA channel %d request_irq failed %d\n", id, err); - goto err_no_irq; - } - - return 0; - -err_no_irq: - /* remove 
from dmaengine device node */ - shdma_chan_remove(schan); - return err; -} - -static void sudmac_chan_remove(struct sudmac_device *su_dev) -{ - struct shdma_chan *schan; - int i; - - shdma_for_each_chan(schan, &su_dev->shdma_dev, i) { - BUG_ON(!schan); - - shdma_chan_remove(schan); - } -} - -static dma_addr_t sudmac_slave_addr(struct shdma_chan *schan) -{ - /* SUDMAC doesn't need the address */ - return 0; -} - -static struct shdma_desc *sudmac_embedded_desc(void *buf, int i) -{ - return &((struct sudmac_desc *)buf)[i].shdma_desc; -} - -static const struct shdma_ops sudmac_shdma_ops = { - .desc_completed = sudmac_desc_completed, - .halt_channel = sudmac_halt, - .channel_busy = sudmac_channel_busy, - .slave_addr = sudmac_slave_addr, - .desc_setup = sudmac_desc_setup, - .set_slave = sudmac_set_slave, - .setup_xfer = sudmac_setup_xfer, - .start_xfer = sudmac_start_xfer, - .embedded_desc = sudmac_embedded_desc, - .chan_irq = sudmac_chan_irq, - .get_partial = sudmac_get_partial, -}; - -static int sudmac_probe(struct platform_device *pdev) -{ - struct sudmac_pdata *pdata = dev_get_platdata(&pdev->dev); - int err, i; - struct sudmac_device *su_dev; - struct dma_device *dma_dev; - struct resource *chan, *irq_res; - - /* get platform data */ - if (!pdata) - return -ENODEV; - - irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (!irq_res) - return -ENODEV; - - err = -ENOMEM; - su_dev = devm_kzalloc(&pdev->dev, sizeof(struct sudmac_device), - GFP_KERNEL); - if (!su_dev) - return err; - - dma_dev = &su_dev->shdma_dev.dma_dev; - - chan = platform_get_resource(pdev, IORESOURCE_MEM, 0); - su_dev->chan_reg = devm_ioremap_resource(&pdev->dev, chan); - if (IS_ERR(su_dev->chan_reg)) - return PTR_ERR(su_dev->chan_reg); - - dma_cap_set(DMA_SLAVE, dma_dev->cap_mask); - - su_dev->shdma_dev.ops = &sudmac_shdma_ops; - su_dev->shdma_dev.desc_size = sizeof(struct sudmac_desc); - err = shdma_init(&pdev->dev, &su_dev->shdma_dev, pdata->channel_num); - if (err < 0) - return err; - - /* platform data */ - su_dev->pdata = dev_get_platdata(&pdev->dev); - - platform_set_drvdata(pdev, su_dev); - - /* Create DMA Channel */ - for (i = 0; i < pdata->channel_num; i++) { - err = sudmac_chan_probe(su_dev, i, irq_res->start, IRQF_SHARED); - if (err) - goto chan_probe_err; - } - - err = dma_async_device_register(&su_dev->shdma_dev.dma_dev); - if (err < 0) - goto chan_probe_err; - - return err; - -chan_probe_err: - sudmac_chan_remove(su_dev); - - shdma_cleanup(&su_dev->shdma_dev); - - return err; -} - -static int sudmac_remove(struct platform_device *pdev) -{ - struct sudmac_device *su_dev = platform_get_drvdata(pdev); - struct dma_device *dma_dev = &su_dev->shdma_dev.dma_dev; - - dma_async_device_unregister(dma_dev); - sudmac_chan_remove(su_dev); - shdma_cleanup(&su_dev->shdma_dev); - - return 0; -} - -static struct platform_driver sudmac_driver = { - .driver = { - .name = SUDMAC_DRV_NAME, - }, - .probe = sudmac_probe, - .remove = sudmac_remove, -}; -module_platform_driver(sudmac_driver); - -MODULE_AUTHOR("Yoshihiro Shimoda"); -MODULE_DESCRIPTION("Renesas SUDMAC driver"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:" SUDMAC_DRV_NAME); diff --git a/include/linux/sudmac.h b/include/linux/sudmac.h deleted file mode 100644 index 377b8a5788fa..000000000000 --- a/include/linux/sudmac.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Header for the SUDMAC driver - * - * Copyright (C) 2013 Renesas Solutions Corp. 
- * - * This is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - */ -#ifndef SUDMAC_H -#define SUDMAC_H - -#include -#include -#include - -/* Used by slave DMA clients to request DMA to/from a specific peripheral */ -struct sudmac_slave { - struct shdma_slave shdma_slave; /* Set by the platform */ -}; - -/* - * Supplied by platforms to specify, how a DMA channel has to be configured for - * a certain peripheral - */ -struct sudmac_slave_config { - int slave_id; -}; - -struct sudmac_channel { - unsigned long offset; - unsigned long config; - unsigned long wait; /* The configuable range is 0 to 3 */ - unsigned long dint_end_bit; -}; - -struct sudmac_pdata { - const struct sudmac_slave_config *slave; - int slave_num; - const struct sudmac_channel *channel; - int channel_num; -}; - -/* Definitions for the sudmac_channel.config */ -#define SUDMAC_TX_BUFFER_MODE BIT(0) -#define SUDMAC_RX_END_MODE BIT(1) - -/* Definitions for the sudmac_channel.dint_end_bit */ -#define SUDMAC_DMA_BIT_CH0 BIT(0) -#define SUDMAC_DMA_BIT_CH1 BIT(1) - -#endif -- cgit v1.2.3-59-g8ed1b From 7e5f7bb08b8cefd3a7e8961861f47fe1f0e830d4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 May 2019 13:44:57 +0100 Subject: unexport simple_dname() Signed-off-by: Al Viro --- fs/d_path.c | 1 - fs/internal.h | 1 + include/linux/dcache.h | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/d_path.c b/fs/d_path.c index e8fce6b1174f..a7d0a96b35ce 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -316,7 +316,6 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) end = ERR_PTR(-ENAMETOOLONG); return end; } -EXPORT_SYMBOL(simple_dname); /* * Write full pathname from the root of the filesystem into the buffer. diff --git a/fs/internal.h b/fs/internal.h index 0010889f2e85..1ac2b8f6c621 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -160,6 +160,7 @@ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); +extern char *simple_dname(struct dentry *, char *, int); /* * read_write.c diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f14e587c5d5d..361305ddd75e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -291,7 +291,6 @@ static inline unsigned d_count(const struct dentry *dentry) */ extern __printf(4, 5) char *dynamic_dname(struct dentry *, char *, int, const char *, ...); -extern char *simple_dname(struct dentry *, char *, int); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); -- cgit v1.2.3-59-g8ed1b From f13e143e7444bffc53f5c2904aeed76646da69d6 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 3 Jul 2018 16:42:26 +0200 Subject: dma-buf: start caching of sg_table objects v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To allow a smooth transition from pinning buffer objects to dynamic invalidation we first start to cache the sg_table for an attachment. 
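For illustration only (not part of the patch): a minimal sketch of an
exporter opting in to the caching added below. The my_* names are
hypothetical, and the remaining required dma_buf_ops callbacks are
omitted.

#include <linux/dma-buf.h>

static struct sg_table *my_map_dma_buf(struct dma_buf_attachment *attach,
				       enum dma_data_direction dir)
{
	/* Build and DMA-map an sg_table for the backing pages here. */
	return NULL;	/* sketch only */
}

static void my_unmap_dma_buf(struct dma_buf_attachment *attach,
			     struct sg_table *sgt,
			     enum dma_data_direction dir)
{
	/* Undo the mapping and free the sg_table here. */
}

static const struct dma_buf_ops my_dmabuf_ops = {
	.cache_sgt_mapping = true,	/* dma-buf core caches the first mapping */
	.map_dma_buf	   = my_map_dma_buf,
	.unmap_dma_buf	   = my_unmap_dma_buf,
};

With cache_sgt_mapping set, repeated dma_buf_map_attachment() calls on
one attachment return the cached table (a mismatched, non-bidirectional
direction is rejected with -EBUSY), and the cached mapping is only torn
down when the attachment is detached.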
v2: keep closer to the DRM implementation Signed-off-by: Christian König Reviewed-by: Daniel Vetter Link: https://patchwork.kernel.org/patch/10943053/ --- drivers/dma-buf/dma-buf.c | 27 +++++++++++++++++++++++++-- include/linux/dma-buf.h | 13 +++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 3ae6c0c2cc02..f4104a21b069 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -576,6 +576,7 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, list_add(&attach->node, &dmabuf->attachments); mutex_unlock(&dmabuf->lock); + return attach; err_attach: @@ -598,6 +599,9 @@ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach) if (WARN_ON(!dmabuf || !attach)) return; + if (attach->sgt) + dmabuf->ops->unmap_dma_buf(attach, attach->sgt, attach->dir); + mutex_lock(&dmabuf->lock); list_del(&attach->node); if (dmabuf->ops->detach) @@ -633,10 +637,27 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, if (WARN_ON(!attach || !attach->dmabuf)) return ERR_PTR(-EINVAL); + if (attach->sgt) { + /* + * Two mappings with different directions for the same + * attachment are not allowed. + */ + if (attach->dir != direction && + attach->dir != DMA_BIDIRECTIONAL) + return ERR_PTR(-EBUSY); + + return attach->sgt; + } + sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction); if (!sg_table) sg_table = ERR_PTR(-ENOMEM); + if (!IS_ERR(sg_table) && attach->dmabuf->ops->cache_sgt_mapping) { + attach->sgt = sg_table; + attach->dir = direction; + } + return sg_table; } EXPORT_SYMBOL_GPL(dma_buf_map_attachment); @@ -660,8 +681,10 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach, if (WARN_ON(!attach || !attach->dmabuf || !sg_table)) return; - attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, - direction); + if (attach->sgt == sg_table) + return; + + attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); } EXPORT_SYMBOL_GPL(dma_buf_unmap_attachment); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index a0bd071466fc..8a327566d7f4 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -44,6 +44,15 @@ struct dma_buf_attachment; * @vunmap: [optional] unmaps a vmap from the buffer */ struct dma_buf_ops { + /** + * @cache_sgt_mapping: + * + * If true the framework will cache the first mapping made for each + * attachment. This avoids creating mappings for attachments multiple + * times. + */ + bool cache_sgt_mapping; + /** * @attach: * @@ -323,6 +332,8 @@ struct dma_buf { * @dmabuf: buffer for this attachment. * @dev: device attached to the buffer. * @node: list of dma_buf_attachment. + * @sgt: cached mapping. + * @dir: direction of cached mapping. * @priv: exporter specific attachment data. * * This structure holds the attachment information between the dma_buf buffer @@ -338,6 +349,8 @@ struct dma_buf_attachment { struct dma_buf *dmabuf; struct device *dev; struct list_head node; + struct sg_table *sgt; + enum dma_data_direction dir; void *priv; }; -- cgit v1.2.3-59-g8ed1b From fbb5d0353c62d10c3699ec844d2d015a762952d7 Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Thu, 16 May 2019 19:40:06 +0530 Subject: drm: Add HDR source metadata property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a blob property to get HDR metadata information from userspace. This will be send as part of AVI Infoframe to panel. 
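For context, a hedged userspace-side sketch (not part of this patch,
and assuming libdrm): creating the blob and attaching it to the new
property. fd, connector_id and prop_id are assumed to be looked up
already, and the EOTF and luminance values are placeholders.

#include <stdint.h>
#include <drm/drm_mode.h>	/* struct hdr_output_metadata (uapi) */
#include <xf86drmMode.h>

static int set_hdr_metadata(int fd, uint32_t connector_id, uint32_t prop_id)
{
	struct hdr_output_metadata meta = {
		.metadata_type = 0,		/* HDMI_STATIC_METADATA_TYPE1 */
		.hdmi_metadata_type1 = {
			.eotf = 2,		/* SMPTE ST 2084 (PQ) */
			.max_cll = 1000,	/* placeholder, in nits */
			.max_fall = 400,	/* placeholder, in nits */
		},
	};
	uint32_t blob_id;
	int ret;

	ret = drmModeCreatePropertyBlob(fd, &meta, sizeof(meta), &blob_id);
	if (ret)
		return ret;

	return drmModeConnectorSetProperty(fd, connector_id, prop_id, blob_id);
}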
It also implements get() and set() functions for the HDR output metadata
property. The blob data is received from userspace and saved in the
connector state; the same is returned as a blob in the get-property call
to userspace.

v2: Rebase and modified the metadata structure elements as per Ville's
POC changes.
v3: No Change
v4: Addressed Shashank's review comments
v5: Rebase.
v6: Addressed Brian Starkey's review comments, defined new structure
with header for dynamic metadata scalability. Merge get/set property
functions for metadata in this patch.
v7: Addressed Jonas Karlman's review comments and defined separate
structure for infoframe to better align with CTA 861.G spec. Added
Shashank's RB.
v8: Addressed Ville's review comments. Moved sink metadata structure
out of uapi headers as suggested by Jonas Karlman.
v9: Rebase and addressed Jonas Karlman's review comments.
v10: Addressed Ville's review comments, dropped the metdata_changed
state variable as it's not needed anymore.

Signed-off-by: Uma Shankar
Reviewed-by: Shashank Sharma
Signed-off-by: Ville Syrjälä
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-2-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_atomic_uapi.c | 12 ++++++++++++
 drivers/gpu/drm/drm_connector.c   |  6 ++++++
 include/drm/drm_connector.h       | 10 ++++++++++
 include/drm/drm_mode_config.h     |  7 +++++++
 include/linux/hdmi.h              | 26 ++++++++++++++++++++++++++
 include/uapi/drm/drm_mode.h       | 23 +++++++++++++++++++++++
 6 files changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c
index 428d82662dc4..125605ff45af 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -676,6 +676,8 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 {
 	struct drm_device *dev = connector->dev;
 	struct drm_mode_config *config = &dev->mode_config;
+	bool replaced = false;
+	int ret;
 
 	if (property == config->prop_crtc_id) {
 		struct drm_crtc *crtc = drm_crtc_find(dev, file_priv, val);
@@ -726,6 +728,13 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector,
 		 */
 		if (state->link_status != DRM_LINK_STATUS_GOOD)
 			state->link_status = val;
+	} else if (property == config->hdr_output_metadata_property) {
+		ret = drm_atomic_replace_property_blob_from_id(dev,
+				&state->hdr_output_metadata,
+				val,
+				sizeof(struct hdr_output_metadata), -1,
+				&replaced);
+		return ret;
 	} else if (property == config->aspect_ratio_property) {
 		state->picture_aspect_ratio = val;
 	} else if (property == config->content_type_property) {
@@ -814,6 +823,9 @@ drm_atomic_connector_get_property(struct drm_connector *connector,
 		*val = state->colorspace;
 	} else if (property == connector->scaling_mode_property) {
 		*val = state->scaling_mode;
+	} else if (property == config->hdr_output_metadata_property) {
+		*val = state->hdr_output_metadata ?
+			state->hdr_output_metadata->base.id : 0;
 	} else if (property == connector->content_protection_property) {
 		*val = state->content_protection;
 	} else if (property == config->writeback_fb_id_property) {

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index b34c3d38bf15..365ace0c0c9e 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1058,6 +1058,12 @@ int drm_connector_create_standard_properties(struct drm_device *dev)
 		return -ENOMEM;
 	dev->mode_config.non_desktop_property = prop;
 
+	prop = drm_property_create(dev, DRM_MODE_PROP_BLOB,
+				   "HDR_OUTPUT_METADATA", 0);
+	if (!prop)
+		return -ENOMEM;
+	dev->mode_config.hdr_output_metadata_property = prop;
+
 	return 0;
 }

diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index f43f40d5888a..f0e987df4c1e 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -603,6 +603,12 @@ struct drm_connector_state {
 	 * and the connector bpc limitations obtained from edid.
 	 */
 	u8 max_bpc;
+
+	/**
+	 * @hdr_output_metadata:
+	 * DRM blob property for HDR output metadata
+	 */
+	struct drm_property_blob *hdr_output_metadata;
 };
 
 /**
@@ -1243,6 +1249,10 @@ struct drm_connector {
 	 * &drm_mode_config.connector_free_work.
 	 */
 	struct llist_node free_node;
+
+	/* HDR metadata */
+	struct hdr_output_metadata hdr_output_metadata;
+	struct hdr_sink_metadata hdr_sink_metadata;
 };
 
 #define obj_to_connector(x) container_of(x, struct drm_connector, base)

diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 7f60e8eb269a..c031b5a9d8d1 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -836,6 +836,13 @@ struct drm_mode_config {
 	 */
 	struct drm_property *writeback_out_fence_ptr_property;
+
+	/**
+	 * hdr_output_metadata_property: Connector property containing hdr
+	 * metadata.
This will be provided by userspace compositors based + * on HDR content + */ + struct drm_property *hdr_output_metadata_property; + /* dumb ioctl parameters */ uint32_t preferred_depth, prefer_shadow; diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 927ad6451105..6780476dcbff 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -152,6 +152,16 @@ enum hdmi_content_type { HDMI_CONTENT_TYPE_GAME, }; +enum hdmi_metadata_type { + HDMI_STATIC_METADATA_TYPE1 = 1, +}; + +enum hdmi_eotf { + HDMI_EOTF_TRADITIONAL_GAMMA_SDR, + HDMI_EOTF_TRADITIONAL_GAMMA_HDR, + HDMI_EOTF_SMPTE_ST2084, +}; + struct hdmi_avi_infoframe { enum hdmi_infoframe_type type; unsigned char version; @@ -320,6 +330,22 @@ struct hdmi_vendor_infoframe { unsigned int s3d_ext_data; }; +/* HDR Metadata as per 861.G spec */ +struct hdr_static_metadata { + __u8 eotf; + __u8 metadata_type; + __u16 max_cll; + __u16 max_fall; + __u16 min_cll; +}; + +struct hdr_sink_metadata { + __u32 metadata_type; + union { + struct hdr_static_metadata hdmi_type1; + }; +}; + int hdmi_vendor_infoframe_init(struct hdmi_vendor_infoframe *frame); ssize_t hdmi_vendor_infoframe_pack(struct hdmi_vendor_infoframe *frame, void *buffer, size_t size); diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 83cd1636b9be..997a7e05c0c6 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -630,6 +630,29 @@ struct drm_color_lut { __u16 reserved; }; +/* HDR Metadata Infoframe as per 861.G spec */ +struct hdr_metadata_infoframe { + __u8 eotf; + __u8 metadata_type; + struct { + __u16 x, y; + } display_primaries[3]; + struct { + __u16 x, y; + } white_point; + __u16 max_display_mastering_luminance; + __u16 min_display_mastering_luminance; + __u16 max_cll; + __u16 max_fall; +}; + +struct hdr_output_metadata { + __u32 metadata_type; + union { + struct hdr_metadata_infoframe hdmi_metadata_type1; + }; +}; + #define DRM_MODE_PAGE_FLIP_EVENT 0x01 #define DRM_MODE_PAGE_FLIP_ASYNC 0x02 #define DRM_MODE_PAGE_FLIP_TARGET_ABSOLUTE 0x4 -- cgit v1.2.3-59-g8ed1b From 2cdbfd66a82969770ce1a7032fb1e2155a08cee8 Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Thu, 16 May 2019 19:40:09 +0530 Subject: drm: Enable HDR infoframe support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable Dynamic Range and Mastering Infoframe for HDR content, which is defined in CEA 861.3 spec. The metadata will be computed based on blending policy in userspace compositors and passed as a connector property blob to driver. The same will be sent as infoframe to panel which support HDR. Added the const version of infoframe for DRM metadata for HDR. v2: Rebase and added Ville's POC changes. v3: No Change v4: Addressed Shashank's review comments and merged the patch making drm infoframe function arguments as constant. v5: Rebase v6: Fixed checkpatch warnings with --strict option. Addressed Shashank's review comments and added his RB. v7: Addressed Brian Starkey's review comments. Merged 2 patches into one. v8: Addressed Jonas Karlman review comments. v9: Addressed Jonas Karlman review comments. v10: Addressed Ville's review comments. v11: Added BUILD_BUG_ON and sizeof instead of magic numbers as per Ville's comments. 
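Ahead of the diff, a hedged sketch of how an HDMI encoder driver might
consume the helpers added by this patch; the encoder hook and the final
register write are hypothetical.

static void my_encoder_set_hdr_infoframe(struct drm_encoder *encoder,
					 const struct drm_connector_state *conn_state)
{
	struct hdmi_drm_infoframe frame;
	u8 buf[HDMI_INFOFRAME_SIZE(DRM)];
	ssize_t len;

	if (drm_hdmi_infoframe_set_hdr_metadata(&frame, conn_state))
		return;		/* no usable HDR metadata in this state */

	len = hdmi_drm_infoframe_pack(&frame, buf, sizeof(buf));
	if (len < 0)
		return;

	/* Write buf[0..len-1] into the hardware infoframe buffer here. */
}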
Signed-off-by: Uma Shankar
Reviewed-by: Shashank Sharma
Signed-off-by: Ville Syrjälä
Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-5-git-send-email-uma.shankar@intel.com
---
 drivers/gpu/drm/drm_edid.c |  72 +++++++++++++++++
 drivers/video/hdmi.c       | 190 +++++++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_edid.h     |   5 ++
 include/linux/hdmi.h       |  28 +++
 4 files changed, 295 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index a5ef9f45fee0..73560c9437cd 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -4904,6 +4904,78 @@ static bool is_hdmi2_sink(struct drm_connector *connector)
 		connector->display_info.color_formats & DRM_COLOR_FORMAT_YCRCB420;
 }
 
+static inline bool is_eotf_supported(u8 output_eotf, u8 sink_eotf)
+{
+	return sink_eotf & BIT(output_eotf);
+}
+
+/**
+ * drm_hdmi_infoframe_set_hdr_metadata() - fill an HDMI DRM infoframe with
+ * HDR metadata from userspace
+ * @frame: HDMI DRM infoframe
+ * @conn_state: connector state, carrying the HDR metadata blob from userspace
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int
+drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame,
+				    const struct drm_connector_state *conn_state)
+{
+	struct drm_connector *connector;
+	struct hdr_output_metadata *hdr_metadata;
+	int err;
+
+	if (!frame || !conn_state)
+		return -EINVAL;
+
+	connector = conn_state->connector;
+
+	if (!conn_state->hdr_output_metadata)
+		return -EINVAL;
+
+	hdr_metadata = conn_state->hdr_output_metadata->data;
+
+	if (!hdr_metadata || !connector)
+		return -EINVAL;
+
+	/* Sink EOTF is a bit map while the infoframe carries absolute values */
+	if (!is_eotf_supported(hdr_metadata->hdmi_metadata_type1.eotf,
+			       connector->hdr_sink_metadata.hdmi_type1.eotf)) {
+		DRM_DEBUG_KMS("EOTF Not Supported\n");
+		return -EINVAL;
+	}
+
+	err = hdmi_drm_infoframe_init(frame);
+	if (err < 0)
+		return err;
+
+	frame->eotf = hdr_metadata->hdmi_metadata_type1.eotf;
+	frame->metadata_type = hdr_metadata->hdmi_metadata_type1.metadata_type;
+
+	BUILD_BUG_ON(sizeof(frame->display_primaries) !=
+		     sizeof(hdr_metadata->hdmi_metadata_type1.display_primaries));
+	BUILD_BUG_ON(sizeof(frame->white_point) !=
+		     sizeof(hdr_metadata->hdmi_metadata_type1.white_point));
+
+	memcpy(&frame->display_primaries,
+	       &hdr_metadata->hdmi_metadata_type1.display_primaries,
+	       sizeof(frame->display_primaries));
+
+	memcpy(&frame->white_point,
+	       &hdr_metadata->hdmi_metadata_type1.white_point,
+	       sizeof(frame->white_point));
+
+	frame->max_display_mastering_luminance =
+		hdr_metadata->hdmi_metadata_type1.max_display_mastering_luminance;
+	frame->min_display_mastering_luminance =
+		hdr_metadata->hdmi_metadata_type1.min_display_mastering_luminance;
+	frame->max_fall = hdr_metadata->hdmi_metadata_type1.max_fall;
+	frame->max_cll = hdr_metadata->hdmi_metadata_type1.max_cll;
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_hdmi_infoframe_set_hdr_metadata);
+
 /**
  * drm_hdmi_avi_infoframe_from_display_mode() - fill an HDMI AVI infoframe with
  * data from a DRM display mode

diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c
index 799ae49774f5..481f0367dfd3 100644
--- a/drivers/video/hdmi.c
+++ b/drivers/video/hdmi.c
@@ -650,6 +650,150 @@ hdmi_vendor_any_infoframe_check_only(const union hdmi_vendor_any_infoframe *frame)
 	return 0;
 }
 
+/**
+ * hdmi_drm_infoframe_init() - initialize an HDMI Dynamic Range and
+ * mastering infoframe
+ * @frame: HDMI DRM infoframe
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */ +int hdmi_drm_infoframe_init(struct hdmi_drm_infoframe *frame) +{ + memset(frame, 0, sizeof(*frame)); + + frame->type = HDMI_INFOFRAME_TYPE_DRM; + frame->version = 1; + frame->length = HDMI_DRM_INFOFRAME_SIZE; + + return 0; +} +EXPORT_SYMBOL(hdmi_drm_infoframe_init); + +static int hdmi_drm_infoframe_check_only(const struct hdmi_drm_infoframe *frame) +{ + if (frame->type != HDMI_INFOFRAME_TYPE_DRM || + frame->version != 1) + return -EINVAL; + + if (frame->length != HDMI_DRM_INFOFRAME_SIZE) + return -EINVAL; + + return 0; +} + +/** + * hdmi_drm_infoframe_check() - check a HDMI DRM infoframe + * @frame: HDMI DRM infoframe + * + * Validates that the infoframe is consistent. + * Returns 0 on success or a negative error code on failure. + */ +int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame) +{ + return hdmi_drm_infoframe_check_only(frame); +} +EXPORT_SYMBOL(hdmi_drm_infoframe_check); + +/** + * hdmi_drm_infoframe_pack_only() - write HDMI DRM infoframe to binary buffer + * @frame: HDMI DRM infoframe + * @buffer: destination buffer + * @size: size of buffer + * + * Packs the information contained in the @frame structure into a binary + * representation that can be written into the corresponding controller + * registers. Also computes the checksum as required by section 5.3.5 of + * the HDMI 1.4 specification. + * + * Returns the number of bytes packed into the binary buffer or a negative + * error code on failure. + */ +ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame, + void *buffer, size_t size) +{ + u8 *ptr = buffer; + size_t length; + int i; + + length = HDMI_INFOFRAME_HEADER_SIZE + frame->length; + + if (size < length) + return -ENOSPC; + + memset(buffer, 0, size); + + ptr[0] = frame->type; + ptr[1] = frame->version; + ptr[2] = frame->length; + ptr[3] = 0; /* checksum */ + + /* start infoframe payload */ + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + *ptr++ = frame->eotf; + *ptr++ = frame->metadata_type; + + for (i = 0; i < 3; i++) { + *ptr++ = frame->display_primaries[i].x; + *ptr++ = frame->display_primaries[i].x >> 8; + *ptr++ = frame->display_primaries[i].y; + *ptr++ = frame->display_primaries[i].y >> 8; + } + + *ptr++ = frame->white_point.x; + *ptr++ = frame->white_point.x >> 8; + + *ptr++ = frame->white_point.y; + *ptr++ = frame->white_point.y >> 8; + + *ptr++ = frame->max_display_mastering_luminance; + *ptr++ = frame->max_display_mastering_luminance >> 8; + + *ptr++ = frame->min_display_mastering_luminance; + *ptr++ = frame->min_display_mastering_luminance >> 8; + + *ptr++ = frame->max_cll; + *ptr++ = frame->max_cll >> 8; + + *ptr++ = frame->max_fall; + *ptr++ = frame->max_fall >> 8; + + hdmi_infoframe_set_checksum(buffer, length); + + return length; +} +EXPORT_SYMBOL(hdmi_drm_infoframe_pack_only); + +/** + * hdmi_drm_infoframe_pack() - check a HDMI DRM infoframe, + * and write it to binary buffer + * @frame: HDMI DRM infoframe + * @buffer: destination buffer + * @size: size of buffer + * + * Validates that the infoframe is consistent and updates derived fields + * (eg. length) based on other fields, after which it packs the information + * contained in the @frame structure into a binary representation that + * can be written into the corresponding controller registers. This function + * also computes the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns the number of bytes packed into the binary buffer or a negative + * error code on failure. 
+ */ +ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame, + void *buffer, size_t size) +{ + int ret; + + ret = hdmi_drm_infoframe_check(frame); + if (ret) + return ret; + + return hdmi_drm_infoframe_pack_only(frame, buffer, size); +} +EXPORT_SYMBOL(hdmi_drm_infoframe_pack); + /* * hdmi_vendor_any_infoframe_check() - check a vendor infoframe */ @@ -758,6 +902,10 @@ hdmi_infoframe_pack_only(const union hdmi_infoframe *frame, void *buffer, size_t length = hdmi_avi_infoframe_pack_only(&frame->avi, buffer, size); break; + case HDMI_INFOFRAME_TYPE_DRM: + length = hdmi_drm_infoframe_pack_only(&frame->drm, + buffer, size); + break; case HDMI_INFOFRAME_TYPE_SPD: length = hdmi_spd_infoframe_pack_only(&frame->spd, buffer, size); @@ -806,6 +954,9 @@ hdmi_infoframe_pack(union hdmi_infoframe *frame, case HDMI_INFOFRAME_TYPE_AVI: length = hdmi_avi_infoframe_pack(&frame->avi, buffer, size); break; + case HDMI_INFOFRAME_TYPE_DRM: + length = hdmi_drm_infoframe_pack(&frame->drm, buffer, size); + break; case HDMI_INFOFRAME_TYPE_SPD: length = hdmi_spd_infoframe_pack(&frame->spd, buffer, size); break; @@ -838,6 +989,8 @@ static const char *hdmi_infoframe_type_get_name(enum hdmi_infoframe_type type) return "Source Product Description (SPD)"; case HDMI_INFOFRAME_TYPE_AUDIO: return "Audio"; + case HDMI_INFOFRAME_TYPE_DRM: + return "Dynamic Range and Mastering"; } return "Reserved"; } @@ -1284,6 +1437,40 @@ static void hdmi_audio_infoframe_log(const char *level, frame->downmix_inhibit ? "Yes" : "No"); } +/** + * hdmi_drm_infoframe_log() - log info of HDMI DRM infoframe + * @level: logging level + * @dev: device + * @frame: HDMI DRM infoframe + */ +static void hdmi_drm_infoframe_log(const char *level, + struct device *dev, + const struct hdmi_drm_infoframe *frame) +{ + int i; + + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + hdmi_log("length: %d\n", frame->length); + hdmi_log("metadata type: %d\n", frame->metadata_type); + hdmi_log("eotf: %d\n", frame->eotf); + for (i = 0; i < 3; i++) { + hdmi_log("x[%d]: %d\n", i, frame->display_primaries[i].x); + hdmi_log("y[%d]: %d\n", i, frame->display_primaries[i].y); + } + + hdmi_log("white point x: %d\n", frame->white_point.x); + hdmi_log("white point y: %d\n", frame->white_point.y); + + hdmi_log("max_display_mastering_luminance: %d\n", + frame->max_display_mastering_luminance); + hdmi_log("min_display_mastering_luminance: %d\n", + frame->min_display_mastering_luminance); + + hdmi_log("max_cll: %d\n", frame->max_cll); + hdmi_log("max_fall: %d\n", frame->max_fall); +} + static const char * hdmi_3d_structure_get_name(enum hdmi_3d_structure s3d_struct) { @@ -1372,6 +1559,9 @@ void hdmi_infoframe_log(const char *level, case HDMI_INFOFRAME_TYPE_VENDOR: hdmi_vendor_any_infoframe_log(level, dev, &frame->vendor); break; + case HDMI_INFOFRAME_TYPE_DRM: + hdmi_drm_infoframe_log(level, dev, &frame->drm); + break; } } EXPORT_SYMBOL(hdmi_infoframe_log); diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h index 9d3b5b93102c..0e21e91c4314 100644 --- a/include/drm/drm_edid.h +++ b/include/drm/drm_edid.h @@ -25,6 +25,7 @@ #include #include +#include struct drm_device; struct i2c_adapter; @@ -370,6 +371,10 @@ drm_hdmi_avi_infoframe_quant_range(struct hdmi_avi_infoframe *frame, const struct drm_display_mode *mode, enum hdmi_quantization_range rgb_quant_range); +int +drm_hdmi_infoframe_set_hdr_metadata(struct hdmi_drm_infoframe *frame, + const struct drm_connector_state *conn_state); + /** * drm_eld_mnl - Get ELD monitor name length in 
bytes. * @eld: pointer to an eld memory structure with mnl set diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 6780476dcbff..bcf3c6c3499e 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -47,6 +47,7 @@ enum hdmi_infoframe_type { HDMI_INFOFRAME_TYPE_AVI = 0x82, HDMI_INFOFRAME_TYPE_SPD = 0x83, HDMI_INFOFRAME_TYPE_AUDIO = 0x84, + HDMI_INFOFRAME_TYPE_DRM = 0x87, }; #define HDMI_IEEE_OUI 0x000c03 @@ -55,6 +56,7 @@ enum hdmi_infoframe_type { #define HDMI_AVI_INFOFRAME_SIZE 13 #define HDMI_SPD_INFOFRAME_SIZE 25 #define HDMI_AUDIO_INFOFRAME_SIZE 10 +#define HDMI_DRM_INFOFRAME_SIZE 26 #define HDMI_INFOFRAME_SIZE(type) \ (HDMI_INFOFRAME_HEADER_SIZE + HDMI_ ## type ## _INFOFRAME_SIZE) @@ -185,12 +187,37 @@ struct hdmi_avi_infoframe { unsigned short right_bar; }; +/* DRM Infoframe as per CTA 861.G spec */ +struct hdmi_drm_infoframe { + enum hdmi_infoframe_type type; + unsigned char version; + unsigned char length; + enum hdmi_eotf eotf; + enum hdmi_metadata_type metadata_type; + struct { + u16 x, y; + } display_primaries[3]; + struct { + u16 x, y; + } white_point; + u16 max_display_mastering_luminance; + u16 min_display_mastering_luminance; + u16 max_cll; + u16 max_fall; +}; + int hdmi_avi_infoframe_init(struct hdmi_avi_infoframe *frame); ssize_t hdmi_avi_infoframe_pack(struct hdmi_avi_infoframe *frame, void *buffer, size_t size); ssize_t hdmi_avi_infoframe_pack_only(const struct hdmi_avi_infoframe *frame, void *buffer, size_t size); int hdmi_avi_infoframe_check(struct hdmi_avi_infoframe *frame); +int hdmi_drm_infoframe_init(struct hdmi_drm_infoframe *frame); +ssize_t hdmi_drm_infoframe_pack(struct hdmi_drm_infoframe *frame, void *buffer, + size_t size); +ssize_t hdmi_drm_infoframe_pack_only(const struct hdmi_drm_infoframe *frame, + void *buffer, size_t size); +int hdmi_drm_infoframe_check(struct hdmi_drm_infoframe *frame); enum hdmi_spd_sdi { HDMI_SPD_SDI_UNKNOWN, @@ -381,6 +408,7 @@ union hdmi_infoframe { struct hdmi_spd_infoframe spd; union hdmi_vendor_any_infoframe vendor; struct hdmi_audio_infoframe audio; + struct hdmi_drm_infoframe drm; }; ssize_t hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer, -- cgit v1.2.3-59-g8ed1b From b5e3eed1eeb363c148e2935d9d3c12c30a280de6 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 16 May 2019 19:40:12 +0530 Subject: drm: Add HLG EOTF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ADD HLG EOTF to the list of EOTF transfer functions supported. Hybrid Log-Gamma (HLG) is a high dynamic range (HDR) standard. HLG defines a nonlinear transfer function in which the lower half of the signal values use a gamma curve and the upper half of the signal values use a logarithmic curve. v2: Rebase v3: Fixed a warning message v4: Addressed Shashank's review comments v5: Addressed Jonas Karlman's review comment and dropped the i915 tag from header. 
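The sink advertises the EOTFs it supports as a bitmask parsed from the
CTA-861.G HDR static metadata block, while the infoframe carries one
absolute EOTF value; a small sketch of the containment test (mirroring
the is_eotf_supported() helper above), with sink_supports_eotf() being
a made-up name:

static bool sink_supports_eotf(const struct drm_connector *connector,
			       enum hdmi_eotf eotf)
{
	/* hdmi_type1.eotf is a bitmap of BIT(enum hdmi_eotf) flags */
	return connector->hdr_sink_metadata.hdmi_type1.eotf & BIT(eotf);
}

A caller would test, e.g., sink_supports_eotf(connector,
HDMI_EOTF_BT_2100_HLG) once the HLG bit added below is parsed from the
EDID.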
Signed-off-by: Ville Syrjälä Signed-off-by: Uma Shankar Reviewed-by: Shashank Sharma Link: https://patchwork.freedesktop.org/patch/msgid/1558015817-12025-8-git-send-email-uma.shankar@intel.com --- drivers/gpu/drm/drm_edid.c | 3 ++- include/linux/hdmi.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 73560c9437cd..262510c2a670 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -3854,7 +3854,8 @@ static uint8_t eotf_supported(const u8 *edid_ext) return edid_ext[2] & (BIT(HDMI_EOTF_TRADITIONAL_GAMMA_SDR) | BIT(HDMI_EOTF_TRADITIONAL_GAMMA_HDR) | - BIT(HDMI_EOTF_SMPTE_ST2084)); + BIT(HDMI_EOTF_SMPTE_ST2084) | + BIT(HDMI_EOTF_BT_2100_HLG)); } static uint8_t hdr_metadata_type(const u8 *edid_ext) diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index bcf3c6c3499e..ee55ba589cdc 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -162,6 +162,7 @@ enum hdmi_eotf { HDMI_EOTF_TRADITIONAL_GAMMA_SDR, HDMI_EOTF_TRADITIONAL_GAMMA_HDR, HDMI_EOTF_SMPTE_ST2084, + HDMI_EOTF_BT_2100_HLG, }; struct hdmi_avi_infoframe { -- cgit v1.2.3-59-g8ed1b From c08e7e4c8a6f04e01d16117eb4a0077059ec2cd4 Mon Sep 17 00:00:00 2001 From: Guillaume La Roque Date: Tue, 14 May 2019 10:26:48 +0200 Subject: pinctrl: generic: add new 'drive-strength-microamp' property support Add drive-strength-microamp property support to allow drive strength in uA Signed-off-by: Guillaume La Roque Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 2 ++ include/linux/pinctrl/pinconf-generic.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index b4f7f8a458ea..d0cbdb1ad76a 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -39,6 +39,7 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_DRIVE_OPEN_SOURCE, "output drive open source", NULL, false), PCONFDUMP(PIN_CONFIG_DRIVE_PUSH_PULL, "output drive push pull", NULL, false), PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH, "output drive strength", "mA", true), + PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH_UA, "output drive strength", "uA", true), PCONFDUMP(PIN_CONFIG_INPUT_DEBOUNCE, "input debounce", "usec", true), PCONFDUMP(PIN_CONFIG_INPUT_ENABLE, "input enabled", NULL, false), PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT, "input schmitt trigger", NULL, false), @@ -167,6 +168,7 @@ static const struct pinconf_generic_params dt_params[] = { { "drive-open-source", PIN_CONFIG_DRIVE_OPEN_SOURCE, 0 }, { "drive-push-pull", PIN_CONFIG_DRIVE_PUSH_PULL, 0 }, { "drive-strength", PIN_CONFIG_DRIVE_STRENGTH, 0 }, + { "drive-strength-microamp", PIN_CONFIG_DRIVE_STRENGTH_UA, 0 }, { "input-debounce", PIN_CONFIG_INPUT_DEBOUNCE, 0 }, { "input-disable", PIN_CONFIG_INPUT_ENABLE, 0 }, { "input-enable", PIN_CONFIG_INPUT_ENABLE, 1 }, diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 6c0680641108..72d06d6a3099 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -55,6 +55,8 @@ * push-pull mode, the argument is ignored. * @PIN_CONFIG_DRIVE_STRENGTH: the pin will sink or source at most the current * passed as argument. The argument is in mA. + * @PIN_CONFIG_DRIVE_STRENGTH_UA: the pin will sink or source at most the current + * passed as argument. The argument is in uA. 
 * @PIN_CONFIG_INPUT_DEBOUNCE: this will configure the pin to debounce mode,
 *	which means it will wait for signals to settle when reading inputs. The
 *	argument gives the debounce time in usecs. Setting the
@@ -112,6 +114,7 @@ enum pin_config_param {
 	PIN_CONFIG_DRIVE_OPEN_SOURCE,
 	PIN_CONFIG_DRIVE_PUSH_PULL,
 	PIN_CONFIG_DRIVE_STRENGTH,
+	PIN_CONFIG_DRIVE_STRENGTH_UA,
 	PIN_CONFIG_INPUT_DEBOUNCE,
 	PIN_CONFIG_INPUT_ENABLE,
 	PIN_CONFIG_INPUT_SCHMITT,
-- cgit v1.2.3-59-g8ed1b


From 036f394dd77f8117346874151793ec38967d843f Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard
Date: Wed, 22 May 2019 17:29:24 +0200
Subject: pinctrl: Enable device link creation for pin control

A pin controller may want to create a link between itself and its
clients to be sure of suspend/resume call ordering. Introduce a
link_consumers field in the pinctrl_desc structure to let the pinctrl
core know that the controller expects to create a link.

Signed-off-by: Benjamin Gaignard
[Renamed create_link to link_consumers]
Signed-off-by: Linus Walleij
---
 drivers/pinctrl/core.c          | 11 +++++++++++
 include/linux/pinctrl/pinctrl.h |  5 +++++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index c6ff4d5fa482..d757c51d7114 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -1216,6 +1216,15 @@ struct pinctrl_state *pinctrl_lookup_state(struct pinctrl *p,
 }
 EXPORT_SYMBOL_GPL(pinctrl_lookup_state);
 
+static void pinctrl_link_add(struct pinctrl_dev *pctldev,
+			     struct device *consumer)
+{
+	if (pctldev->desc->link_consumers)
+		device_link_add(consumer, pctldev->dev,
+				DL_FLAG_PM_RUNTIME |
+				DL_FLAG_AUTOREMOVE_CONSUMER);
+}
+
 /**
  * pinctrl_commit_state() - select/activate/program a pinctrl state to HW
  * @p: the pinctrl handle for the device that requests configuration
@@ -1261,6 +1270,8 @@ static int pinctrl_commit_state(struct pinctrl *p, struct pinctrl_state *state)
 		if (ret < 0) {
 			goto unapply_new_state;
 		}
+
+		pinctrl_link_add(setting->pctldev, p->dev);
 	}
 
 	p->state = state;

diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 8f5dbb84547a..2744113f1024 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -125,6 +125,10 @@ struct pinctrl_ops {
 *	the hardware description
 * @custom_conf_items: Information how to print @params in debugfs, must be
 *	the same size as the @custom_params, i.e. @num_custom_params
+ * @link_consumers: If true, create a device link between pinctrl and its
+ *	consumers (i.e. the devices requesting pin control states). This is
+ *	sometimes necessary to ascertain the right suspend/resume order for
+ *	example.
 */
 struct pinctrl_desc {
 	const char *name;
@@ -139,6 +143,7 @@ struct pinctrl_desc {
 	const struct pinconf_generic_params *custom_params;
 	const struct pin_config_item *custom_conf_items;
 #endif
+	bool link_consumers;
 };
 
 /* External interface to pin controller */
-- cgit v1.2.3-59-g8ed1b


From 59fcdce425b7c947ccea03a16e393af9bb4d6262 Mon Sep 17 00:00:00 2001
From: Stephen Boyd
Date: Thu, 23 May 2019 17:05:59 -0700
Subject: clk: Remove ifdef for COMMON_CLK in clk-provider.h

This ifdef has been there since the beginning of this file, but it
doesn't really seem to serve any purpose besides obfuscating the struct
definitions and #defines here from compilation units that include it.
Let's always expose these function prototypes and struct definitions so
that code can inspect clk providers without needing to have
CONFIG_COMMON_CLK enabled.
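As a hedged illustration of what the removal buys (my_pll is made up):
code that merely embeds or inspects clk provider types can now be
compiled whether or not CONFIG_COMMON_CLK is set.

#include <linux/clk-provider.h>

struct my_pll {
	struct clk_hw hw;	/* definition now visible unconditionally */
	void __iomem *base;
};

#define to_my_pll(_hw) container_of(_hw, struct my_pll, hw)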
Signed-off-by: Stephen Boyd --- include/linux/clk-provider.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index bb6118f79784..3bced2ec9f26 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -9,8 +9,6 @@ #include #include -#ifdef CONFIG_COMMON_CLK - /* * flags used across common struct clk. these flags should only affect the * top-level framework. custom flags for dealing with hardware specifics @@ -1019,5 +1017,4 @@ static inline int of_clk_detect_critical(struct device_node *np, int index, void clk_gate_restore_context(struct clk_hw *hw); -#endif /* CONFIG_COMMON_CLK */ #endif /* CLK_PROVIDER_H */ -- cgit v1.2.3-59-g8ed1b From 30d5a945743cd05ec5c847f2e38c2fbda5e00944 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 23 May 2019 17:11:57 -0700 Subject: clk: Unexport __clk_of_table This symbol doesn't need to be exported to clk providers anymore. Originally, it was hidden inside clk.c, but then OMAP needed to get access to it in commit 819b4861c18d ("CLK: ti: add init support for clock IP blocks"), but eventually that code also changed in commit c08ee14cc663 ("clk: ti: change clock init to use generic of_clk_init") and we were left with this exported. Move this back into clk.c so that it isn't exposed anymore. Signed-off-by: Stephen Boyd --- drivers/clk/clk.c | 1 + include/linux/clk-provider.h | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index aa51756fd4d6..b34e84bb8167 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -4038,6 +4038,7 @@ struct of_clk_provider { void *data; }; +extern struct of_device_id __clk_of_table; static const struct of_device_id __clk_of_table_sentinel __used __section(__clk_of_table_end); diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 3bced2ec9f26..9ba000e3a50d 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -865,8 +865,6 @@ static inline long divider_ro_round_rate(struct clk_hw *hw, unsigned long rate, */ unsigned long clk_hw_round_rate(struct clk_hw *hw, unsigned long rate); -struct of_device_id; - struct clk_onecell_data { struct clk **clks; unsigned int clk_num; @@ -877,8 +875,6 @@ struct clk_hw_onecell_data { struct clk_hw *hws[]; }; -extern struct of_device_id __clk_of_table; - #define CLK_OF_DECLARE(name, compat, fn) OF_DECLARE_1(clk, name, compat, fn) /* -- cgit v1.2.3-59-g8ed1b From 1f58bb18f6f28d1df0b7144d90bc90ee5672416d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 May 2019 13:44:57 +0100 Subject: mount_pseudo(): drop 'name' argument, switch to d_make_root() Once upon a time we used to set ->d_name of e.g. pipefs root so that d_path() on pipes would work. These days it's completely pointless - dentries of pipes are not even connected to pipefs root. However, mount_pseudo() had set the root dentry name (passed as the second argument) and callers kept inventing names to pass to it. Including those that didn't *have* any non-root dentries to start with... All of that had been pointless for about 8 years now; it's time to get rid of that cargo-culting... 
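To make the new calling convention concrete, a hypothetical caller
after this change (examplefs_sops and EXAMPLEFS_MAGIC are invented):

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	/* No fake "examplefs:" root name any more - just ops and magic. */
	return mount_pseudo(fs_type, &examplefs_sops, NULL, EXAMPLEFS_MAGIC);
}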
Signed-off-by: Al Viro --- arch/ia64/kernel/perfmon.c | 2 +- drivers/dax/super.c | 2 +- drivers/gpu/drm/drm_drv.c | 6 +----- drivers/misc/cxl/api.c | 3 +-- drivers/scsi/cxlflash/ocxl_hw.c | 3 +-- drivers/virtio/virtio_balloon.c | 3 +-- fs/aio.c | 3 +-- fs/anon_inodes.c | 4 ++-- fs/block_dev.c | 2 +- fs/btrfs/tests/btrfs-tests.c | 2 +- fs/libfs.c | 12 +++--------- fs/nsfs.c | 2 +- fs/pipe.c | 2 +- include/linux/fs.h | 6 +++--- mm/z3fold.c | 2 +- mm/zsmalloc.c | 2 +- net/socket.c | 2 +- 17 files changed, 22 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 7a969f4c3534..a30da6f2c28e 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -602,7 +602,7 @@ static const struct dentry_operations pfmfs_dentry_operations; static struct dentry * pfmfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "pfm:", NULL, &pfmfs_dentry_operations, + return mount_pseudo(fs_type, NULL, &pfmfs_dentry_operations, PFMFS_MAGIC); } diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 35f051efaf35..f83814eea5ad 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -440,7 +440,7 @@ static const struct super_operations dax_sops = { static struct dentry *dax_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); + return mount_pseudo(fs_type, &dax_sops, NULL, DAXFS_MAGIC); } static struct file_system_type dax_fs_type = { diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c index 8b44ac9a92ae..48365c62a190 100644 --- a/drivers/gpu/drm/drm_drv.c +++ b/drivers/gpu/drm/drm_drv.c @@ -535,11 +535,7 @@ static struct vfsmount *drm_fs_mnt; static struct dentry *drm_fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, - "drm:", - NULL, - NULL, - 0x010203ff); + return mount_pseudo(fs_type, NULL, NULL, 0x010203ff); } static struct file_system_type drm_fs_type = { diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index a59c7af79873..1f2b0535a8cf 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -40,8 +40,7 @@ static struct vfsmount *cxl_vfs_mount; static struct dentry *cxl_fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "cxl:", NULL, NULL, - CXL_PSEUDO_FS_MAGIC); + return mount_pseudo(fs_type, NULL, NULL, CXL_PSEUDO_FS_MAGIC); } static struct file_system_type cxl_fs_type = { diff --git a/drivers/scsi/cxlflash/ocxl_hw.c b/drivers/scsi/cxlflash/ocxl_hw.c index 31cfdf2c8c30..38e1fbd2b406 100644 --- a/drivers/scsi/cxlflash/ocxl_hw.c +++ b/drivers/scsi/cxlflash/ocxl_hw.c @@ -48,8 +48,7 @@ static struct dentry *ocxlflash_fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "ocxlflash:", NULL, NULL, - OCXLFLASH_FS_MAGIC); + return mount_pseudo(fs_type, NULL, NULL, OCXLFLASH_FS_MAGIC); } static struct file_system_type ocxlflash_fs_type = { diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 554d1a98d193..62bafc4f2662 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -761,8 +761,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, static struct dentry *balloon_mount(struct file_system_type *fs_type, int flags, 
const char *dev_name, void *data) { - return mount_pseudo(fs_type, "balloon-kvm:", NULL, NULL, - BALLOON_KVM_MAGIC); + return mount_pseudo(fs_type, NULL, NULL, BALLOON_KVM_MAGIC); } static struct file_system_type balloon_fs = { diff --git a/fs/aio.c b/fs/aio.c index 3490d1fa0e16..09bc35fa6810 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -252,8 +252,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) static struct dentry *aio_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL, - AIO_RING_MAGIC); + struct dentry *root = mount_pseudo(fs_type, NULL, NULL, AIO_RING_MAGIC); if (!IS_ERR(root)) root->d_sb->s_iflags |= SB_I_NOEXEC; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 91262c34b797..644d0837aafe 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -41,8 +41,8 @@ static const struct dentry_operations anon_inodefs_dentry_operations = { static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "anon_inode:", NULL, - &anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC); + return mount_pseudo(fs_type, NULL, &anon_inodefs_dentry_operations, + ANON_INODE_FS_MAGIC); } static struct file_system_type anon_inode_fs_type = { diff --git a/fs/block_dev.c b/fs/block_dev.c index 0f7552a87d54..3143da7b0998 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -837,7 +837,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct dentry *dent; - dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC); + dent = mount_pseudo(fs_type, &bdev_sops, NULL, BDEVFS_MAGIC); if (!IS_ERR(dent)) dent->d_sb->s_iflags |= SB_I_CGROUPWB; return dent; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9238fd4f1734..6da54323eaf8 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -36,7 +36,7 @@ static struct dentry *btrfs_test_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops, + return mount_pseudo(fs_type, &btrfs_test_super_ops, NULL, BTRFS_TEST_MAGIC); } diff --git a/fs/libfs.c b/fs/libfs.c index 4b59b1816efb..030e545f586e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -239,14 +239,12 @@ static const struct super_operations simple_super_operations = { * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ -struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, +struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, const struct super_operations *ops, const struct xattr_handler **xattr, const struct dentry_operations *dops, unsigned long magic) { struct super_block *s; - struct dentry *dentry; struct inode *root; - struct qstr d_name = QSTR_INIT(name, strlen(name)); s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, &init_user_ns, NULL); @@ -271,13 +269,9 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; root->i_atime = root->i_mtime = root->i_ctime = current_time(root); - dentry = __d_alloc(s, &d_name); - if (!dentry) { - iput(root); + s->s_root = d_make_root(root); + if (!s->s_root) goto Enomem; - } - d_instantiate(dentry, root); - s->s_root = dentry; s->s_d_op = dops; 
s->s_flags |= SB_ACTIVE; return dget(s->s_root); diff --git a/fs/nsfs.c b/fs/nsfs.c index e3bf08c5af41..b3c49ddc0f85 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -261,7 +261,7 @@ static const struct super_operations nsfs_ops = { static struct dentry *nsfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "nsfs:", &nsfs_ops, + return mount_pseudo(fs_type, &nsfs_ops, &ns_dentry_operations, NSFS_MAGIC); } static struct file_system_type nsfs = { diff --git a/fs/pipe.c b/fs/pipe.c index 41065901106b..99a023730e6f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1185,7 +1185,7 @@ static const struct super_operations pipefs_ops = { static struct dentry *pipefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "pipe:", &pipefs_ops, + return mount_pseudo(fs_type, &pipefs_ops, &pipefs_dentry_operations, PIPEFS_MAGIC); } diff --git a/include/linux/fs.h b/include/linux/fs.h index f7fdfe93e25d..b06251dd429f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,18 +2257,18 @@ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); -extern struct dentry *mount_pseudo_xattr(struct file_system_type *, char *, +extern struct dentry *mount_pseudo_xattr(struct file_system_type *, const struct super_operations *ops, const struct xattr_handler **xattr, const struct dentry_operations *dops, unsigned long); static inline struct dentry * -mount_pseudo(struct file_system_type *fs_type, char *name, +mount_pseudo(struct file_system_type *fs_type, const struct super_operations *ops, const struct dentry_operations *dops, unsigned long magic) { - return mount_pseudo_xattr(fs_type, name, ops, NULL, dops, magic); + return mount_pseudo_xattr(fs_type, ops, NULL, dops, magic); } /* Alas, no aliases. 
Too much hassle with bringing module.h everywhere */ diff --git a/mm/z3fold.c b/mm/z3fold.c index 0b14daf930a8..abeb5bcbea57 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -242,7 +242,7 @@ static inline void free_handle(unsigned long handle) static struct dentry *z3fold_do_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "z3fold:", NULL, NULL, 0x33); + return mount_pseudo(fs_type, NULL, NULL, 0x33); } static struct file_system_type z3fold_fs = { diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d9f831f63625..ef230be8c03e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1817,7 +1817,7 @@ static void lock_zspage(struct zspage *zspage) static struct dentry *zs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo(fs_type, "zsmalloc:", NULL, NULL, ZSMALLOC_MAGIC); + return mount_pseudo(fs_type, NULL, NULL, ZSMALLOC_MAGIC); } static struct file_system_type zsmalloc_fs = { diff --git a/net/socket.c b/net/socket.c index 472fbefa5d9b..c86679584eed 100644 --- a/net/socket.c +++ b/net/socket.c @@ -362,7 +362,7 @@ static const struct xattr_handler *sockfs_xattr_handlers[] = { static struct dentry *sockfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_pseudo_xattr(fs_type, "socket:", &sockfs_ops, + return mount_pseudo_xattr(fs_type, &sockfs_ops, sockfs_xattr_handlers, &sockfs_dentry_operations, SOCKFS_MAGIC); } -- cgit v1.2.3-59-g8ed1b From bb7b6b2bbdb827e68cd506c8f5e3ba13215cccb2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:28 +0000 Subject: vfs: Kill mount_ns() Kill mount_ns() as it has been replaced by vfs_get_super() in the new mount API. Signed-off-by: David Howells cc: linux-fsdevel@vger.kernel.org Signed-off-by: Al Viro --- fs/super.c | 38 -------------------------------------- include/linux/fs.h | 3 --- 2 files changed, 41 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 3ba91d70c2a8..6919f5c728f0 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1143,44 +1143,6 @@ void kill_litter_super(struct super_block *sb) } EXPORT_SYMBOL(kill_litter_super); -static int ns_test_super(struct super_block *sb, void *data) -{ - return sb->s_fs_info == data; -} - -static int ns_set_super(struct super_block *sb, void *data) -{ - sb->s_fs_info = data; - return set_anon_super(sb, NULL); -} - -struct dentry *mount_ns(struct file_system_type *fs_type, - int flags, void *data, void *ns, struct user_namespace *user_ns, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *sb; - - sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags, - user_ns, ns); - if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (!sb->s_root) { - int err; - err = fill_super(sb, data, flags & SB_SILENT ? 
1 : 0); - if (err) { - deactivate_locked_super(sb); - return ERR_PTR(err); - } - - sb->s_flags |= SB_ACTIVE; - } - - return dget(sb->s_root); -} - -EXPORT_SYMBOL(mount_ns); - int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); diff --git a/include/linux/fs.h b/include/linux/fs.h index b06251dd429f..790342cf4df9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2206,9 +2206,6 @@ struct file_system_type { #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) -extern struct dentry *mount_ns(struct file_system_type *fs_type, - int flags, void *data, void *ns, struct user_namespace *user_ns, - int (*fill_super)(struct super_block *, void *, int)); #ifdef CONFIG_BLOCK extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, -- cgit v1.2.3-59-g8ed1b From c80fa7c8301c10ad10d997b9e86b4aeac5923b3e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:23 +0000 Subject: vfs: Provide sb->s_iflags settings in fs_context struct Provide a field in the fs_context struct through which bits in the sb->s_iflags superblock field can be set. Signed-off-by: David Howells cc: linux-fsdevel@vger.kernel.org --- fs/super.c | 1 + include/linux/fs_context.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 72b4a5afcfd6..f836b67abffe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -540,6 +540,7 @@ retry: } fc->s_fs_info = NULL; s->s_type = fc->fs_type; + s->s_iflags |= fc->s_iflags; strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 1f966670c8dc..c995b852ba40 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -103,6 +103,7 @@ struct fs_context { void *s_fs_info; /* Proposed s_fs_info */ unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ unsigned int sb_flags_mask; /* Superblock flags that were changed */ + unsigned int s_iflags; /* OR'd with sb->s_iflags */ unsigned int lsm_flags; /* Information flags from the fs to the LSM */ enum fs_context_purpose purpose:8; enum fs_context_phase phase:8; /* The phase the context is in */ -- cgit v1.2.3-59-g8ed1b From 31d6d5ce53400d6dc58e29ddd8dc184b3ba89d66 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:23 +0000 Subject: vfs: Provide a mount_pseudo-replacement for the new mount API Provide a function, init_pseudo(), that provides a common infrastructure for converting pseudo-filesystems that can never be mountable. 
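A hedged sketch of such a conversion using the new helper; the
examplefs names are invented, and kill_anon_super() is the usual
teardown for anonymous superblocks.

static int examplefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, EXAMPLEFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	ctx->ops = &examplefs_sops;	/* optional; defaults apply if left NULL */
	return 0;
}

static struct file_system_type examplefs_fs_type = {
	.name		 = "examplefs",
	.init_fs_context = examplefs_init_fs_context,
	.kill_sb	 = kill_anon_super,
};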
[AV: once all users of mount_pseudo_xattr() get converted, it will be folded into pseudo_fs_get_tree()] Signed-off-by: David Howells cc: linux-fsdevel@vger.kernel.org --- fs/libfs.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pseudo_fs.h | 16 ++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 include/linux/pseudo_fs.h (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index 030e545f586e..edef70d35438 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -16,6 +16,8 @@ #include #include #include /* sync_mapping_buffers */ +#include +#include #include @@ -235,6 +237,50 @@ static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; +static int pseudo_fs_get_tree(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx = fc->fs_private; + struct dentry *root; + + root = mount_pseudo_xattr(fc->fs_type, + ctx->ops, ctx->xattr, + ctx->dops, ctx->magic); + if (IS_ERR(root)) + return PTR_ERR(root); + + fc->root = root; + return 0; +} + +static void pseudo_fs_free(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations pseudo_fs_context_ops = { + .free = pseudo_fs_free, + .get_tree = pseudo_fs_get_tree, +}; + +/* + * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that + * will never be mountable) + */ +struct pseudo_fs_context *init_pseudo(struct fs_context *fc, + unsigned long magic) +{ + struct pseudo_fs_context *ctx; + + ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); + if (likely(ctx)) { + ctx->magic = magic; + fc->fs_private = ctx; + fc->ops = &pseudo_fs_context_ops; + } + return ctx; +} +EXPORT_SYMBOL(init_pseudo); + /* * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h new file mode 100644 index 000000000000..eceda1d1407a --- /dev/null +++ b/include/linux/pseudo_fs.h @@ -0,0 +1,16 @@ +#ifndef __LINUX_PSEUDO_FS__ +#define __LINUX_PSEUDO_FS__ + +#include + +struct pseudo_fs_context { + const struct super_operations *ops; + const struct xattr_handler **xattr; + const struct dentry_operations *dops; + unsigned long magic; +}; + +struct pseudo_fs_context *init_pseudo(struct fs_context *fc, + unsigned long magic); + +#endif -- cgit v1.2.3-59-g8ed1b From 8d9e46d80777b484f8f0945c317ad618224d7811 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 May 2019 11:43:59 -0400 Subject: fold mount_pseudo_xattr() into pseudo_fs_get_tree() ... 
now that all other callers are gone Signed-off-by: Al Viro --- fs/libfs.c | 88 +++++++++++++++++++++--------------------------------- include/linux/fs.h | 13 -------- 2 files changed, 34 insertions(+), 67 deletions(-) (limited to 'include/linux') diff --git a/fs/libfs.c b/fs/libfs.c index edef70d35438..7df3c9a85f6b 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -240,16 +240,43 @@ static const struct super_operations simple_super_operations = { static int pseudo_fs_get_tree(struct fs_context *fc) { struct pseudo_fs_context *ctx = fc->fs_private; - struct dentry *root; + struct super_block *s; + struct inode *root; - root = mount_pseudo_xattr(fc->fs_type, - ctx->ops, ctx->xattr, - ctx->dops, ctx->magic); - if (IS_ERR(root)) - return PTR_ERR(root); + s = sget_userns(fc->fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, + &init_user_ns, NULL); + if (IS_ERR(s)) + return PTR_ERR(s); - fc->root = root; + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = ctx->magic; + s->s_op = ctx->ops ?: &simple_super_operations; + s->s_xattr = ctx->xattr; + s->s_time_gran = 1; + root = new_inode(s); + if (!root) + goto Enomem; + /* + * since this is the first inode, make it number 1. New inodes created + * after this must take care not to collide with it (by passing + * max_reserved of 1 to iunique). + */ + root->i_ino = 1; + root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; + root->i_atime = root->i_mtime = root->i_ctime = current_time(root); + s->s_root = d_make_root(root); + if (!s->s_root) + goto Enomem; + s->s_d_op = ctx->dops; + s->s_flags |= SB_ACTIVE; + fc->root = dget(s->s_root); return 0; + +Enomem: + deactivate_locked_super(s); + return -ENOMEM; } static void pseudo_fs_free(struct fs_context *fc) @@ -281,53 +308,6 @@ struct pseudo_fs_context *init_pseudo(struct fs_context *fc, } EXPORT_SYMBOL(init_pseudo); -/* - * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that - * will never be mountable) - */ -struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, - const struct super_operations *ops, const struct xattr_handler **xattr, - const struct dentry_operations *dops, unsigned long magic) -{ - struct super_block *s; - struct inode *root; - - s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, - &init_user_ns, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - - s->s_maxbytes = MAX_LFS_FILESIZE; - s->s_blocksize = PAGE_SIZE; - s->s_blocksize_bits = PAGE_SHIFT; - s->s_magic = magic; - s->s_op = ops ? ops : &simple_super_operations; - s->s_xattr = xattr; - s->s_time_gran = 1; - root = new_inode(s); - if (!root) - goto Enomem; - /* - * since this is the first inode, make it number 1. New inodes created - * after this must take care not to collide with it (by passing - * max_reserved of 1 to iunique). 
- */ - root->i_ino = 1; - root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_atime = root->i_mtime = root->i_ctime = current_time(root); - s->s_root = d_make_root(root); - if (!s->s_root) - goto Enomem; - s->s_d_op = dops; - s->s_flags |= SB_ACTIVE; - return dget(s->s_root); - -Enomem: - deactivate_locked_super(s); - return ERR_PTR(-ENOMEM); -} -EXPORT_SYMBOL(mount_pseudo_xattr); - int simple_open(struct inode *inode, struct file *file) { if (inode->i_private) diff --git a/include/linux/fs.h b/include/linux/fs.h index 790342cf4df9..d625acabbfcf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2254,19 +2254,6 @@ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); -extern struct dentry *mount_pseudo_xattr(struct file_system_type *, - const struct super_operations *ops, - const struct xattr_handler **xattr, - const struct dentry_operations *dops, - unsigned long); - -static inline struct dentry * -mount_pseudo(struct file_system_type *fs_type, - const struct super_operations *ops, - const struct dentry_operations *dops, unsigned long magic) -{ - return mount_pseudo_xattr(fs_type, ops, NULL, dops, magic); -} /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) \ -- cgit v1.2.3-59-g8ed1b From 023d066a0d0a87696c04b0de2ceae53063d0b655 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Mar 2019 16:38:28 +0000 Subject: vfs: Kill sget_userns() Kill sget_userns(), folding it into sget() as that's the only remaining user. Signed-off-by: David Howells cc: linux-fsdevel@vger.kernel.org --- fs/super.c | 54 ++++++++++++++++-------------------------------------- include/linux/fs.h | 5 ----- 2 files changed, 16 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index f836b67abffe..ca2302501d32 100644 --- a/fs/super.c +++ b/fs/super.c @@ -563,24 +563,31 @@ share_extant_sb: EXPORT_SYMBOL(sget_fc); /** - * sget_userns - find or create a superblock - * @type: filesystem type superblock should belong to - * @test: comparison callback - * @set: setup callback - * @flags: mount flags - * @user_ns: User namespace for the super_block - * @data: argument to each of them + * sget - find or create a superblock + * @type: filesystem type superblock should belong to + * @test: comparison callback + * @set: setup callback + * @flags: mount flags + * @data: argument to each of them */ -struct super_block *sget_userns(struct file_system_type *type, +struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), - int flags, struct user_namespace *user_ns, + int flags, void *data) { + struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; + /* We don't yet pass the user namespace of the parent + * mount through to here so always use &init_user_ns + * until that changes. 
+ */ + if (flags & SB_SUBMOUNT) + user_ns = &init_user_ns; + retry: spin_lock(&sb_lock); if (test) { @@ -621,35 +628,6 @@ retry: register_shrinker_prepared(&s->s_shrink); return s; } - -EXPORT_SYMBOL(sget_userns); - -/** - * sget - find or create a superblock - * @type: filesystem type superblock should belong to - * @test: comparison callback - * @set: setup callback - * @flags: mount flags - * @data: argument to each of them - */ -struct super_block *sget(struct file_system_type *type, - int (*test)(struct super_block *,void *), - int (*set)(struct super_block *,void *), - int flags, - void *data) -{ - struct user_namespace *user_ns = current_user_ns(); - - /* We don't yet pass the user namespace of the parent - * mount through to here so always use &init_user_ns - * until that changes. - */ - if (flags & SB_SUBMOUNT) - user_ns = &init_user_ns; - - return sget_userns(type, test, set, flags, user_ns, data); -} - EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) diff --git a/include/linux/fs.h b/include/linux/fs.h index d625acabbfcf..71421856ff2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2245,11 +2245,6 @@ void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); -struct super_block *sget_userns(struct file_system_type *type, - int (*test)(struct super_block *,void *), - int (*set)(struct super_block *,void *), - int flags, struct user_namespace *user_ns, - void *data); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), -- cgit v1.2.3-59-g8ed1b From 7375dca1647fa978310f2d706ddbff537f72110b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 20 May 2019 09:26:24 -0400 Subject: ftrace: Make enable and update parameters bool when applicable The code modification functions have "enable" and "update" variables that are sometimes "int" but used as "bool". Remove the ambiguity and make them "bool" when they are only used for true or false values. Link: http://lkml.kernel.org/r/e1429923d9eda92a3cf5ee9e33c7eacce539781d.1558115654.git.naveen.n.rao@linux.vnet.ibm.com Reported-by: "Naveen N. 
Rao" Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- kernel/trace/ftrace.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 25e2995d4a4c..8a8cb3c401b2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -427,8 +427,8 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); iter = ftrace_rec_iter_next(iter)) -int ftrace_update_record(struct dyn_ftrace *rec, int enable); -int ftrace_test_record(struct dyn_ftrace *rec, int enable); +int ftrace_update_record(struct dyn_ftrace *rec, bool enable); +int ftrace_test_record(struct dyn_ftrace *rec, bool enable); void ftrace_run_stop_machine(int command); unsigned long ftrace_location(unsigned long ip); unsigned long ftrace_location_range(unsigned long start, unsigned long end); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12aff849c04..4f2c26bebe2a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1768,7 +1768,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, count++; /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ - update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE; /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) @@ -2047,7 +2047,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } } -static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) { unsigned long flag = 0UL; @@ -2146,28 +2146,28 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /** * ftrace_update_record, set a record that now is tracing or not * @rec: the record to update - * @enable: set to 1 if the record is tracing, zero to force disable + * @enable: set to true if the record is tracing, false to force disable * * The records that represent all functions that can be traced need * to be updated when tracing has been enabled. */ -int ftrace_update_record(struct dyn_ftrace *rec, int enable) +int ftrace_update_record(struct dyn_ftrace *rec, bool enable) { - return ftrace_check_record(rec, enable, 1); + return ftrace_check_record(rec, enable, true); } /** * ftrace_test_record, check if the record has been enabled or not * @rec: the record to test - * @enable: set to 1 to check if enabled, 0 if it is disabled + * @enable: set to true to check if enabled, false if it is disabled * * The arch code may need to test if a record is already set to * tracing to determine how to modify the function code that it * represents. 
*/ -int ftrace_test_record(struct dyn_ftrace *rec, int enable) +int ftrace_test_record(struct dyn_ftrace *rec, bool enable) { - return ftrace_check_record(rec, enable, 0); + return ftrace_check_record(rec, enable, false); } static struct ftrace_ops * @@ -2356,7 +2356,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) } static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_old_addr; unsigned long ftrace_addr; @@ -2395,7 +2395,7 @@ void __weak ftrace_replace_code(int mod_flags) { struct dyn_ftrace *rec; struct ftrace_page *pg; - int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; + bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL; int failed; -- cgit v1.2.3-59-g8ed1b From 2d8d8fac3b4eab035dcd0068e1f5a746a697fbb3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:06 +0900 Subject: x86/uaccess: Allow access_ok() in irq context if pagefault_disabled WARN_ON_IN_IRQ() assumes that the access_ok() and following user memory access can sleep. But this assumption is not always correct: when pagefaults are disabled, the following memory access will simply return -EFAULT and never sleep. Add a pagefault_disabled() check in WARN_ON_ONCE() so that it ignores the case where it is called with pagefaults disabled. For this purpose, pagefault_disabled() is converted to an inline function. Link: http://lkml.kernel.org/r/155789868664.26965.7932665824135793317.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/uaccess.h | 4 +++- include/linux/uaccess.h | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c82abd6e4ca3..9c4435307ff8 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -66,7 +66,9 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un }) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP -# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +static inline bool pagefault_disabled(void); +# define WARN_ON_IN_IRQ() \ + WARN_ON_ONCE(!in_task() && !pagefault_disabled()) #else # define WARN_ON_IN_IRQ() #endif diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2b70130af585..5a43ef7db492 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -203,7 +203,10 @@ static inline void pagefault_enable(void) /* * Is the pagefault handler disabled? If so, user access methods will not sleep. */ -#define pagefault_disabled() (current->pagefault_disabled != 0) +static inline bool pagefault_disabled(void) +{ + return current->pagefault_disabled != 0; +} /* * The pagefault handler is in general disabled by pagefault_disable() or -- cgit v1.2.3-59-g8ed1b From 3d7081822f7f9eab867d9bcc8fd635208ec438e0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 15 May 2019 14:38:18 +0900 Subject: uaccess: Add non-pagefault user-space read functions Add probe_user_read(), strncpy_from_unsafe_user() and strnlen_unsafe_user(), which allow callers to access user space in IRQ context. The current probe_kernel_read() and strncpy_from_unsafe() are not usable on user-space memory, because they set KERNEL_DS while accessing the data. On some architectures, user address space and kernel address space can coexist, but on others they cannot.
In that case, setting KERNEL_DS means given address is treated as a kernel address space. Also strnlen_user() is only available from user context since it can sleep if pagefault is enabled. To access user-space memory without pagefault, we need these new functions which sets USER_DS while accessing the data. Link: http://lkml.kernel.org/r/155789869802.26965.4940338412595759063.stgit@devnote2 Acked-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/uaccess.h | 14 ++++++ mm/maccess.c | 122 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 130 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5a43ef7db492..9c435c3f2105 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -242,6 +242,17 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to, extern long probe_kernel_read(void *dst, const void *src, size_t size); extern long __probe_kernel_read(void *dst, const void *src, size_t size); +/* + * probe_user_read(): safely attempt to read from a location in user space + * @dst: pointer to the buffer that shall take the data + * @src: address to read from + * @size: size of the data chunk + * + * Safely read from address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ +extern long probe_user_read(void *dst, const void __user *src, size_t size); + /* * probe_kernel_write(): safely attempt to write to a location * @dst: address to write to @@ -255,6 +266,9 @@ extern long notrace probe_kernel_write(void *dst, const void *src, size_t size); extern long notrace __probe_kernel_write(void *dst, const void *src, size_t size); extern long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count); +extern long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, + long count); +extern long strnlen_unsafe_user(const void __user *unsafe_addr, long count); /** * probe_kernel_address(): safely attempt to read from a location diff --git a/mm/maccess.c b/mm/maccess.c index ec00be51a24f..19c8c3dc14df 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -5,8 +5,20 @@ #include #include +static __always_inline long +probe_read_common(void *dst, const void __user *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + /** - * probe_kernel_read(): safely attempt to read from a location + * probe_kernel_read(): safely attempt to read from a kernel-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from * @size: size of the data chunk @@ -29,16 +41,40 @@ long __probe_kernel_read(void *dst, const void *src, size_t size) mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, - (__force const void __user *)src, size); - pagefault_enable(); + ret = probe_read_common(dst, (__force const void __user *)src, size); set_fs(old_fs); - return ret ? -EFAULT : 0; + return ret; } EXPORT_SYMBOL_GPL(probe_kernel_read); +/** + * probe_user_read(): safely attempt to read from a user-space location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from. This must be a user address. + * @size: size of the data chunk + * + * Safely read from user address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. 
+ */ + +long __weak probe_user_read(void *dst, const void __user *src, size_t size) + __attribute__((alias("__probe_user_read"))); + +long __probe_user_read(void *dst, const void __user *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(src, size)) + ret = probe_read_common(dst, src, size); + set_fs(old_fs); + + return ret; +} +EXPORT_SYMBOL_GPL(probe_user_read); + /** * probe_kernel_write(): safely attempt to write to a location * @dst: address to write to @@ -66,6 +102,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) } EXPORT_SYMBOL_GPL(probe_kernel_write); + /** * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. * @dst: Destination address, in kernel space. This buffer must be at @@ -105,3 +142,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) return ret ? -EFAULT : src - unsafe_addr; } + +/** + * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user + * address. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @unsafe_addr: Unsafe user address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe user address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. + */ +long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, + long count) +{ + mm_segment_t old_fs = get_fs(); + long ret; + + if (unlikely(count <= 0)) + return 0; + + set_fs(USER_DS); + pagefault_disable(); + ret = strncpy_from_user(dst, unsafe_addr, count); + pagefault_enable(); + set_fs(old_fs); + + if (ret >= count) { + ret = count; + dst[ret - 1] = '\0'; + } else if (ret > 0) { + ret++; + } + + return ret; +} + +/** + * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL. + * @unsafe_addr: The string to measure. + * @count: Maximum count (including NUL) + * + * Get the size of a NUL-terminated string in user space without pagefault. + * + * Returns the size of the string INCLUDING the terminating NUL. + * + * If the string is too long, returns a number larger than @count. User + * has to check the return value against "> count". + * On exception (or invalid count), returns 0. + * + * Unlike strnlen_user, this can be used from IRQ handler etc. because + * it disables pagefaults. + */ +long strnlen_unsafe_user(const void __user *unsafe_addr, long count) +{ + mm_segment_t old_fs = get_fs(); + int ret; + + set_fs(USER_DS); + pagefault_disable(); + ret = strnlen_user(unsafe_addr, count); + pagefault_enable(); + set_fs(old_fs); + + return ret; +} -- cgit v1.2.3-59-g8ed1b From 87a90956eeab260a469a51897bfda27b28adf67d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2019 17:27:44 +0900 Subject: uaccess: Add a prototype of non-static __probe_user_read() Declare a prototype of non-static __probe_user_read() as same as __probe_kernel_read() at uaccess.h. 
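[ed. note: a minimal usage sketch for the non-pagefault helpers added above — not part of any patch in this series; the callback, buffer size, and message are hypothetical:

	#include <linux/uaccess.h>
	#include <linux/printk.h>

	/* Runs in IRQ/tracing context, where faulting in pages is not allowed. */
	static void sample_user_string(const char __user *uaddr)
	{
		char buf[64];
		long len;

		/* Copies at most sizeof(buf) bytes and never sleeps. */
		len = strncpy_from_unsafe_user(buf, uaddr, sizeof(buf));
		if (len < 0)
			return;	/* -EFAULT: page absent or bad pointer */
		pr_info("user string: %s\n", buf);	/* buf is NUL-terminated */
	}

probe_user_read() is used the same way for fixed-size data; both return -EFAULT instead of faulting in the page.]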
Reported-by: kbuild test robot Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/uaccess.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 9c435c3f2105..34a038563d97 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -252,6 +252,7 @@ extern long __probe_kernel_read(void *dst, const void *src, size_t size); * happens, handle that and return -EFAULT. */ extern long probe_user_read(void *dst, const void __user *src, size_t size); +extern long __probe_user_read(void *dst, const void __user *src, size_t size); /* * probe_kernel_write(): safely attempt to write to a location -- cgit v1.2.3-59-g8ed1b From f5151311c3f37f6edc85b2253ccf6d3e2a4c4c26 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 20 May 2019 19:32:14 +0800 Subject: dmaengine: Add matching device node validation in __dma_request_channel() When a user requests a DMA channel via __dma_request_channel(), nothing validates that the channel comes from the intended DMA device, which forces each DMA engine driver to validate the device node in its filter function where necessary. Thus we can add the matching device node validation in the DMA engine core and remove all of the device node validation from the drivers. Tested-by: Peter Ujfalusi Signed-off-by: Baolin Wang Signed-off-by: Vinod Koul --- drivers/dma/dmaengine.c | 10 ++++++++-- drivers/dma/of-dma.c | 4 ++-- include/linux/dmaengine.h | 12 ++++++++---- 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 3a11b1092e80..610080c629bb 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -641,11 +641,13 @@ EXPORT_SYMBOL_GPL(dma_get_any_slave_channel); * @mask: capabilities that the channel must satisfy * @fn: optional callback to disposition available channels * @fn_param: opaque parameter to pass to dma_filter_fn + * @np: device node to look for DMA channels * * Returns pointer to appropriate DMA channel on success or NULL.
*/ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, - dma_filter_fn fn, void *fn_param) + dma_filter_fn fn, void *fn_param, + struct device_node *np) { struct dma_device *device, *_d; struct dma_chan *chan = NULL; @@ -653,6 +655,10 @@ struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, /* Find a channel */ mutex_lock(&dma_list_mutex); list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { + /* Finds a DMA controller with matching device node */ + if (np && device->dev->of_node && np != device->dev->of_node) + continue; + chan = find_candidate(device, mask, fn, fn_param); if (!IS_ERR(chan)) break; @@ -769,7 +775,7 @@ struct dma_chan *dma_request_chan_by_mask(const dma_cap_mask_t *mask) if (!mask) return ERR_PTR(-ENODEV); - chan = __dma_request_channel(mask, NULL, NULL); + chan = __dma_request_channel(mask, NULL, NULL, NULL); if (!chan) { mutex_lock(&dma_list_mutex); if (list_empty(&dma_device_list)) diff --git a/drivers/dma/of-dma.c b/drivers/dma/of-dma.c index 91fd395c90c4..6b43d04da05d 100644 --- a/drivers/dma/of-dma.c +++ b/drivers/dma/of-dma.c @@ -316,8 +316,8 @@ struct dma_chan *of_dma_simple_xlate(struct of_phandle_args *dma_spec, if (count != 1) return NULL; - return dma_request_channel(info->dma_cap, info->filter_fn, - &dma_spec->args[0]); + return __dma_request_channel(&info->dma_cap, info->filter_fn, + &dma_spec->args[0], dma_spec->np); } EXPORT_SYMBOL_GPL(of_dma_simple_xlate); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index d49ec5c31944..504085b2bf21 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -1314,7 +1314,8 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); void dma_issue_pending_all(void); struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, - dma_filter_fn fn, void *fn_param); + dma_filter_fn fn, void *fn_param, + struct device_node *np); struct dma_chan *dma_request_slave_channel(struct device *dev, const char *name); struct dma_chan *dma_request_chan(struct device *dev, const char *name); @@ -1339,7 +1340,9 @@ static inline void dma_issue_pending_all(void) { } static inline struct dma_chan *__dma_request_channel(const dma_cap_mask_t *mask, - dma_filter_fn fn, void *fn_param) + dma_filter_fn fn, + void *fn_param, + struct device_node *np) { return NULL; } @@ -1411,7 +1414,8 @@ void dma_async_device_unregister(struct dma_device *device); void dma_run_dependencies(struct dma_async_tx_descriptor *tx); struct dma_chan *dma_get_slave_channel(struct dma_chan *chan); struct dma_chan *dma_get_any_slave_channel(struct dma_device *device); -#define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y) +#define dma_request_channel(mask, x, y) \ + __dma_request_channel(&(mask), x, y, NULL) #define dma_request_slave_channel_compat(mask, x, y, dev, name) \ __dma_request_slave_channel_compat(&(mask), x, y, dev, name) @@ -1429,6 +1433,6 @@ static inline struct dma_chan if (!fn || !fn_param) return NULL; - return __dma_request_channel(mask, fn, fn_param); + return __dma_request_channel(mask, fn, fn_param, NULL); } #endif /* DMAENGINE_H */ -- cgit v1.2.3-59-g8ed1b From 990c0b53bf6599a9c9c7df1529dde681dee6cf64 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 20 May 2019 19:32:16 +0800 Subject: dmaengine: imx-sdma: Let the core do the device node validation Let the DMA engine core do the device node validation instead of drivers. 
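[ed. note: existing users of the dma_request_channel() macro are unaffected by this series, since the macro now simply passes a NULL device node, which skips the new matching. A minimal consumer sketch (the capability choice is illustrative):

	#include <linux/dmaengine.h>

	static struct dma_chan *grab_memcpy_chan(void)
	{
		dma_cap_mask_t mask;

		dma_cap_zero(mask);
		dma_cap_set(DMA_MEMCPY, mask);

		/* expands to __dma_request_channel(&mask, NULL, NULL, NULL) */
		return dma_request_channel(mask, NULL, NULL);
	}

Only of_dma_simple_xlate() and drivers with their own xlate callback, such as the imx-sdma change below, pass a real device node.]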
Signed-off-by: Baolin Wang Signed-off-by: Vinod Koul --- drivers/dma/imx-sdma.c | 9 ++------- include/linux/platform_data/dma-imx.h | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 99d9f431ae2c..ca296f0849ef 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -1934,16 +1934,11 @@ disable_clk_ipg: static bool sdma_filter_fn(struct dma_chan *chan, void *fn_param) { struct sdma_channel *sdmac = to_sdma_chan(chan); - struct sdma_engine *sdma = sdmac->sdma; struct imx_dma_data *data = fn_param; if (!imx_dma_is_general_purpose(chan)) return false; - /* return false if it's not the right device */ - if (sdma->dev->of_node != data->of_node) - return false; - sdmac->data = *data; chan->private = &sdmac->data; @@ -1971,9 +1966,9 @@ static struct dma_chan *sdma_xlate(struct of_phandle_args *dma_spec, * be set to sdmac->event_id1. */ data.dma_request2 = 0; - data.of_node = ofdma->of_node; - return dma_request_channel(mask, sdma_filter_fn, &data); + return __dma_request_channel(&mask, sdma_filter_fn, &data, + ofdma->of_node); } static int sdma_probe(struct platform_device *pdev) diff --git a/include/linux/platform_data/dma-imx.h b/include/linux/platform_data/dma-imx.h index 9daea8d42a10..7d964e787299 100644 --- a/include/linux/platform_data/dma-imx.h +++ b/include/linux/platform_data/dma-imx.h @@ -55,7 +55,6 @@ struct imx_dma_data { int dma_request2; /* secondary DMA request line */ enum sdma_peripheral_type peripheral_type; int priority; - struct device_node *of_node; }; static inline int imx_dma_is_ipu(struct dma_chan *chan) -- cgit v1.2.3-59-g8ed1b From d27ac2e02bf256d4e824e7c1e1e1afa2b96cefcc Mon Sep 17 00:00:00 2001 From: Alexandru Ardelean Date: Mon, 27 May 2019 09:55:16 +0300 Subject: include: fpga: adi-axi-common.h: add common regs & defs header The AXI HDL cores provided for Analog Devices reference designs all share some common base registers (e.g. version register at address 0x00). To reduce duplication for this, a common header is added to define these registers as well as bitfields & macros to work with these registers. Signed-off-by: Alexandru Ardelean Signed-off-by: Vinod Koul --- include/linux/fpga/adi-axi-common.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/fpga/adi-axi-common.h (limited to 'include/linux') diff --git a/include/linux/fpga/adi-axi-common.h b/include/linux/fpga/adi-axi-common.h new file mode 100644 index 000000000000..7fc95d5c95bb --- /dev/null +++ b/include/linux/fpga/adi-axi-common.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Analog Devices AXI common registers & definitions + * + * Copyright 2019 Analog Devices Inc. + * + * https://wiki.analog.com/resources/fpga/docs/axi_ip + * https://wiki.analog.com/resources/fpga/docs/hdl/regmap + */ + +#ifndef ADI_AXI_COMMON_H_ +#define ADI_AXI_COMMON_H_ + +#define ADI_AXI_REG_VERSION 0x0000 + +#define ADI_AXI_PCORE_VER(major, minor, patch) \ + (((major) << 16) | ((minor) << 8) | (patch)) + +#endif /* ADI_AXI_COMMON_H_ */ -- cgit v1.2.3-59-g8ed1b From 153969fd952d81ab8f57574f9be1a90b0a0fa791 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 21 May 2019 03:38:25 +0200 Subject: ARM: versatile: Drop CLCD platform data The Versatile family no longer makes any use of the CLCD platform data, we have moved over all users to the DRM driver that has built-in handling of the displays. Delete the old auxdata. 
Signed-off-by: Linus Walleij --- arch/arm/mach-versatile/versatile_dt.c | 157 --------------------- include/linux/platform_data/video-clcd-versatile.h | 28 ---- 2 files changed, 185 deletions(-) delete mode 100644 include/linux/platform_data/video-clcd-versatile.h (limited to 'include/linux') diff --git a/arch/arm/mach-versatile/versatile_dt.c b/arch/arm/mach-versatile/versatile_dt.c index 028463af726d..b5ff1ea5a944 100644 --- a/arch/arm/mach-versatile/versatile_dt.c +++ b/arch/arm/mach-versatile/versatile_dt.c @@ -29,8 +29,6 @@ #include #include #include -#include -#include #include #include #include @@ -47,14 +45,12 @@ */ #define VERSATILE_SYS_PCICTL_OFFSET 0x44 #define VERSATILE_SYS_MCI_OFFSET 0x48 -#define VERSATILE_SYS_CLCD_OFFSET 0x50 /* * VERSATILE peripheral addresses */ #define VERSATILE_MMCI0_BASE 0x10005000 /* MMC interface */ #define VERSATILE_MMCI1_BASE 0x1000B000 /* MMC Interface */ -#define VERSATILE_CLCD_BASE 0x10120000 /* CLCD */ #define VERSATILE_SCTL_BASE 0x101E0000 /* System controller */ #define VERSATILE_IB2_BASE 0x24000000 /* IB2 module */ #define VERSATILE_IB2_CTL_BASE (VERSATILE_IB2_BASE + 0x03000000) @@ -96,158 +92,6 @@ static struct mmci_platform_data mmc1_plat_data = { .status = mmc_status, }; -/* - * CLCD support. - */ -#define SYS_CLCD_MODE_MASK (3 << 0) -#define SYS_CLCD_MODE_888 (0 << 0) -#define SYS_CLCD_MODE_5551 (1 << 0) -#define SYS_CLCD_MODE_565_RLSB (2 << 0) -#define SYS_CLCD_MODE_565_BLSB (3 << 0) -#define SYS_CLCD_NLCDIOON (1 << 2) -#define SYS_CLCD_VDDPOSSWITCH (1 << 3) -#define SYS_CLCD_PWR3V5SWITCH (1 << 4) -#define SYS_CLCD_ID_MASK (0x1f << 8) -#define SYS_CLCD_ID_SANYO_3_8 (0x00 << 8) -#define SYS_CLCD_ID_UNKNOWN_8_4 (0x01 << 8) -#define SYS_CLCD_ID_EPSON_2_2 (0x02 << 8) -#define SYS_CLCD_ID_SANYO_2_5 (0x07 << 8) -#define SYS_CLCD_ID_VGA (0x1f << 8) - -static bool is_sanyo_2_5_lcd; - -/* - * Disable all display connectors on the interface module. - */ -static void versatile_clcd_disable(struct clcd_fb *fb) -{ - void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET; - u32 val; - - val = readl(sys_clcd); - val &= ~SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH; - writel(val, sys_clcd); - - /* - * If the LCD is Sanyo 2x5 in on the IB2 board, turn the back-light off - */ - if (of_machine_is_compatible("arm,versatile-ab") && is_sanyo_2_5_lcd) { - unsigned long ctrl; - - ctrl = readl(versatile_ib2_ctrl); - ctrl &= ~0x01; - writel(ctrl, versatile_ib2_ctrl); - } -} - -/* - * Enable the relevant connector on the interface module. 
- */ -static void versatile_clcd_enable(struct clcd_fb *fb) -{ - struct fb_var_screeninfo *var = &fb->fb.var; - void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET; - u32 val; - - val = readl(sys_clcd); - val &= ~SYS_CLCD_MODE_MASK; - - switch (var->green.length) { - case 5: - val |= SYS_CLCD_MODE_5551; - break; - case 6: - if (var->red.offset == 0) - val |= SYS_CLCD_MODE_565_RLSB; - else - val |= SYS_CLCD_MODE_565_BLSB; - break; - case 8: - val |= SYS_CLCD_MODE_888; - break; - } - - /* - * Set the MUX - */ - writel(val, sys_clcd); - - /* - * And now enable the PSUs - */ - val |= SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH; - writel(val, sys_clcd); - - /* - * If the LCD is Sanyo 2x5 in on the IB2 board, turn the back-light on - */ - if (of_machine_is_compatible("arm,versatile-ab") && is_sanyo_2_5_lcd) { - unsigned long ctrl; - - ctrl = readl(versatile_ib2_ctrl); - ctrl |= 0x01; - writel(ctrl, versatile_ib2_ctrl); - } -} - -/* - * Detect which LCD panel is connected, and return the appropriate - * clcd_panel structure. Note: we do not have any information on - * the required timings for the 8.4in panel, so we presently assume - * VGA timings. - */ -static int versatile_clcd_setup(struct clcd_fb *fb) -{ - void __iomem *sys_clcd = versatile_sys_base + VERSATILE_SYS_CLCD_OFFSET; - const char *panel_name; - u32 val; - - is_sanyo_2_5_lcd = false; - - val = readl(sys_clcd) & SYS_CLCD_ID_MASK; - if (val == SYS_CLCD_ID_SANYO_3_8) - panel_name = "Sanyo TM38QV67A02A"; - else if (val == SYS_CLCD_ID_SANYO_2_5) { - panel_name = "Sanyo QVGA Portrait"; - is_sanyo_2_5_lcd = true; - } else if (val == SYS_CLCD_ID_EPSON_2_2) - panel_name = "Epson L2F50113T00"; - else if (val == SYS_CLCD_ID_VGA) - panel_name = "VGA"; - else { - printk(KERN_ERR "CLCD: unknown LCD panel ID 0x%08x, using VGA\n", - val); - panel_name = "VGA"; - } - - fb->panel = versatile_clcd_get_panel(panel_name); - if (!fb->panel) - return -EINVAL; - - return versatile_clcd_setup_dma(fb, SZ_1M); -} - -static void versatile_clcd_decode(struct clcd_fb *fb, struct clcd_regs *regs) -{ - clcdfb_decode(fb, regs); - - /* Always clear BGR for RGB565: we do the routing externally */ - if (fb->fb.var.green.length == 6) - regs->cntl &= ~CNTL_BGR; -} - -static struct clcd_board clcd_plat_data = { - .name = "Versatile", - .caps = CLCD_CAP_5551 | CLCD_CAP_565 | CLCD_CAP_888, - .check = clcdfb_check, - .decode = versatile_clcd_decode, - .disable = versatile_clcd_disable, - .enable = versatile_clcd_enable, - .setup = versatile_clcd_setup, - .mmap = versatile_clcd_mmap_dma, - .remove = versatile_clcd_remove_dma, -}; - /* * Lookup table for attaching a specific name and platform_data pointer to * devices as they get created by of_platform_populate(). 
Ideally this table @@ -257,7 +101,6 @@ static struct clcd_board clcd_plat_data = { struct of_dev_auxdata versatile_auxdata_lookup[] __initdata = { OF_DEV_AUXDATA("arm,primecell", VERSATILE_MMCI0_BASE, "fpga:05", &mmc0_plat_data), OF_DEV_AUXDATA("arm,primecell", VERSATILE_MMCI1_BASE, "fpga:0b", &mmc1_plat_data), - OF_DEV_AUXDATA("arm,primecell", VERSATILE_CLCD_BASE, "dev:20", &clcd_plat_data), {} }; diff --git a/include/linux/platform_data/video-clcd-versatile.h b/include/linux/platform_data/video-clcd-versatile.h deleted file mode 100644 index 305ebaec3afd..000000000000 --- a/include/linux/platform_data/video-clcd-versatile.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef PLAT_CLCD_H -#define PLAT_CLCD_H - -#ifdef CONFIG_PLAT_VERSATILE_CLCD -struct clcd_panel *versatile_clcd_get_panel(const char *); -int versatile_clcd_setup_dma(struct clcd_fb *, unsigned long); -int versatile_clcd_mmap_dma(struct clcd_fb *, struct vm_area_struct *); -void versatile_clcd_remove_dma(struct clcd_fb *); -#else -static inline struct clcd_panel *versatile_clcd_get_panel(const char *s) -{ - return NULL; -} -static inline int versatile_clcd_setup_dma(struct clcd_fb *fb, unsigned long framesize) -{ - return -ENODEV; -} -static inline int versatile_clcd_mmap_dma(struct clcd_fb *fb, struct vm_area_struct *vm) -{ - return -ENODEV; -} -static inline void versatile_clcd_remove_dma(struct clcd_fb *fb) -{ -} -#endif - -#endif -- cgit v1.2.3-59-g8ed1b From 2b2f7def058a5386838ef4dba70a860285f79e66 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 May 2019 04:51:53 -0700 Subject: bus: ti-sysc: Add support for missing clockdomain handling We need to let ti-sysc driver manage clockdomain autoidle for the duration of of reset, enable and idle. And we need to do it before we enable the clock and after we disable it. Currently we are still relying on platform callbacks indirectly managing clockdomain autoidle. But I noticed that for device tree only probed drivers it now happens only after we enabling the clocks and before we disable the clocks, while it should be the other way around. So far I have not noticed any issues with this though. Let's add new ti_sysc_clkdm_deny_idle() and ti_sysc_clkdm_allow_idle() functions for ti-sysc driver to use to manage clockdomains directly via platform data callbacks. Note that we can implement the clockdomain functions in pdata-quirks.c as for probing devices without "ti,hwmods" custom property we don't need to use the other platform data callbacks. Let's do this in one patch as there's is still an unlikely chance we may need to apply this as a fix for v5.2 for dropping legacy platform data for some devices. We also do have the option of adding back the platform data if needed in case of trouble. 
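[ed. note: the resulting call ordering, reduced to a sketch — the clockdomain is blocked from autoidle before the module clocks are touched and re-allowed only afterwards; error handling is omitted and names follow the patch below:

	/* in the ti-sysc runtime resume path */
	sysc_clkdm_deny_idle(ddata);	/* pdata->clkdm_deny_idle() */
	error = sysc_enable_main_clocks(ddata);
	if (!error)
		error = sysc_enable_module(ddata->dev);
	sysc_clkdm_allow_idle(ddata);	/* pdata->clkdm_allow_idle() */

The runtime suspend path brackets sysc_disable_module() and sysc_disable_main_clocks() the same way.]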
Tested-by: Keerthy Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap_hwmod.c | 39 ++--------- arch/arm/mach-omap2/pdata-quirks.c | 60 ++++++++++++++++ drivers/bus/ti-sysc.c | 127 +++++++++++++++++++++++++++------- include/linux/platform_data/ti-sysc.h | 8 +++ 4 files changed, 174 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-omap2/omap_hwmod.c b/arch/arm/mach-omap2/omap_hwmod.c index 405ac24def05..932ba221e8e7 100644 --- a/arch/arm/mach-omap2/omap_hwmod.c +++ b/arch/arm/mach-omap2/omap_hwmod.c @@ -3445,6 +3445,7 @@ static int omap_hwmod_check_module(struct device *dev, * @dev: struct device * @oh: module * @sysc_fields: sysc register bits + * @clockdomain: clockdomain * @rev_offs: revision register offset * @sysc_offs: sysconfig register offset * @syss_offs: sysstatus register offset @@ -3456,6 +3457,7 @@ static int omap_hwmod_check_module(struct device *dev, static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh, const struct ti_sysc_module_data *data, struct sysc_regbits *sysc_fields, + struct clockdomain *clkdm, s32 rev_offs, s32 sysc_offs, s32 syss_offs, u32 sysc_flags, u32 idlemodes) @@ -3463,8 +3465,6 @@ static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh, struct omap_hwmod_class_sysconfig *sysc; struct omap_hwmod_class *class = NULL; struct omap_hwmod_ocp_if *oi = NULL; - struct clockdomain *clkdm = NULL; - struct clk *clk = NULL; void __iomem *regs = NULL; unsigned long flags; @@ -3511,36 +3511,6 @@ static int omap_hwmod_allocate_module(struct device *dev, struct omap_hwmod *oh, oi->user = OCP_USER_MPU | OCP_USER_SDMA; } - if (!oh->_clk) { - struct clk_hw_omap *hwclk; - - clk = of_clk_get_by_name(dev->of_node, "fck"); - if (!IS_ERR(clk)) - clk_prepare(clk); - else - clk = NULL; - - /* - * Populate clockdomain based on dts clock. It is needed for - * clkdm_deny_idle() and clkdm_allow_idle() until we have have - * interconnect driver and reset driver capable of blocking - * clockdomain idle during reset, enable and idle. - */ - if (clk) { - hwclk = to_clk_hw_omap(__clk_get_hw(clk)); - if (hwclk && hwclk->clkdm_name) - clkdm = clkdm_lookup(hwclk->clkdm_name); - } - - /* - * Note that we assume interconnect driver manages the clocks - * and do not need to populate oh->_clk for dynamically - * allocated modules. 
- */ - clk_unprepare(clk); - clk_put(clk); - } - spin_lock_irqsave(&oh->_lock, flags); if (regs) oh->_mpu_rt_va = regs; @@ -3626,7 +3596,7 @@ int omap_hwmod_init_module(struct device *dev, u32 sysc_flags, idlemodes; int error; - if (!dev || !data) + if (!dev || !data || !data->name || !cookie) return -EINVAL; oh = _lookup(data->name); @@ -3697,7 +3667,8 @@ int omap_hwmod_init_module(struct device *dev, return error; return omap_hwmod_allocate_module(dev, oh, data, sysc_fields, - rev_offs, sysc_offs, syss_offs, + cookie->clkdm, rev_offs, + sysc_offs, syss_offs, sysc_flags, idlemodes); } diff --git a/arch/arm/mach-omap2/pdata-quirks.c b/arch/arm/mach-omap2/pdata-quirks.c index a2ecc5e69abb..b09cc4e8d240 100644 --- a/arch/arm/mach-omap2/pdata-quirks.c +++ b/arch/arm/mach-omap2/pdata-quirks.c @@ -29,6 +29,7 @@ #include #include +#include "clockdomain.h" #include "common.h" #include "common-board-devices.h" #include "control.h" @@ -463,6 +464,62 @@ static void __init dra7x_evm_mmc_quirk(void) } #endif +static struct clockdomain *ti_sysc_find_one_clockdomain(struct clk *clk) +{ + struct clockdomain *clkdm = NULL; + struct clk_hw_omap *hwclk; + + hwclk = to_clk_hw_omap(__clk_get_hw(clk)); + if (hwclk && hwclk->clkdm_name) + clkdm = clkdm_lookup(hwclk->clkdm_name); + + return clkdm; +} + +/** + * ti_sysc_clkdm_init - find clockdomain based on clock + * @fck: device functional clock + * @ick: device interface clock + * @dev: struct device + * + * Populate clockdomain based on clock. It is needed for + * clkdm_deny_idle() and clkdm_allow_idle() for blocking clockdomain + * clockdomain idle during reset, enable and idle. + * + * Note that we assume interconnect driver manages the clocks + * and do not need to populate oh->_clk for dynamically + * allocated modules. 
+ */ +static int ti_sysc_clkdm_init(struct device *dev, + struct clk *fck, struct clk *ick, + struct ti_sysc_cookie *cookie) +{ + if (fck) + cookie->clkdm = ti_sysc_find_one_clockdomain(fck); + if (cookie->clkdm) + return 0; + if (ick) + cookie->clkdm = ti_sysc_find_one_clockdomain(ick); + if (cookie->clkdm) + return 0; + + return -ENODEV; +} + +static void ti_sysc_clkdm_deny_idle(struct device *dev, + const struct ti_sysc_cookie *cookie) +{ + if (cookie->clkdm) + clkdm_deny_idle(cookie->clkdm); +} + +static void ti_sysc_clkdm_allow_idle(struct device *dev, + const struct ti_sysc_cookie *cookie) +{ + if (cookie->clkdm) + clkdm_allow_idle(cookie->clkdm); +} + static int ti_sysc_enable_module(struct device *dev, const struct ti_sysc_cookie *cookie) { @@ -494,6 +551,9 @@ static struct of_dev_auxdata omap_auxdata_lookup[]; static struct ti_sysc_platform_data ti_sysc_pdata = { .auxdata = omap_auxdata_lookup, + .init_clockdomain = ti_sysc_clkdm_init, + .clkdm_deny_idle = ti_sysc_clkdm_deny_idle, + .clkdm_allow_idle = ti_sysc_clkdm_allow_idle, .init_module = omap_hwmod_init_module, .enable_module = ti_sysc_enable_module, .idle_module = ti_sysc_idle_module, diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index b72741668c92..e86f7850206a 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -422,6 +422,30 @@ static void sysc_disable_opt_clocks(struct sysc *ddata) } } +static void sysc_clkdm_deny_idle(struct sysc *ddata) +{ + struct ti_sysc_platform_data *pdata; + + if (ddata->legacy_mode) + return; + + pdata = dev_get_platdata(ddata->dev); + if (pdata && pdata->clkdm_deny_idle) + pdata->clkdm_deny_idle(ddata->dev, &ddata->cookie); +} + +static void sysc_clkdm_allow_idle(struct sysc *ddata) +{ + struct ti_sysc_platform_data *pdata; + + if (ddata->legacy_mode) + return; + + pdata = dev_get_platdata(ddata->dev); + if (pdata && pdata->clkdm_allow_idle) + pdata->clkdm_allow_idle(ddata->dev, &ddata->cookie); +} + /** * sysc_init_resets - init rstctrl reset line if configured * @ddata: device driver data @@ -795,6 +819,7 @@ static void sysc_show_registers(struct sysc *ddata) #define SYSC_IDLE_MASK (SYSC_NR_IDLEMODES - 1) +/* Caller needs to manage sysc_clkdm_deny_idle() and sysc_clkdm_allow_idle() */ static int sysc_enable_module(struct device *dev) { struct sysc *ddata; @@ -805,11 +830,6 @@ static int sysc_enable_module(struct device *dev) if (ddata->offsets[SYSC_SYSCONFIG] == -ENODEV) return 0; - /* - * TODO: Need to prevent clockdomain autoidle? - * See clkdm_deny_idle() in arch/mach-omap2/omap_hwmod.c - */ - regbits = ddata->cap->regbits; reg = sysc_read(ddata, ddata->offsets[SYSC_SYSCONFIG]); @@ -861,6 +881,7 @@ static int sysc_best_idle_mode(u32 idlemodes, u32 *best_mode) return 0; } +/* Caller needs to manage sysc_clkdm_deny_idle() and sysc_clkdm_allow_idle() */ static int sysc_disable_module(struct device *dev) { struct sysc *ddata; @@ -872,11 +893,6 @@ static int sysc_disable_module(struct device *dev) if (ddata->offsets[SYSC_SYSCONFIG] == -ENODEV) return 0; - /* - * TODO: Need to prevent clockdomain autoidle? 
- * See clkdm_deny_idle() in arch/mach-omap2/omap_hwmod.c - */ - regbits = ddata->cap->regbits; reg = sysc_read(ddata, ddata->offsets[SYSC_SYSCONFIG]); @@ -966,14 +982,16 @@ static int __maybe_unused sysc_runtime_suspend(struct device *dev) if (!ddata->enabled) return 0; + sysc_clkdm_deny_idle(ddata); + if (ddata->legacy_mode) { error = sysc_runtime_suspend_legacy(dev, ddata); if (error) - return error; + goto err_allow_idle; } else { error = sysc_disable_module(dev); if (error) - return error; + goto err_allow_idle; } sysc_disable_main_clocks(ddata); @@ -983,6 +1001,9 @@ static int __maybe_unused sysc_runtime_suspend(struct device *dev) ddata->enabled = false; +err_allow_idle: + sysc_clkdm_allow_idle(ddata); + return error; } @@ -996,10 +1017,12 @@ static int __maybe_unused sysc_runtime_resume(struct device *dev) if (ddata->enabled) return 0; + sysc_clkdm_deny_idle(ddata); + if (sysc_opt_clks_needed(ddata)) { error = sysc_enable_opt_clocks(ddata); if (error) - return error; + goto err_allow_idle; } error = sysc_enable_main_clocks(ddata); @@ -1018,6 +1041,8 @@ static int __maybe_unused sysc_runtime_resume(struct device *dev) ddata->enabled = true; + sysc_clkdm_allow_idle(ddata); + return 0; err_main_clocks: @@ -1025,6 +1050,8 @@ err_main_clocks: err_opt_clocks: if (sysc_opt_clks_needed(ddata)) sysc_disable_opt_clocks(ddata); +err_allow_idle: + sysc_clkdm_allow_idle(ddata); return error; } @@ -1245,6 +1272,33 @@ static void sysc_init_revision_quirks(struct sysc *ddata) } } +static int sysc_clockdomain_init(struct sysc *ddata) +{ + struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev); + struct clk *fck = NULL, *ick = NULL; + int error; + + if (!pdata || !pdata->init_clockdomain) + return 0; + + switch (ddata->nr_clocks) { + case 2: + ick = ddata->clocks[SYSC_ICK]; + /* fallthrough */ + case 1: + fck = ddata->clocks[SYSC_FCK]; + break; + case 0: + return 0; + } + + error = pdata->init_clockdomain(ddata->dev, fck, ick, &ddata->cookie); + if (!error || error == -ENODEV) + return 0; + + return error; +} + /* * Note that pdata->init_module() typically does a reset first. 
After * pdata->init_module() is done, PM runtime can be used for the interconnect @@ -1255,7 +1309,7 @@ static int sysc_legacy_init(struct sysc *ddata) struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev); int error; - if (!ddata->legacy_mode || !pdata || !pdata->init_module) + if (!pdata || !pdata->init_module) return 0; error = pdata->init_module(ddata->dev, ddata->mdata, &ddata->cookie); @@ -1347,7 +1401,13 @@ static int sysc_init_module(struct sysc *ddata) (SYSC_QUIRK_NO_IDLE | SYSC_QUIRK_NO_IDLE_ON_INIT)) manage_clocks = false; + error = sysc_clockdomain_init(ddata); + if (error) + return error; + if (manage_clocks) { + sysc_clkdm_deny_idle(ddata); + error = sysc_enable_opt_clocks(ddata); if (error) return error; @@ -1360,20 +1420,33 @@ static int sysc_init_module(struct sysc *ddata) ddata->revision = sysc_read_revision(ddata); sysc_init_revision_quirks(ddata); - error = sysc_legacy_init(ddata); - if (error) - goto err_main_clocks; + if (ddata->legacy_mode) { + error = sysc_legacy_init(ddata); + if (error) + goto err_main_clocks; + } + + if (!ddata->legacy_mode && manage_clocks) { + error = sysc_enable_module(ddata->dev); + if (error) + goto err_main_clocks; + } error = sysc_reset(ddata); if (error) dev_err(ddata->dev, "Reset failed with %d\n", error); + if (!ddata->legacy_mode && manage_clocks) + sysc_disable_module(ddata->dev); + err_main_clocks: if (manage_clocks) sysc_disable_main_clocks(ddata); err_opt_clocks: - if (manage_clocks) + if (manage_clocks) { sysc_disable_opt_clocks(ddata); + sysc_clkdm_allow_idle(ddata); + } return error; } @@ -2012,20 +2085,22 @@ static int sysc_init_pdata(struct sysc *ddata) struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev); struct ti_sysc_module_data *mdata; - if (!pdata || !ddata->legacy_mode) + if (!pdata) return 0; mdata = devm_kzalloc(ddata->dev, sizeof(*mdata), GFP_KERNEL); if (!mdata) return -ENOMEM; - mdata->name = ddata->legacy_mode; - mdata->module_pa = ddata->module_pa; - mdata->module_size = ddata->module_size; - mdata->offsets = ddata->offsets; - mdata->nr_offsets = SYSC_MAX_REGS; - mdata->cap = ddata->cap; - mdata->cfg = &ddata->cfg; + if (ddata->legacy_mode) { + mdata->name = ddata->legacy_mode; + mdata->module_pa = ddata->module_pa; + mdata->module_size = ddata->module_size; + mdata->offsets = ddata->offsets; + mdata->nr_offsets = SYSC_MAX_REGS; + mdata->cap = ddata->cap; + mdata->cfg = &ddata->cfg; + } ddata->mdata = mdata; diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 9256c0305968..6626fd31e309 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -19,6 +19,7 @@ enum ti_sysc_module_type { struct ti_sysc_cookie { void *data; + void *clkdm; }; /** @@ -125,9 +126,16 @@ struct ti_sysc_module_data { }; struct device; +struct clk; struct ti_sysc_platform_data { struct of_dev_auxdata *auxdata; + int (*init_clockdomain)(struct device *dev, struct clk *fck, + struct clk *ick, struct ti_sysc_cookie *cookie); + void (*clkdm_deny_idle)(struct device *dev, + const struct ti_sysc_cookie *cookie); + void (*clkdm_allow_idle)(struct device *dev, + const struct ti_sysc_cookie *cookie); int (*init_module)(struct device *dev, const struct ti_sysc_module_data *data, struct ti_sysc_cookie *cookie); -- cgit v1.2.3-59-g8ed1b From e0db94fe87dacd72be0699adcc29e321db7f1689 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 May 2019 04:51:53 -0700 Subject: bus: ti-sysc: Make OCP reset work for sysstatus and sysconfig reset bits 
We've had minimal OCP softreset support in ti-sysc interconnect target module driver only used for MCAN driver so far. But it turns out that MCAN has the sysstatus register resetdone bit inverted compared to most other modules. Let's make OCP softreset work for other typical cases with reset status in sysstatus or sysconfig register so we can use the new functions for sysc_enable_module() and sysc_disable_module() without "ti,hwmods" property in the following patches. Tested-by: Keerthy Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 72 ++++++++++++++++++++++++++--------- include/linux/platform_data/ti-sysc.h | 1 + 2 files changed, 55 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index f00997eea207..f4a048430cd1 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -153,6 +153,26 @@ static u32 sysc_read_revision(struct sysc *ddata) return sysc_read(ddata, offset); } +static u32 sysc_read_sysconfig(struct sysc *ddata) +{ + int offset = ddata->offsets[SYSC_SYSCONFIG]; + + if (offset < 0) + return 0; + + return sysc_read(ddata, offset); +} + +static u32 sysc_read_sysstatus(struct sysc *ddata) +{ + int offset = ddata->offsets[SYSC_SYSSTATUS]; + + if (offset < 0) + return 0; + + return sysc_read(ddata, offset); +} + static int sysc_add_named_clock_from_child(struct sysc *ddata, const char *name, const char *optfck_name) @@ -1369,34 +1389,49 @@ static int sysc_rstctrl_reset_deassert(struct sysc *ddata, bool reset) return reset_control_deassert(ddata->rsts); } +/* + * Note that the caller must ensure the interconnect target module is enabled + * before calling reset. Otherwise reset will not complete. + */ static int sysc_reset(struct sysc *ddata) { - int offset = ddata->offsets[SYSC_SYSCONFIG]; - int val; + int sysc_offset, syss_offset, sysc_val, rstval, quirks, error = 0; + u32 sysc_mask, syss_done; + + sysc_offset = ddata->offsets[SYSC_SYSCONFIG]; + syss_offset = ddata->offsets[SYSC_SYSSTATUS]; + quirks = ddata->cfg.quirks; - if (ddata->legacy_mode || offset < 0 || + if (ddata->legacy_mode || sysc_offset < 0 || + ddata->cap->regbits->srst_shift < 0 || ddata->cfg.quirks & SYSC_QUIRK_NO_RESET_ON_INIT) return 0; - /* - * Currently only support reset status in sysstatus. - * Warn and return error in all other cases - */ - if (!ddata->cfg.syss_mask) { - dev_err(ddata->dev, "No ti,syss-mask. 
Reset failed\n"); - return -EINVAL; - } + sysc_mask = BIT(ddata->cap->regbits->srst_shift); - val = sysc_read(ddata, offset); - val |= (0x1 << ddata->cap->regbits->srst_shift); - sysc_write(ddata, offset, val); + if (ddata->cfg.quirks & SYSS_QUIRK_RESETDONE_INVERTED) + syss_done = 0; + else + syss_done = ddata->cfg.syss_mask; + + sysc_val = sysc_read_sysconfig(ddata); + sysc_val |= sysc_mask; + sysc_write(ddata, sysc_offset, sysc_val); /* Poll on reset status */ - offset = ddata->offsets[SYSC_SYSSTATUS]; + if (syss_offset >= 0) { + error = readx_poll_timeout(sysc_read_sysstatus, ddata, rstval, + (rstval & ddata->cfg.syss_mask) == + syss_done, + 100, MAX_MODULE_SOFTRESET_WAIT); + + } else if (ddata->cfg.quirks & SYSC_QUIRK_RESET_STATUS) { + error = readx_poll_timeout(sysc_read_sysconfig, ddata, rstval, + !(rstval & sysc_mask), + 100, MAX_MODULE_SOFTRESET_WAIT); + } - return readl_poll_timeout(ddata->module_va + offset, val, - (val & ddata->cfg.syss_mask) == 0x0, - 100, MAX_MODULE_SOFTRESET_WAIT); + return error; } /* @@ -2099,6 +2134,7 @@ static const struct sysc_capabilities sysc_dra7_mcan = { .type = TI_SYSC_DRA7_MCAN, .sysc_mask = SYSC_DRA7_MCAN_ENAWAKEUP | SYSC_OMAP4_SOFTRESET, .regbits = &sysc_regbits_dra7_mcan, + .mod_quirks = SYSS_QUIRK_RESETDONE_INVERTED, }; static int sysc_init_pdata(struct sysc *ddata) diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 6626fd31e309..8822e99ff813 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -47,6 +47,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSS_QUIRK_RESETDONE_INVERTED BIT(14) #define SYSC_QUIRK_SWSUP_MSTANDBY BIT(13) #define SYSC_QUIRK_SWSUP_SIDLE_ACT BIT(12) #define SYSC_QUIRK_SWSUP_SIDLE BIT(11) -- cgit v1.2.3-59-g8ed1b From 229b4e0728e0a6ddca2645e73696d5b104fbbbfb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 14 May 2019 22:47:24 +0800 Subject: Documentation: PCI: convert pci.txt to reST Convert plain text documentation to reStructuredText format and add it to Sphinx TOC tree. No essential content change. Move the description of struct pci_driver and struct pci_device_id into in-source comments. Signed-off-by: Changbin Du [bhelgaas: fix kernel-doc warnings related to moving descriptions to linux/pci.h, fix "space tab" whitespace errors in mod_devicetable.h] Signed-off-by: Bjorn Helgaas Reviewed-by: Mauro Carvalho Chehab --- Documentation/PCI/index.rst | 2 + Documentation/PCI/pci.rst | 578 ++++++++++++++++++++++++++++++++++++ Documentation/PCI/pci.txt | 636 ---------------------------------------- include/linux/mod_devicetable.h | 29 +- include/linux/pci.h | 48 ++- 5 files changed, 651 insertions(+), 642 deletions(-) create mode 100644 Documentation/PCI/pci.rst delete mode 100644 Documentation/PCI/pci.txt (limited to 'include/linux') diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst index c2f8728d11cf..7babf43709b0 100644 --- a/Documentation/PCI/index.rst +++ b/Documentation/PCI/index.rst @@ -7,3 +7,5 @@ Linux PCI Bus Subsystem .. toctree:: :maxdepth: 2 :numbered: + + pci diff --git a/Documentation/PCI/pci.rst b/Documentation/PCI/pci.rst new file mode 100644 index 000000000000..6864f9a70f5f --- /dev/null +++ b/Documentation/PCI/pci.rst @@ -0,0 +1,578 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================== +How To Write Linux PCI Drivers +============================== + +:Authors: - Martin Mares + - Grant Grundler + +The world of PCI is vast and full of (mostly unpleasant) surprises. 
+Since each CPU architecture implements different chip-sets and PCI devices +have different requirements (erm, "features"), the result is the PCI support +in the Linux kernel is not as trivial as one would wish. This short paper +tries to introduce all potential driver authors to Linux APIs for +PCI device drivers. + +A more complete resource is the third edition of "Linux Device Drivers" +by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman. +LDD3 is available for free (under Creative Commons License) from: +http://lwn.net/Kernel/LDD3/. + +However, keep in mind that all documents are subject to "bit rot". +Refer to the source code if things are not working as described here. + +Please send questions/comments/patches about Linux PCI API to the +"Linux PCI" mailing list. + + +Structure of PCI drivers +======================== +PCI drivers "discover" PCI devices in a system via pci_register_driver(). +Actually, it's the other way around. When the PCI generic code discovers +a new device, the driver with a matching "description" will be notified. +Details on this below. + +pci_register_driver() leaves most of the probing for devices to +the PCI layer and supports online insertion/removal of devices [thus +supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver]. +pci_register_driver() call requires passing in a table of function +pointers and thus dictates the high level structure of a driver. + +Once the driver knows about a PCI device and takes ownership, the +driver generally needs to perform the following initialization: + + - Enable the device + - Request MMIO/IOP resources + - Set the DMA mask size (for both coherent and streaming DMA) + - Allocate and initialize shared control data (pci_allocate_coherent()) + - Access device configuration space (if needed) + - Register IRQ handler (request_irq()) + - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip) + - Enable DMA/processing engines + +When done using the device, and perhaps the module needs to be unloaded, +the driver needs to take the follow steps: + + - Disable the device from generating IRQs + - Release the IRQ (free_irq()) + - Stop all DMA activity + - Release DMA buffers (both streaming and coherent) + - Unregister from other subsystems (e.g. scsi or netdev) + - Release MMIO/IOP resources + - Disable the device + +Most of these topics are covered in the following sections. +For the rest look at LDD3 or . + +If the PCI subsystem is not configured (CONFIG_PCI is not set), most of +the PCI functions described below are defined as inline functions either +completely empty or just returning an appropriate error codes to avoid +lots of ifdefs in the drivers. + + +pci_register_driver() call +========================== + +PCI device drivers call ``pci_register_driver()`` during their +initialization with a pointer to a structure describing the driver +(``struct pci_driver``): + +.. kernel-doc:: include/linux/pci.h + :functions: pci_driver + +The ID table is an array of ``struct pci_device_id`` entries ending with an +all-zero entry. Definitions with static const are generally preferred. + +.. kernel-doc:: include/linux/mod_devicetable.h + :functions: pci_device_id + +Most drivers only need ``PCI_DEVICE()`` or ``PCI_DEVICE_CLASS()`` to set up +a pci_device_id table. 
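+For example, a hypothetical driver claiming a single vendor/device
+pair (the IDs below are made up) might declare::
+
+	static const struct pci_device_id foo_ids[] = {
+		{ PCI_DEVICE(0x1234, 0x5678) },
+		{ 0, }
+	};
+	MODULE_DEVICE_TABLE(pci, foo_ids);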
+
+New PCI IDs may be added to a device driver pci_ids table at runtime
+as shown below::
+
+  echo "vendor device subvendor subdevice class class_mask driver_data" > \
+  /sys/bus/pci/drivers/{driver}/new_id
+
+All fields are passed in as hexadecimal values (no leading 0x).
+The vendor and device fields are mandatory, the others are optional. Users
+need to pass only as many optional fields as necessary:
+
+  - subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
+  - class and classmask fields default to 0
+  - driver_data defaults to 0UL.
+
+Note that driver_data must match the value used by any of the pci_device_id
+entries defined in the driver. This makes the driver_data field mandatory
+if all the pci_device_id entries have a non-zero driver_data value.
+
+Once added, the driver probe routine will be invoked for any unclaimed
+PCI devices listed in its (newly updated) pci_ids list.
+
+When the driver exits, it just calls pci_unregister_driver() and the PCI layer
+automatically calls the remove hook for all devices handled by the driver.
+
+
+"Attributes" for driver functions/data
+--------------------------------------
+
+Please mark the initialization and cleanup functions where appropriate
+(the corresponding macros are defined in <linux/init.h>):
+
+	====== =================================================
+	__init Initialization code. Thrown away after the driver
+	       initializes.
+	__exit Exit code. Ignored for non-modular drivers.
+	====== =================================================
+
+Tips on when/where to use the above attributes:
+  - The module_init()/module_exit() functions (and all
+    initialization functions called _only_ from these)
+    should be marked __init/__exit.
+
+  - Do not mark the struct pci_driver.
+
+  - Do NOT mark a function if you are not sure which mark to use.
+    Better to not mark the function than mark the function wrong.
+
+
+How to find PCI devices manually
+================================
+
+PCI drivers should have a really good reason for not using the
+pci_register_driver() interface to search for PCI devices.
+The main reason PCI devices are controlled by multiple drivers
+is that one PCI device implements several different HW services.
+E.g. combined serial/parallel port/floppy controller.
+
+A manual search may be performed using the following constructs:
+
+Searching by vendor and device ID::
+
+	struct pci_dev *dev = NULL;
+	while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev))
+		configure_device(dev);
+
+Searching by class ID (iterate in a similar way)::
+
+	pci_get_class(CLASS_ID, dev)
+
+Searching by both vendor/device and subsystem vendor/device ID::
+
+	pci_get_subsys(VENDOR_ID, DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev)
+
+You can use the constant PCI_ANY_ID as a wildcard replacement for
+VENDOR_ID or DEVICE_ID. This allows searching for any device from a
+specific vendor, for example.
+
+These functions are hotplug-safe. They increment the reference count on
+the pci_dev that they return. You must eventually (possibly at module unload)
+decrement the reference count on these devices by calling pci_dev_put().
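+
+To make the reference counting explicit, here is a hedged sketch of the
+same loop (``configure_device()`` and the IDs are placeholders)::
+
+	struct pci_dev *dev = NULL;
+
+	while ((dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev)) != NULL) {
+		configure_device(dev);	/* dev was returned with a reference held */
+	}
+	/* pci_get_device() drops the reference on the "from" argument on each
+	 * iteration, so only an early break out of the loop needs an explicit
+	 * pci_dev_put(dev). */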
+
+
+Device Initialization Steps
+===========================
+
+As noted in the introduction, most PCI drivers need the following steps
+for device initialization:
+
+  - Enable the device
+  - Request MMIO/IOP resources
+  - Set the DMA mask size (for both coherent and streaming DMA)
+  - Allocate and initialize shared control data (pci_allocate_coherent())
+  - Access device configuration space (if needed)
+  - Register IRQ handler (request_irq())
+  - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
+  - Enable DMA/processing engines.
+
+The driver can access PCI config space registers at any time.
+(Well, almost. When running BIST, config space can go away...but
+that will just result in a PCI Bus Master Abort and config reads
+will return garbage).
+
+
+Enable the PCI device
+---------------------
+Before touching any device registers, the driver needs to enable
+the PCI device by calling pci_enable_device(). This will:
+
+  - wake up the device if it was in suspended state,
+  - allocate I/O and memory regions of the device (if BIOS did not),
+  - allocate an IRQ (if BIOS did not).
+
+.. note::
+   pci_enable_device() can fail! Check the return value.
+
+.. warning::
+   OS BUG: we don't check resource allocations before enabling those
+   resources. The sequence would make more sense if we called
+   pci_request_resources() before calling pci_enable_device().
+   Currently, the device drivers can't detect the bug when two
+   devices have been allocated the same range. This is not a common
+   problem and unlikely to get fixed soon.
+
+   This has been discussed before but not changed as of 2.6.19:
+   http://lkml.org/lkml/2006/3/2/194
+
+
+pci_set_master() will enable DMA by setting the bus master bit
+in the PCI_COMMAND register. It also fixes the latency timer value if
+it's set to something bogus by the BIOS. pci_clear_master() will
+disable DMA by clearing the bus master bit.
+
+If the PCI device can use the PCI Memory-Write-Invalidate transaction,
+call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval
+and also ensures that the cache line size register is set correctly.
+Check the return value of pci_set_mwi() as not all architectures
+or chip-sets may support Memory-Write-Invalidate. Alternatively,
+if Mem-Wr-Inval would be nice to have but is not required, call
+pci_try_set_mwi() to have the system do its best effort at enabling
+Mem-Wr-Inval.
+
+
+Request MMIO/IOP resources
+--------------------------
+Memory (MMIO) and I/O port addresses should NOT be read directly
+from the PCI device config space. Use the values in the pci_dev structure
+as the PCI "bus address" might have been remapped to a "host physical"
+address by the arch/chip-set specific kernel support.
+
+See Documentation/io-mapping.txt for how to access device registers
+or device memory.
+
+The device driver needs to call pci_request_region() to verify
+no other device is already using the same address resource.
+Conversely, drivers should call pci_release_region() AFTER
+calling pci_disable_device().
+The idea is to prevent two devices colliding on the same address range.
+
+.. tip::
+   See OS BUG comment above. Currently (2.6.19), the driver can only
+   determine MMIO and IO Port resource availability _after_ calling
+   pci_enable_device().
+
+Generic flavors of pci_request_region() are request_mem_region()
+(for MMIO ranges) and request_region() (for IO Port ranges).
+Use these for address resources that are not described by "normal" PCI
+BARs.
+
+Also see pci_request_selected_regions() below.
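+
+Putting the above together, a hedged probe() sketch (error handling only;
+the "foo" names and the use of BAR 0 are assumptions for illustration)::
+
+	static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+	{
+		void __iomem *regs;
+		int err;
+
+		err = pci_enable_device(pdev);
+		if (err)
+			return err;
+
+		err = pci_request_region(pdev, 0, "foo");	/* claim BAR 0 */
+		if (err)
+			goto err_disable;
+
+		regs = pci_iomap(pdev, 0, 0);	/* map all of BAR 0 */
+		if (!regs) {
+			err = -ENOMEM;
+			goto err_release;
+		}
+
+		/* device-specific setup would continue here */
+		return 0;
+
+	err_release:
+		pci_release_region(pdev, 0);
+	err_disable:
+		pci_disable_device(pdev);
+		return err;
+	}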
+
+
+Set the DMA mask size
+---------------------
+.. note::
+   If anything below doesn't make sense, please refer to
+   Documentation/DMA-API.txt. This section is just a reminder that
+   drivers need to indicate DMA capabilities of the device and is not
+   an authoritative source for DMA interfaces.
+
+While all drivers should explicitly indicate the DMA capability
+(e.g. 32 or 64 bit) of the PCI bus master, devices with more than
+32-bit bus master capability for streaming data need the driver
+to "register" this capability by calling pci_set_dma_mask() with
+appropriate parameters. In general this allows more efficient DMA
+on systems where System RAM exists above 4G _physical_ address.
+
+Drivers for all PCI-X and PCIe compliant devices must call
+pci_set_dma_mask() as they are 64-bit DMA devices.
+
+Similarly, drivers must also "register" this capability if the device
+can directly address "consistent memory" in System RAM above 4G physical
+address by calling pci_set_consistent_dma_mask().
+Again, this includes drivers for all PCI-X and PCIe compliant devices.
+Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are
+64-bit DMA capable for payload ("streaming") data but not control
+("consistent") data.
+
+
+Setup shared control data
+-------------------------
+Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared)
+memory. See Documentation/DMA-API.txt for a full description of
+the DMA APIs. This section is just a reminder that it needs to be done
+before enabling DMA on the device.
+
+
+Initialize device registers
+---------------------------
+Some drivers will need specific "capability" fields programmed
+or other "vendor specific" registers initialized or reset.
+E.g. clearing pending interrupts.
+
+
+Register IRQ handler
+--------------------
+While calling request_irq() is the last step described here,
+this is often just another intermediate step to initialize a device.
+This step can often be deferred until the device is opened for use.
+
+All interrupt handlers for IRQ lines should be registered with IRQF_SHARED
+and use the devid to map IRQs to devices (remember that all PCI IRQ lines
+can be shared).
+
+request_irq() will associate an interrupt handler and device handle
+with an interrupt number. Historically interrupt numbers represent
+IRQ lines which run from the PCI device to the Interrupt controller.
+With MSI and MSI-X (more below) the interrupt number is a CPU "vector".
+
+request_irq() also enables the interrupt. Make sure the device is
+quiesced and does not have any interrupts pending before registering
+the interrupt handler.
+
+MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts"
+which deliver interrupts to the CPU via a DMA write to a Local APIC.
+The fundamental difference between MSI and MSI-X is how multiple
+"vectors" get allocated. MSI requires contiguous blocks of vectors
+while MSI-X can allocate several individual ones.
+
+MSI capability can be enabled by calling pci_alloc_irq_vectors() with the
+PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This
+causes the PCI support to program CPU vector data into the PCI device
+capability registers. Many architectures, chip-sets, or BIOSes do NOT
+support MSI or MSI-X and a call to pci_alloc_irq_vectors with just
+the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always
+specify PCI_IRQ_LEGACY as well.
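+
+As a hedged sketch of that allocation (the handler name, cookie, and the
+single-vector request are illustrative assumptions)::
+
+	int nvec, err;
+
+	nvec = pci_alloc_irq_vectors(pdev, 1, 1,
+				     PCI_IRQ_MSIX | PCI_IRQ_MSI | PCI_IRQ_LEGACY);
+	if (nvec < 0)
+		return nvec;
+
+	err = request_irq(pci_irq_vector(pdev, 0), foo_handler,
+			  IRQF_SHARED, "foo", foo_dev);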
+
+Drivers that have different interrupt handlers for MSI/MSI-X and
+legacy INTx should choose the right one based on the msi_enabled
+and msix_enabled flags in the pci_dev structure after calling
+pci_alloc_irq_vectors.
+
+There are (at least) two really good reasons for using MSI:
+
+1) MSI is an exclusive interrupt vector by definition.
+   This means the interrupt handler doesn't have to verify
+   its device caused the interrupt.
+
+2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed
+   to be visible to the host CPU(s) when the MSI is delivered. This
+   is important for both data coherency and avoiding stale control data.
+   This guarantee allows the driver to omit MMIO reads to flush
+   the DMA stream.
+
+See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples
+of MSI/MSI-X usage.
+
+
+PCI device shutdown
+===================
+
+When a PCI device driver is being unloaded, most of the following
+steps need to be performed:
+
+  - Disable the device from generating IRQs
+  - Release the IRQ (free_irq())
+  - Stop all DMA activity
+  - Release DMA buffers (both streaming and consistent)
+  - Unregister from other subsystems (e.g. scsi or netdev)
+  - Disable device from responding to MMIO/IO Port addresses
+  - Release MMIO/IO Port resource(s)
+
+
+Stop IRQs on the device
+-----------------------
+How to do this is chip/device specific. If it's not done, it opens
+the possibility of a "screaming interrupt" if (and only if)
+the IRQ is shared with another device.
+
+When the shared IRQ handler is "unhooked", the remaining devices
+using the same IRQ line will still need the IRQ enabled. Thus if the
+"unhooked" device asserts the IRQ line, the system will respond assuming
+it was one of the remaining devices that asserted the IRQ line. Since none
+of the other devices will handle the IRQ, the system will "hang" until
+it decides the IRQ isn't going to get handled and masks the IRQ (100,000
+iterations later). Once the shared IRQ is masked, the remaining devices
+will stop functioning properly. Not a nice situation.
+
+This is another reason to use MSI or MSI-X if it's available.
+MSI and MSI-X are defined to be exclusive interrupts and thus
+are not susceptible to the "screaming interrupt" problem.
+
+
+Release the IRQ
+---------------
+Once the device is quiesced (no more IRQs), one can call free_irq().
+This function will return control once any pending IRQs are handled,
+"unhook" the driver's IRQ handler from that IRQ, and finally release
+the IRQ if no one else is using it.
+
+
+Stop all DMA activity
+---------------------
+It's extremely important to stop all DMA operations BEFORE attempting
+to deallocate DMA control data. Failure to do so can result in memory
+corruption, hangs, and on some chip-sets a hard crash.
+
+Stopping DMA after stopping the IRQs can avoid races where the
+IRQ handler might restart DMA engines.
+
+While this step sounds obvious and trivial, several "mature" drivers
+didn't get this step right in the past.
+
+
+Release DMA buffers
+-------------------
+Once DMA is stopped, clean up streaming DMA first.
+I.e. unmap data buffers and return buffers to their "upstream"
+owner, if there is one.
+
+Then clean up "consistent" buffers which contain the control data.
+
+See Documentation/DMA-API.txt for details on unmapping interfaces.
+
+
+Unregister from other subsystems
+--------------------------------
+Most low level PCI device drivers support some other subsystem
+like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
+driver isn't losing resources from that other subsystem.
+If this happens, typically the symptom is an Oops (panic) when
+the subsystem attempts to call into a driver that has been unloaded.
+
+
+Disable Device from responding to MMIO/IO Port addresses
+--------------------------------------------------------
+iounmap() MMIO or IO Port resources and then call pci_disable_device().
+This is the symmetric opposite of pci_enable_device().
+Do not access device registers after calling pci_disable_device().
+
+
+Release MMIO/IO Port Resource(s)
+--------------------------------
+Call pci_release_region() to mark the MMIO or IO Port range as available.
+Failure to do so usually results in the inability to reload the driver.
+
+
+How to access PCI config space
+==============================
+
+You can use `pci_(read|write)_config_(byte|word|dword)` to access the config
+space of a device represented by `struct pci_dev *`. All these functions return
+0 when successful or an error code (`PCIBIOS_...`) which can be translated to a
+text string by pcibios_strerror. Most drivers expect that accesses to valid PCI
+devices don't fail.
+
+If you don't have a struct pci_dev available, you can call
+`pci_bus_(read|write)_config_(byte|word|dword)` to access a given device
+and function on that bus.
+
+If you access fields in the standard portion of the config header, please
+use symbolic names of locations and bits declared in <linux/pci_regs.h>.
+
+If you need to access Extended PCI Capability registers, just call
+pci_find_capability() for the particular capability and it will find the
+corresponding register block for you.
+
+
+Other interesting functions
+===========================
+
+============================= ================================================
+pci_get_domain_bus_and_slot() Find pci_dev corresponding to given domain,
+                              bus, and slot number. If the device is
+                              found, its reference count is increased.
+pci_set_power_state()         Set PCI Power Management state (0=D0 ... 3=D3)
+pci_find_capability()         Find specified capability in device's capability
+                              list.
+pci_resource_start()          Returns bus start address for a given PCI region
+pci_resource_end()            Returns bus end address for a given PCI region
+pci_resource_len()            Returns the byte length of a PCI region
+pci_set_drvdata()             Set private driver data pointer for a pci_dev
+pci_get_drvdata()             Return private driver data pointer for a pci_dev
+pci_set_mwi()                 Enable Memory-Write-Invalidate transactions.
+pci_clear_mwi()               Disable Memory-Write-Invalidate transactions.
+============================= ================================================
+
+
+Miscellaneous hints
+===================
+
+When displaying PCI device names to the user (for example when a driver wants
+to tell the user what card it has found), please use pci_name(pci_dev).
+
+Always refer to the PCI devices by a pointer to the pci_dev structure.
+All PCI layer functions use this identification and it's the only
+reasonable one. Don't use bus/slot/function numbers except for very
+special purposes -- on systems with multiple primary buses their semantics
+can be pretty complex.
+
+Don't try to turn on Fast Back to Back writes in your driver. All devices
+on the bus need to be capable of doing it, so this is something which needs
+to be handled by platform and generic code, not individual drivers.
+
+
+Vendor and device identifications
+=================================
+
+Do not add new device or vendor IDs to include/linux/pci_ids.h unless they
+are shared across multiple drivers. You can add private definitions in
+your driver if they're helpful, or just use plain hex constants.
+
+The device IDs are arbitrary hex numbers (vendor controlled) and normally used
+only in a single location, the pci_device_id table.
+
+Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/.
+There are mirrors of the pci.ids file at http://pciids.sourceforge.net/
+and https://github.com/pciutils/pciids.
+
+
+Obsolete functions
+==================
+
+There are several functions which you might come across when trying to
+port an old driver to the new PCI interface. They are no longer present
+in the kernel as they aren't compatible with hotplug or PCI domains or
+having sane locking.
+
+================= ===========================================
+pci_find_device() Superseded by pci_get_device()
+pci_find_subsys() Superseded by pci_get_subsys()
+pci_find_slot()   Superseded by pci_get_domain_bus_and_slot()
+pci_get_slot()    Superseded by pci_get_domain_bus_and_slot()
+================= ===========================================
+
+The alternative is the traditional PCI device driver that walks PCI
+device lists. This is still possible but discouraged.
+
+
+MMIO Space and "Write Posting"
+==============================
+
+Converting a driver from using I/O Port space to using MMIO space
+often requires some additional changes. Specifically, "write posting"
+needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2)
+already do this. I/O Port space guarantees write transactions reach the PCI
+device before the CPU can continue. Writes to MMIO space allow the CPU
+to continue before the transaction reaches the PCI device. HW weenies
+call this "Write Posting" because the write completion is "posted" to
+the CPU before the transaction has reached its destination.
+
+Thus, timing sensitive code should add readl() where the CPU is
+expected to wait before doing other work. The classic "bit banging"
+sequence works fine for I/O Port space::
+
+	for (i = 8; --i; val >>= 1) {
+		outb(val & 1, ioport_reg); /* write bit */
+		udelay(10);
+	}
+
+The same sequence for MMIO space should be::
+
+	for (i = 8; --i; val >>= 1) {
+		writeb(val & 1, mmio_reg); /* write bit */
+		readb(safe_mmio_reg); /* flush posted write */
+		udelay(10);
+	}
+
+It is important that "safe_mmio_reg" not have any side effects that
+interfere with the correct operation of the device.
+
+Another case to watch out for is when resetting a PCI device. Use PCI
+Configuration space reads to flush the writel(). This will gracefully
+handle the PCI master abort on all platforms if the PCI device is
+expected to not respond to a readl(). Most x86 platforms will allow
+MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage
+(e.g. ~0). But many RISC platforms will crash (a.k.a. "Hard Fail").
diff --git a/Documentation/PCI/pci.txt b/Documentation/PCI/pci.txt
deleted file mode 100644
index badb26ac33dc..000000000000
--- a/Documentation/PCI/pci.txt
+++ /dev/null
@@ -1,636 +0,0 @@
-
-		How To Write Linux PCI Drivers
-
-		by Martin Mares <mj@ucw.cz> on 07-Feb-2000
-		updated by Grant Grundler <grundler@parisc-linux.org> on 23-Dec-2006
-
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The world of PCI is vast and full of (mostly unpleasant) surprises.
-Since each CPU architecture implements different chip-sets and PCI devices
-have different requirements (erm, "features"), the result is the PCI support
-in the Linux kernel is not as trivial as one would wish. This short paper
-tries to introduce all potential driver authors to Linux APIs for
-PCI device drivers.
-
-A more complete resource is the third edition of "Linux Device Drivers"
-by Jonathan Corbet, Alessandro Rubini, and Greg Kroah-Hartman.
-LDD3 is available for free (under Creative Commons License) from:
-
-	http://lwn.net/Kernel/LDD3/
-
-However, keep in mind that all documents are subject to "bit rot".
-Refer to the source code if things are not working as described here.
-
-Please send questions/comments/patches about Linux PCI API to the
-"Linux PCI" <linux-pci@atrey.karlin.mff.cuni.cz> mailing list.
-
-
-
-0. Structure of PCI drivers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PCI drivers "discover" PCI devices in a system via pci_register_driver().
-Actually, it's the other way around. When the PCI generic code discovers
-a new device, the driver with a matching "description" will be notified.
-Details on this below.
-
-pci_register_driver() leaves most of the probing for devices to
-the PCI layer and supports online insertion/removal of devices [thus
-supporting hot-pluggable PCI, CardBus, and Express-Card in a single driver].
-pci_register_driver() call requires passing in a table of function
-pointers and thus dictates the high level structure of a driver.
-
-Once the driver knows about a PCI device and takes ownership, the
-driver generally needs to perform the following initialization:
-
-	Enable the device
-	Request MMIO/IOP resources
-	Set the DMA mask size (for both coherent and streaming DMA)
-	Allocate and initialize shared control data (pci_allocate_coherent())
-	Access device configuration space (if needed)
-	Register IRQ handler (request_irq())
-	Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip)
-	Enable DMA/processing engines
-
-When done using the device, and perhaps the module needs to be unloaded,
-the driver needs to take the follow steps:
-	Disable the device from generating IRQs
-	Release the IRQ (free_irq())
-	Stop all DMA activity
-	Release DMA buffers (both streaming and coherent)
-	Unregister from other subsystems (e.g. scsi or netdev)
-	Release MMIO/IOP resources
-	Disable the device
-
-Most of these topics are covered in the following sections.
-For the rest look at LDD3 or <linux/pci.h>.
-
-If the PCI subsystem is not configured (CONFIG_PCI is not set), most of
-the PCI functions described below are defined as inline functions either
-completely empty or just returning an appropriate error codes to avoid
-lots of ifdefs in the drivers.
-
-
-
-1. pci_register_driver() call
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-PCI device drivers call pci_register_driver() during their
-initialization with a pointer to a structure describing the driver
-(struct pci_driver):
-
-	field name	Description
-	----------	------------------------------------------------------
-	id_table	Pointer to table of device ID's the driver is
-			interested in.  Most drivers should export this
-			table using MODULE_DEVICE_TABLE(pci,...).
-
-	probe		This probing function gets called (during execution
-			of pci_register_driver() for already existing
-			devices or later if a new device gets inserted) for
-			all PCI devices which match the ID table and are not
-			"owned" by the other drivers yet. This function gets
-			passed a "struct pci_dev *" for each device whose
-			entry in the ID table matches the device. The probe
-			function returns zero when the driver chooses to
-			take "ownership" of the device or an error code
-			(negative number) otherwise.
-			The probe function always gets called from process
-			context, so it can sleep.
-
-	remove		The remove() function gets called whenever a device
-			being handled by this driver is removed (either during
-			deregistration of the driver or when it's manually
-			pulled out of a hot-pluggable slot).
-			The remove function always gets called from process
-			context, so it can sleep.
-
-	suspend		Put device into low power state.
-	suspend_late	Put device into low power state.
-
-	resume_early	Wake device from low power state.
-	resume		Wake device from low power state.
-
-		(Please see Documentation/power/pci.txt for descriptions
-		of PCI Power Management and the related functions.)
-
-	shutdown	Hook into reboot_notifier_list (kernel/sys.c).
-			Intended to stop any idling DMA operations.
-			Useful for enabling wake-on-lan (NIC) or changing
-			the power state of a device before reboot.
-			e.g. drivers/net/e100.c.
-
-	err_handler	See Documentation/PCI/pci-error-recovery.txt
-
-
-The ID table is an array of struct pci_device_id entries ending with an
-all-zero entry.  Definitions with static const are generally preferred.
-
-Each entry consists of:
-
-	vendor,device	Vendor and device ID to match (or PCI_ANY_ID)
-
-	subvendor,	Subsystem vendor and device ID to match (or PCI_ANY_ID)
-	subdevice,
-
-	class		Device class, subclass, and "interface" to match.
-			See Appendix D of the PCI Local Bus Spec or
-			include/linux/pci_ids.h for a full list of classes.
-			Most drivers do not need to specify class/class_mask
-			as vendor/device is normally sufficient.
-
-	class_mask	limit which sub-fields of the class field are compared.
-			See drivers/scsi/sym53c8xx_2/ for example of usage.
-
-	driver_data	Data private to the driver.
-			Most drivers don't need to use driver_data field.
-			Best practice is to use driver_data as an index
-			into a static list of equivalent device types,
-			instead of using it as a pointer.
-
-
-Most drivers only need PCI_DEVICE() or PCI_DEVICE_CLASS() to set up
-a pci_device_id table.
-
-New PCI IDs may be added to a device driver pci_ids table at runtime
-as shown below:
-
-echo "vendor device subvendor subdevice class class_mask driver_data" > \
-/sys/bus/pci/drivers/{driver}/new_id
-
-All fields are passed in as hexadecimal values (no leading 0x).
-The vendor and device fields are mandatory, the others are optional. Users
-need pass only as many optional fields as necessary:
-	o subvendor and subdevice fields default to PCI_ANY_ID (FFFFFFFF)
-	o class and classmask fields default to 0
-	o driver_data defaults to 0UL.
-
-Note that driver_data must match the value used by any of the pci_device_id
-entries defined in the driver. This makes the driver_data field mandatory
-if all the pci_device_id entries have a non-zero driver_data value.
-
-Once added, the driver probe routine will be invoked for any unclaimed
-PCI devices listed in its (newly updated) pci_ids list.
-
-When the driver exits, it just calls pci_unregister_driver() and the PCI layer
-automatically calls the remove hook for all devices handled by the driver.
-
-
-1.1 "Attributes" for driver functions/data
-
-Please mark the initialization and cleanup functions where appropriate
-(the corresponding macros are defined in <linux/init.h>):
-
-	__init		Initialization code. Thrown away after the driver
-			initializes.
-	__exit		Exit code. Ignored for non-modular drivers.
-
-Tips on when/where to use the above attributes:
-	o The module_init()/module_exit() functions (and all
-	  initialization functions called _only_ from these)
-	  should be marked __init/__exit.
-
-	o Do not mark the struct pci_driver.
- - o Do NOT mark a function if you are not sure which mark to use. - Better to not mark the function than mark the function wrong. - - - -2. How to find PCI devices manually -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -PCI drivers should have a really good reason for not using the -pci_register_driver() interface to search for PCI devices. -The main reason PCI devices are controlled by multiple drivers -is because one PCI device implements several different HW services. -E.g. combined serial/parallel port/floppy controller. - -A manual search may be performed using the following constructs: - -Searching by vendor and device ID: - - struct pci_dev *dev = NULL; - while (dev = pci_get_device(VENDOR_ID, DEVICE_ID, dev)) - configure_device(dev); - -Searching by class ID (iterate in a similar way): - - pci_get_class(CLASS_ID, dev) - -Searching by both vendor/device and subsystem vendor/device ID: - - pci_get_subsys(VENDOR_ID,DEVICE_ID, SUBSYS_VENDOR_ID, SUBSYS_DEVICE_ID, dev). - -You can use the constant PCI_ANY_ID as a wildcard replacement for -VENDOR_ID or DEVICE_ID. This allows searching for any device from a -specific vendor, for example. - -These functions are hotplug-safe. They increment the reference count on -the pci_dev that they return. You must eventually (possibly at module unload) -decrement the reference count on these devices by calling pci_dev_put(). - - - -3. Device Initialization Steps -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -As noted in the introduction, most PCI drivers need the following steps -for device initialization: - - Enable the device - Request MMIO/IOP resources - Set the DMA mask size (for both coherent and streaming DMA) - Allocate and initialize shared control data (pci_allocate_coherent()) - Access device configuration space (if needed) - Register IRQ handler (request_irq()) - Initialize non-PCI (i.e. LAN/SCSI/etc parts of the chip) - Enable DMA/processing engines. - -The driver can access PCI config space registers at any time. -(Well, almost. When running BIST, config space can go away...but -that will just result in a PCI Bus Master Abort and config reads -will return garbage). - - -3.1 Enable the PCI device -~~~~~~~~~~~~~~~~~~~~~~~~~ -Before touching any device registers, the driver needs to enable -the PCI device by calling pci_enable_device(). This will: - o wake up the device if it was in suspended state, - o allocate I/O and memory regions of the device (if BIOS did not), - o allocate an IRQ (if BIOS did not). - -NOTE: pci_enable_device() can fail! Check the return value. - -[ OS BUG: we don't check resource allocations before enabling those - resources. The sequence would make more sense if we called - pci_request_resources() before calling pci_enable_device(). - Currently, the device drivers can't detect the bug when when two - devices have been allocated the same range. This is not a common - problem and unlikely to get fixed soon. - - This has been discussed before but not changed as of 2.6.19: - http://lkml.org/lkml/2006/3/2/194 -] - -pci_set_master() will enable DMA by setting the bus master bit -in the PCI_COMMAND register. It also fixes the latency timer value if -it's set to something bogus by the BIOS. pci_clear_master() will -disable DMA by clearing the bus master bit. - -If the PCI device can use the PCI Memory-Write-Invalidate transaction, -call pci_set_mwi(). This enables the PCI_COMMAND bit for Mem-Wr-Inval -and also ensures that the cache line size register is set correctly. 
-Check the return value of pci_set_mwi() as not all architectures -or chip-sets may support Memory-Write-Invalidate. Alternatively, -if Mem-Wr-Inval would be nice to have but is not required, call -pci_try_set_mwi() to have the system do its best effort at enabling -Mem-Wr-Inval. - - -3.2 Request MMIO/IOP resources -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Memory (MMIO), and I/O port addresses should NOT be read directly -from the PCI device config space. Use the values in the pci_dev structure -as the PCI "bus address" might have been remapped to a "host physical" -address by the arch/chip-set specific kernel support. - -See Documentation/io-mapping.txt for how to access device registers -or device memory. - -The device driver needs to call pci_request_region() to verify -no other device is already using the same address resource. -Conversely, drivers should call pci_release_region() AFTER -calling pci_disable_device(). -The idea is to prevent two devices colliding on the same address range. - -[ See OS BUG comment above. Currently (2.6.19), The driver can only - determine MMIO and IO Port resource availability _after_ calling - pci_enable_device(). ] - -Generic flavors of pci_request_region() are request_mem_region() -(for MMIO ranges) and request_region() (for IO Port ranges). -Use these for address resources that are not described by "normal" PCI -BARs. - -Also see pci_request_selected_regions() below. - - -3.3 Set the DMA mask size -~~~~~~~~~~~~~~~~~~~~~~~~~ -[ If anything below doesn't make sense, please refer to - Documentation/DMA-API.txt. This section is just a reminder that - drivers need to indicate DMA capabilities of the device and is not - an authoritative source for DMA interfaces. ] - -While all drivers should explicitly indicate the DMA capability -(e.g. 32 or 64 bit) of the PCI bus master, devices with more than -32-bit bus master capability for streaming data need the driver -to "register" this capability by calling pci_set_dma_mask() with -appropriate parameters. In general this allows more efficient DMA -on systems where System RAM exists above 4G _physical_ address. - -Drivers for all PCI-X and PCIe compliant devices must call -pci_set_dma_mask() as they are 64-bit DMA devices. - -Similarly, drivers must also "register" this capability if the device -can directly address "consistent memory" in System RAM above 4G physical -address by calling pci_set_consistent_dma_mask(). -Again, this includes drivers for all PCI-X and PCIe compliant devices. -Many 64-bit "PCI" devices (before PCI-X) and some PCI-X devices are -64-bit DMA capable for payload ("streaming") data but not control -("consistent") data. - - -3.4 Setup shared control data -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once the DMA masks are set, the driver can allocate "consistent" (a.k.a. shared) -memory. See Documentation/DMA-API.txt for a full description of -the DMA APIs. This section is just a reminder that it needs to be done -before enabling DMA on the device. - - -3.5 Initialize device registers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Some drivers will need specific "capability" fields programmed -or other "vendor specific" register initialized or reset. -E.g. clearing pending interrupts. - - -3.6 Register IRQ handler -~~~~~~~~~~~~~~~~~~~~~~~~ -While calling request_irq() is the last step described here, -this is often just another intermediate step to initialize a device. -This step can often be deferred until the device is opened for use. 
- -All interrupt handlers for IRQ lines should be registered with IRQF_SHARED -and use the devid to map IRQs to devices (remember that all PCI IRQ lines -can be shared). - -request_irq() will associate an interrupt handler and device handle -with an interrupt number. Historically interrupt numbers represent -IRQ lines which run from the PCI device to the Interrupt controller. -With MSI and MSI-X (more below) the interrupt number is a CPU "vector". - -request_irq() also enables the interrupt. Make sure the device is -quiesced and does not have any interrupts pending before registering -the interrupt handler. - -MSI and MSI-X are PCI capabilities. Both are "Message Signaled Interrupts" -which deliver interrupts to the CPU via a DMA write to a Local APIC. -The fundamental difference between MSI and MSI-X is how multiple -"vectors" get allocated. MSI requires contiguous blocks of vectors -while MSI-X can allocate several individual ones. - -MSI capability can be enabled by calling pci_alloc_irq_vectors() with the -PCI_IRQ_MSI and/or PCI_IRQ_MSIX flags before calling request_irq(). This -causes the PCI support to program CPU vector data into the PCI device -capability registers. Many architectures, chip-sets, or BIOSes do NOT -support MSI or MSI-X and a call to pci_alloc_irq_vectors with just -the PCI_IRQ_MSI and PCI_IRQ_MSIX flags will fail, so try to always -specify PCI_IRQ_LEGACY as well. - -Drivers that have different interrupt handlers for MSI/MSI-X and -legacy INTx should chose the right one based on the msi_enabled -and msix_enabled flags in the pci_dev structure after calling -pci_alloc_irq_vectors. - -There are (at least) two really good reasons for using MSI: -1) MSI is an exclusive interrupt vector by definition. - This means the interrupt handler doesn't have to verify - its device caused the interrupt. - -2) MSI avoids DMA/IRQ race conditions. DMA to host memory is guaranteed - to be visible to the host CPU(s) when the MSI is delivered. This - is important for both data coherency and avoiding stale control data. - This guarantee allows the driver to omit MMIO reads to flush - the DMA stream. - -See drivers/infiniband/hw/mthca/ or drivers/net/tg3.c for examples -of MSI/MSI-X usage. - - - -4. PCI device shutdown -~~~~~~~~~~~~~~~~~~~~~~~ - -When a PCI device driver is being unloaded, most of the following -steps need to be performed: - - Disable the device from generating IRQs - Release the IRQ (free_irq()) - Stop all DMA activity - Release DMA buffers (both streaming and consistent) - Unregister from other subsystems (e.g. scsi or netdev) - Disable device from responding to MMIO/IO Port addresses - Release MMIO/IO Port resource(s) - - -4.1 Stop IRQs on the device -~~~~~~~~~~~~~~~~~~~~~~~~~~~ -How to do this is chip/device specific. If it's not done, it opens -the possibility of a "screaming interrupt" if (and only if) -the IRQ is shared with another device. - -When the shared IRQ handler is "unhooked", the remaining devices -using the same IRQ line will still need the IRQ enabled. Thus if the -"unhooked" device asserts IRQ line, the system will respond assuming -it was one of the remaining devices asserted the IRQ line. Since none -of the other devices will handle the IRQ, the system will "hang" until -it decides the IRQ isn't going to get handled and masks the IRQ (100,000 -iterations later). Once the shared IRQ is masked, the remaining devices -will stop functioning properly. Not a nice situation. - -This is another reason to use MSI or MSI-X if it's available. 
-MSI and MSI-X are defined to be exclusive interrupts and thus
-are not susceptible to the "screaming interrupt" problem.
-
-
-4.2 Release the IRQ
-~~~~~~~~~~~~~~~~~~~
-Once the device is quiesced (no more IRQs), one can call free_irq().
-This function will return control once any pending IRQs are handled,
-"unhook" the drivers IRQ handler from that IRQ, and finally release
-the IRQ if no one else is using it.
-
-
-4.3 Stop all DMA activity
-~~~~~~~~~~~~~~~~~~~~~~~~~
-It's extremely important to stop all DMA operations BEFORE attempting
-to deallocate DMA control data. Failure to do so can result in memory
-corruption, hangs, and on some chip-sets a hard crash.
-
-Stopping DMA after stopping the IRQs can avoid races where the
-IRQ handler might restart DMA engines.
-
-While this step sounds obvious and trivial, several "mature" drivers
-didn't get this step right in the past.
-
-
-4.4 Release DMA buffers
-~~~~~~~~~~~~~~~~~~~~~~~
-Once DMA is stopped, clean up streaming DMA first.
-I.e. unmap data buffers and return buffers to "upstream"
-owners if there is one.
-
-Then clean up "consistent" buffers which contain the control data.
-
-See Documentation/DMA-API.txt for details on unmapping interfaces.
-
-
-4.5 Unregister from other subsystems
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Most low level PCI device drivers support some other subsystem
-like USB, ALSA, SCSI, NetDev, Infiniband, etc. Make sure your
-driver isn't losing resources from that other subsystem.
-If this happens, typically the symptom is an Oops (panic) when
-the subsystem attempts to call into a driver that has been unloaded.
-
-
-4.6 Disable Device from responding to MMIO/IO Port addresses
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-io_unmap() MMIO or IO Port resources and then call pci_disable_device().
-This is the symmetric opposite of pci_enable_device().
-Do not access device registers after calling pci_disable_device().
-
-
-4.7 Release MMIO/IO Port Resource(s)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Call pci_release_region() to mark the MMIO or IO Port range as available.
-Failure to do so usually results in the inability to reload the driver.
-
-
-
-5. How to access PCI config space
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-You can use pci_(read|write)_config_(byte|word|dword) to access the config
-space of a device represented by struct pci_dev *. All these functions return 0
-when successful or an error code (PCIBIOS_...) which can be translated to a text
-string by pcibios_strerror. Most drivers expect that accesses to valid PCI
-devices don't fail.
-
-If you don't have a struct pci_dev available, you can call
-pci_bus_(read|write)_config_(byte|word|dword) to access a given device
-and function on that bus.
-
-If you access fields in the standard portion of the config header, please
-use symbolic names of locations and bits declared in <linux/pci_regs.h>.
-
-If you need to access Extended PCI Capability registers, just call
-pci_find_capability() for the particular capability and it will find the
-corresponding register block for you.
-
-
-
-6. Other interesting functions
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-pci_get_domain_bus_and_slot()	Find pci_dev corresponding to given domain,
-				bus and slot and number. If the device is
-				found, its reference count is increased.
-pci_set_power_state()		Set PCI Power Management state (0=D0 ... 3=D3)
-pci_find_capability()		Find specified capability in device's capability
-				list.
-pci_resource_start() Returns bus start address for a given PCI region -pci_resource_end() Returns bus end address for a given PCI region -pci_resource_len() Returns the byte length of a PCI region -pci_set_drvdata() Set private driver data pointer for a pci_dev -pci_get_drvdata() Return private driver data pointer for a pci_dev -pci_set_mwi() Enable Memory-Write-Invalidate transactions. -pci_clear_mwi() Disable Memory-Write-Invalidate transactions. - - - -7. Miscellaneous hints -~~~~~~~~~~~~~~~~~~~~~~ - -When displaying PCI device names to the user (for example when a driver wants -to tell the user what card has it found), please use pci_name(pci_dev). - -Always refer to the PCI devices by a pointer to the pci_dev structure. -All PCI layer functions use this identification and it's the only -reasonable one. Don't use bus/slot/function numbers except for very -special purposes -- on systems with multiple primary buses their semantics -can be pretty complex. - -Don't try to turn on Fast Back to Back writes in your driver. All devices -on the bus need to be capable of doing it, so this is something which needs -to be handled by platform and generic code, not individual drivers. - - - -8. Vendor and device identifications -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Do not add new device or vendor IDs to include/linux/pci_ids.h unless they -are shared across multiple drivers. You can add private definitions in -your driver if they're helpful, or just use plain hex constants. - -The device IDs are arbitrary hex numbers (vendor controlled) and normally used -only in a single location, the pci_device_id table. - -Please DO submit new vendor/device IDs to http://pci-ids.ucw.cz/. -There are mirrors of the pci.ids file at http://pciids.sourceforge.net/ -and https://github.com/pciutils/pciids. - - - -9. Obsolete functions -~~~~~~~~~~~~~~~~~~~~~ - -There are several functions which you might come across when trying to -port an old driver to the new PCI interface. They are no longer present -in the kernel as they aren't compatible with hotplug or PCI domains or -having sane locking. - -pci_find_device() Superseded by pci_get_device() -pci_find_subsys() Superseded by pci_get_subsys() -pci_find_slot() Superseded by pci_get_domain_bus_and_slot() -pci_get_slot() Superseded by pci_get_domain_bus_and_slot() - - -The alternative is the traditional PCI device driver that walks PCI -device lists. This is still possible but discouraged. - - - -10. MMIO Space and "Write Posting" -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Converting a driver from using I/O Port space to using MMIO space -often requires some additional changes. Specifically, "write posting" -needs to be handled. Many drivers (e.g. tg3, acenic, sym53c8xx_2) -already do this. I/O Port space guarantees write transactions reach the PCI -device before the CPU can continue. Writes to MMIO space allow the CPU -to continue before the transaction reaches the PCI device. HW weenies -call this "Write Posting" because the write completion is "posted" to -the CPU before the transaction has reached its destination. - -Thus, timing sensitive code should add readl() where the CPU is -expected to wait before doing other work. 
The classic "bit banging" -sequence works fine for I/O Port space: - - for (i = 8; --i; val >>= 1) { - outb(val & 1, ioport_reg); /* write bit */ - udelay(10); - } - -The same sequence for MMIO space should be: - - for (i = 8; --i; val >>= 1) { - writeb(val & 1, mmio_reg); /* write bit */ - readb(safe_mmio_reg); /* flush posted write */ - udelay(10); - } - -It is important that "safe_mmio_reg" not have any side effects that -interferes with the correct operation of the device. - -Another case to watch out for is when resetting a PCI device. Use PCI -Configuration space reads to flush the writel(). This will gracefully -handle the PCI master abort on all platforms if the PCI device is -expected to not respond to a readl(). Most x86 platforms will allow -MMIO reads to master abort (a.k.a. "Soft Fail") and return garbage -(e.g. ~0). But many RISC platforms will crash (a.k.a."Hard Fail"). - diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 448621c32e4d..664c0fb1d53d 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -16,6 +16,25 @@ typedef unsigned long kernel_ulong_t; #define PCI_ANY_ID (~0) +/** + * struct pci_device_id - PCI device ID structure + * @vendor: Vendor ID to match (or PCI_ANY_ID) + * @device: Device ID to match (or PCI_ANY_ID) + * @subvendor: Subsystem vendor ID to match (or PCI_ANY_ID) + * @subdevice: Subsystem device ID to match (or PCI_ANY_ID) + * @class: Device class, subclass, and "interface" to match. + * See Appendix D of the PCI Local Bus Spec or + * include/linux/pci_ids.h for a full list of classes. + * Most drivers do not need to specify class/class_mask + * as vendor/device is normally sufficient. + * @class_mask: Limit which sub-fields of the class field are compared. + * See drivers/scsi/sym53c8xx_2/ for example of usage. + * @driver_data: Data private to the driver. + * Most drivers don't need to use driver_data field. + * Best practice is to use driver_data as an index + * into a static list of equivalent device types, + * instead of using it as a pointer. + */ struct pci_device_id { __u32 vendor, device; /* Vendor and device ID or PCI_ANY_ID*/ __u32 subvendor, subdevice; /* Subsystem ID's or PCI_ANY_ID */ @@ -257,17 +276,17 @@ struct pcmcia_device_id { __u16 match_flags; __u16 manf_id; - __u16 card_id; + __u16 card_id; - __u8 func_id; + __u8 func_id; /* for real multi-function devices */ - __u8 function; + __u8 function; /* for pseudo multi-function devices */ - __u8 device_no; + __u8 device_no; - __u32 prod_id_hash[4]; + __u32 prod_id_hash[4]; /* not matched against in kernelspace */ const char * prod_id[4]; diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..b74b2a4e6df2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -151,6 +151,8 @@ static inline const char *pci_power_name(pci_power_t state) #define PCI_PM_BUS_WAIT 50 /** + * typedef pci_channel_state_t + * * The pci_channel state describes connectivity between the CPU and * the PCI device. If some PCI bus between here and the PCI device * has crashed or locked up, this info is reflected here. @@ -775,6 +777,50 @@ struct pci_error_handlers { struct module; + +/** + * struct pci_driver - PCI driver structure + * @node: List of driver structures. + * @name: Driver name. + * @id_table: Pointer to table of device IDs the driver is + * interested in. Most drivers should export this + * table using MODULE_DEVICE_TABLE(pci,...). 
+ * @probe:		This probing function gets called (during execution
+ *			of pci_register_driver() for already existing
+ *			devices or later if a new device gets inserted) for
+ *			all PCI devices which match the ID table and are not
+ *			"owned" by the other drivers yet. This function gets
+ *			passed a "struct pci_dev \*" for each device whose
+ *			entry in the ID table matches the device. The probe
+ *			function returns zero when the driver chooses to
+ *			take "ownership" of the device or an error code
+ *			(negative number) otherwise.
+ *			The probe function always gets called from process
+ *			context, so it can sleep.
+ * @remove:		The remove() function gets called whenever a device
+ *			being handled by this driver is removed (either during
+ *			deregistration of the driver or when it's manually
+ *			pulled out of a hot-pluggable slot).
+ *			The remove function always gets called from process
+ *			context, so it can sleep.
+ * @suspend:		Put device into low power state.
+ * @suspend_late:	Put device into low power state.
+ * @resume_early:	Wake device from low power state.
+ * @resume:		Wake device from low power state.
+ *			(Please see Documentation/power/pci.txt for descriptions
+ *			of PCI Power Management and the related functions.)
+ * @shutdown:		Hook into reboot_notifier_list (kernel/sys.c).
+ *			Intended to stop any idling DMA operations.
+ *			Useful for enabling wake-on-lan (NIC) or changing
+ *			the power state of a device before reboot.
+ *			e.g. drivers/net/e100.c.
+ * @sriov_configure:	Optional driver callback to allow configuration of
+ *			number of VFs to enable via sysfs "sriov_numvfs" file.
+ * @err_handler:	See Documentation/PCI/pci-error-recovery.rst
+ * @groups:		Sysfs attribute groups.
+ * @driver:		Driver model structure.
+ * @dynids:		List of dynamically added device IDs.
+ */
 struct pci_driver {
 	struct list_head	node;
 	const char		*name;
@@ -2206,7 +2252,7 @@ static inline u8 pci_vpd_srdt_tag(const u8 *srdt)
 
 /**
  * pci_vpd_info_field_size - Extracts the information field length
- * @lrdt: Pointer to the beginning of an information field header
+ * @info_field: Pointer to the beginning of an information field header
  *
  * Returns the extracted information field length.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 87e5e6dab6c2a21fab2620f37786276d202e2ce0 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 14 May 2019 16:02:22 -0600
Subject: uio: make import_iovec()/compat_import_iovec() return bytes on success

Currently these functions return < 0 on error, and 0 for success.
Change that so that we return < 0 on error, and the number of bytes
on success.

Some callers already treat the return value that way, others need a
slight tweak.
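
For example, a read-side caller now does (sketch only; variable names as
used in the diff below):

	ret = import_iovec(READ, uvec, nr_segs, UIO_FASTIOV, &iov, &iter);
	if (ret < 0)
		return ret;
	/* ret is the number of bytes now described by iter */
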
Signed-off-by: Jens Axboe --- fs/aio.c | 9 +++++---- fs/io_uring.c | 16 ++++++++-------- fs/splice.c | 8 ++++---- include/linux/uio.h | 4 ++-- lib/iov_iter.c | 15 ++++++++------- net/compat.c | 3 ++- net/socket.c | 3 ++- 7 files changed, 31 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/aio.c b/fs/aio.c index 3490d1fa0e16..41824c710b36 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1479,8 +1479,9 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) return 0; } -static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec, - bool vectored, bool compat, struct iov_iter *iter) +static ssize_t aio_setup_rw(int rw, const struct iocb *iocb, + struct iovec **iovec, bool vectored, bool compat, + struct iov_iter *iter) { void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf; size_t len = iocb->aio_nbytes; @@ -1537,7 +1538,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, return -EINVAL; ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter); - if (ret) + if (ret < 0) return ret; ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) @@ -1565,7 +1566,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, return -EINVAL; ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter); - if (ret) + if (ret < 0) return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { diff --git a/fs/io_uring.c b/fs/io_uring.c index 0fbb486a320e..23e08c10f486 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1003,9 +1003,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw, return 0; } -static int io_import_iovec(struct io_ring_ctx *ctx, int rw, - const struct sqe_submit *s, struct iovec **iovec, - struct iov_iter *iter) +static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw, + const struct sqe_submit *s, struct iovec **iovec, + struct iov_iter *iter) { const struct io_uring_sqe *sqe = s->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -1023,7 +1023,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw, opcode = READ_ONCE(sqe->opcode); if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - int ret = io_import_fixed(ctx, rw, sqe, iter); + ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); *iovec = NULL; return ret; } @@ -1089,7 +1089,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, struct iov_iter iter; struct file *file; size_t iov_count; - int ret; + ssize_t ret; ret = io_prep_rw(req, s, force_nonblock); if (ret) @@ -1102,7 +1102,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); - if (ret) + if (ret < 0) return ret; iov_count = iov_iter_count(&iter); @@ -1136,7 +1136,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, struct iov_iter iter; struct file *file; size_t iov_count; - int ret; + ssize_t ret; ret = io_prep_rw(req, s, force_nonblock); if (ret) @@ -1149,7 +1149,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); - if (ret) + if (ret < 0) return ret; iov_count = iov_iter_count(&iter); diff --git a/fs/splice.c b/fs/splice.c index 14cb602d9a2f..98412721f056 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1356,7 +1356,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, struct iovec iovstack[UIO_FASTIOV]; struct iovec 
*iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1367,7 +1367,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 
 	error = import_iovec(type, uiov, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
@@ -1382,7 +1382,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1393,7 +1393,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 	error = compat_import_iovec(type, iov32, nr_segs,
 				    ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2d0131ad4604..a61ceb6575ab 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -279,13 +279,13 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
 
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 
 #ifdef CONFIG_COMPAT
 struct compat_iovec;
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
+ssize_t compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 #endif
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f99c41d4eb54..f1e0569b4539 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1634,9 +1634,9 @@ EXPORT_SYMBOL(dup_iter);
  * on-stack array was used or not (and regardless of whether this function
  * returns an error or not).
  *
- * Return: 0 on success or negative error code on error.
+ * Return: Negative error code on error, bytes imported on success
  */
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i)
 {
@@ -1652,16 +1652,17 @@ int import_iovec(int type, const struct iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 EXPORT_SYMBOL(import_iovec);
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
-		 unsigned nr_segs, unsigned fast_segs,
-		 struct iovec **iov, struct iov_iter *i)
+ssize_t compat_import_iovec(int type,
+		 const struct compat_iovec __user * uvector,
+		 unsigned nr_segs, unsigned fast_segs,
+		 struct iovec **iov, struct iov_iter *i)
 {
 	ssize_t n;
 	struct iovec *p;
@@ -1675,7 +1676,7 @@ int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 #endif
diff --git a/net/compat.c b/net/compat.c
index 3f9ce609397f..0f7ded26059e 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -80,9 +80,10 @@ int get_compat_msghdr(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return compat_import_iovec(save_addr ? READ : WRITE,
+	err = compat_import_iovec(save_addr ? READ : WRITE,
READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); + return err < 0 ? err : 0; } /* Bleech... */ diff --git a/net/socket.c b/net/socket.c index 72372dc5dd70..bffec466b4f1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2208,9 +2208,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_iocb = NULL; - return import_iovec(save_addr ? READ : WRITE, + err = import_iovec(save_addr ? READ : WRITE, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); + return err < 0 ? err : 0; } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, -- cgit v1.2.3-59-g8ed1b From 5213d7efc8ec26ed8938dce75427eff9275a62d9 Mon Sep 17 00:00:00 2001 From: Ruslan Babayev Date: Tue, 28 May 2019 16:02:32 -0700 Subject: i2c: acpi: export i2c_acpi_find_adapter_by_handle This allows drivers to look up I2C adapters on ACPI-based systems, similar to of_get_i2c_adapter_by_node() on DT-based systems. Signed-off-by: Ruslan Babayev Reviewed-by: Andrew Lunn Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-acpi.c | 3 ++- include/linux/i2c.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index 272800692088..964687534754 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -337,7 +337,7 @@ static int i2c_acpi_find_match_device(struct device *dev, void *data) return ACPI_COMPANION(dev) == data; } -static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) +struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) { struct device *dev; @@ -345,6 +345,7 @@ static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) i2c_acpi_find_match_adapter); return dev ?
i2c_verify_adapter(dev) : NULL; } +EXPORT_SYMBOL_GPL(i2c_acpi_find_adapter_by_handle); static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev) { diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 1308126fc384..e982b8913b73 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -14,6 +14,7 @@ #ifndef _LINUX_I2C_H #define _LINUX_I2C_H +#include <linux/acpi.h> /* for acpi_handle */ #include <linux/mod_devicetable.h> #include <linux/device.h> /* for struct device */ #include <linux/sched.h> /* for completion */ @@ -981,6 +982,7 @@ bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, u32 i2c_acpi_find_bus_speed(struct device *dev); struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, struct i2c_board_info *info); +struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle); #else static inline bool i2c_acpi_get_i2c_resource(struct acpi_resource *ares, struct acpi_resource_i2c_serialbus **i2c) @@ -996,6 +998,10 @@ static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, { return NULL; } +static inline struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) +{ + return NULL; +} #endif /* CONFIG_ACPI */ #endif /* _LINUX_I2C_H */ -- cgit v1.2.3-59-g8ed1b From a09db883e5d938b525a86a4630fc04f98ff1063d Mon Sep 17 00:00:00 2001 From: Uma Shankar Date: Tue, 4 Jun 2019 16:47:02 +0530 Subject: drm: Fix docbook warnings in hdr metadata helper structures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the following warnings: ./include/drm/drm_mode_config.h:841: warning: Incorrect use of kernel-doc format: * hdr_output_metadata_property: Connector property containing hdr ./include/drm/drm_mode_config.h:918: warning: Function parameter or member 'hdr_output_metadata_property' not described in 'drm_mode_config' ./include/drm/drm_connector.h:1251: warning: Function parameter or member 'hdr_output_metadata' not described in 'drm_connector' ./include/drm/drm_connector.h:1251: warning: Function parameter or member 'hdr_sink_metadata' not described in 'drm_connector' Also adds property documentation for the HDR Metadata connector property in the connector property creation function. v2: Fixed Sean Paul's review comments. v3: Fixed Daniel Vetter's review comments, added the UAPI structure definition section in kernel docs. v4: Fixed Daniel Vetter's review comments. v5: Added structure member references as per Daniel's suggestion. Cc: Shashank Sharma Cc: Ville Syrjälä Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Sean Paul Cc: David Airlie Cc: Daniel Vetter Cc: Bartlomiej Zolnierkiewicz Cc: "Ville Syrjälä" Cc: Hans Verkuil Cc: dri-devel@lists.freedesktop.org Cc: linux-fbdev@vger.kernel.org Reviewed-by: Sean Paul (v1) Signed-off-by: Uma Shankar [danvet: Fix up markup: () for functions, & for structs. Style guide also recommends to prepend struct for structures.]
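For context, the userspace flow described in the new property documentation can be exercised roughly as follows. This is a minimal libdrm-based sketch, assuming a libdrm new enough to ship the struct hdr_output_metadata UAPI added by this patch; fd, connector_id and hdr_prop_id are hypothetical values a real compositor would look up first via drmModeObjectGetProperties():

#include <stdint.h>
#include <xf86drm.h>
#include <xf86drmMode.h>	/* pulls in the drm_mode.h UAPI structs */

/* Push static HDR metadata to a connector via the
 * HDR_OUTPUT_METADATA blob property. */
static int set_hdr_metadata(int fd, uint32_t connector_id, uint32_t hdr_prop_id)
{
	struct hdr_output_metadata meta = { 0 };
	drmModeAtomicReq *req;
	uint32_t blob_id = 0;
	int ret;

	meta.metadata_type = 0;				/* Static Metadata Type 1 */
	meta.hdmi_metadata_type1.metadata_type = 0;
	meta.hdmi_metadata_type1.eotf = 2;		/* SMPTE ST 2084 (PQ), per CTA-861-G */
	meta.hdmi_metadata_type1.max_cll = 1000;	/* 1000 cd/m2 */
	meta.hdmi_metadata_type1.max_fall = 400;	/* 400 cd/m2 */

	/* Wrap the metadata in a blob and attach it in an atomic commit. */
	ret = drmModeCreatePropertyBlob(fd, &meta, sizeof(meta), &blob_id);
	if (ret)
		return ret;

	req = drmModeAtomicAlloc();
	drmModeAtomicAddProperty(req, connector_id, hdr_prop_id, blob_id);
	ret = drmModeAtomicCommit(fd, req, DRM_MODE_ATOMIC_ALLOW_MODESET, NULL);
	drmModeAtomicFree(req);
	return ret;
}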
Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/1559647022-7336-1-git-send-email-uma.shankar@intel.com --- drivers/gpu/drm/drm_connector.c | 40 ++++++++++++++++++++++ include/drm/drm_connector.h | 1 + include/drm/drm_mode_config.h | 4 +-- include/linux/hdmi.h | 12 +++++++ include/uapi/drm/drm_mode.h | 74 ++++++++++++++++++++++++++++++++++++++++- 5 files changed, 128 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c index c9ac8b9e83ea..e17586aaa80f 100644 --- a/drivers/gpu/drm/drm_connector.c +++ b/drivers/gpu/drm/drm_connector.c @@ -956,6 +956,46 @@ static const struct drm_prop_enum_list hdmi_colorspaces[] = { * is no longer protected and userspace should take appropriate action * (whatever that might be). * + * HDR_OUTPUT_METADATA: + * Connector property to enable userspace to send HDR Metadata to + * driver. This metadata is based on the composition and blending + * policies decided by user, taking into account the hardware and + * sink capabilities. The driver gets this metadata and creates a + * Dynamic Range and Mastering Infoframe (DRM) in case of HDMI, + * SDP packet (Non-audio INFOFRAME SDP v1.3) for DP. This is then + * sent to sink. This notifies the sink of the upcoming frame's Color + * Encoding and Luminance parameters. + * + * Userspace first need to detect the HDR capabilities of sink by + * reading and parsing the EDID. Details of HDR metadata for HDMI + * are added in CTA 861.G spec. For DP , its defined in VESA DP + * Standard v1.4. It needs to then get the metadata information + * of the video/game/app content which are encoded in HDR (basically + * using HDR transfer functions). With this information it needs to + * decide on a blending policy and compose the relevant + * layers/overlays into a common format. Once this blending is done, + * userspace will be aware of the metadata of the composed frame to + * be send to sink. It then uses this property to communicate this + * metadata to driver which then make a Infoframe packet and sends + * to sink based on the type of encoder connected. + * + * Userspace will be responsible to do Tone mapping operation in case: + * - Some layers are HDR and others are SDR + * - HDR layers luminance is not same as sink + * It will even need to do colorspace conversion and get all layers + * to one common colorspace for blending. It can use either GL, Media + * or display engine to get this done based on the capabilties of the + * associated hardware. + * + * Driver expects metadata to be put in &struct hdr_output_metadata + * structure from userspace. This is received as blob and stored in + * &drm_connector_state.hdr_output_metadata. It parses EDID and saves the + * sink metadata in &struct hdr_sink_metadata, as + * &drm_connector.hdr_sink_metadata. Driver uses + * drm_hdmi_infoframe_set_hdr_metadata() helper to set the HDR metadata, + * hdmi_drm_infoframe_pack() to pack the infoframe as per spec, in case of + * HDMI encoder. + * * max bpc: * This range property is used by userspace to limit the bit depth. 
When * used the driver would limit the bpc in accordance with the valid range diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h index 547656173c74..47e749b74e5f 100644 --- a/include/drm/drm_connector.h +++ b/include/drm/drm_connector.h @@ -1244,6 +1244,7 @@ struct drm_connector { */ struct llist_node free_node; + /** @hdr_sink_metadata: HDR Metadata Information read from sink */ struct hdr_sink_metadata hdr_sink_metadata; }; diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h index 4f88cc972407..759d462d028b 100644 --- a/include/drm/drm_mode_config.h +++ b/include/drm/drm_mode_config.h @@ -837,8 +837,8 @@ struct drm_mode_config { struct drm_property *writeback_out_fence_ptr_property; /** - * hdr_output_metadata_property: Connector property containing hdr - * metatda. This will be provided by userspace compositors based + * @hdr_output_metadata_property: Connector property containing hdr + * metatada. This will be provided by userspace compositors based * on HDR content */ struct drm_property *hdr_output_metadata_property; diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index ee55ba589cdc..9918a6c910c5 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -367,8 +367,19 @@ struct hdr_static_metadata { __u16 min_cll; }; +/** + * struct hdr_sink_metadata - HDR sink metadata + * + * Metadata Information read from Sink's EDID + */ struct hdr_sink_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ __u32 metadata_type; + /** + * @hdmi_type1: HDR Metadata Infoframe. + */ union { struct hdr_static_metadata hdmi_type1; }; @@ -398,6 +409,7 @@ union hdmi_vendor_any_infoframe { * @spd: spd infoframe * @vendor: union of all vendor infoframes * @audio: audio infoframe + * @drm: Dynamic Range and Mastering infoframe * * This is used by the generic pack function. This works since all infoframes * have the same header which also indicates which type of infoframe should be diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h index 19b5cf368cff..5ab331e5dc23 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -33,6 +33,15 @@ extern "C" { #endif +/** + * DOC: overview + * + * DRM exposes many UAPI and structure definition to have a consistent + * and standardized interface with user. + * Userspace can refer to these structure definitions and UAPI formats + * to communicate to driver + */ + #define DRM_CONNECTOR_NAME_LEN 32 #define DRM_DISPLAY_MODE_LEN 32 #define DRM_PROP_NAME_LEN 32 @@ -630,24 +639,87 @@ struct drm_color_lut { __u16 reserved; }; -/* HDR Metadata Infoframe as per 861.G spec */ +/** + * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data. + * + * HDR Metadata Infoframe as per CTA 861.G spec. This is expected + * to match exactly with the spec. + * + * Userspace is expected to pass the metadata information as per + * the format described in this structure. + */ struct hdr_metadata_infoframe { + /** + * @eotf: Electro-Optical Transfer Function (EOTF) + * used in the stream. + */ __u8 eotf; + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ __u8 metadata_type; + /** + * @display_primaries: Color Primaries of the Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @display_primaries.x: X cordinate of color primary. + * @display_primaries.y: Y cordinate of color primary. 
+ */ struct { __u16 x, y; } display_primaries[3]; + /** + * @white_point: White Point of Colorspace Data. + * These are coded as unsigned 16-bit values in units of + * 0.00002, where 0x0000 represents zero and 0xC350 + * represents 1.0000. + * @white_point.x: X cordinate of whitepoint of color primary. + * @white_point.y: Y cordinate of whitepoint of color primary. + */ struct { __u16 x, y; } white_point; + /** + * @max_display_mastering_luminance: Max Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ __u16 max_display_mastering_luminance; + /** + * @min_display_mastering_luminance: Min Mastering Display Luminance. + * This value is coded as an unsigned 16-bit value in units of + * 0.0001 cd/m2, where 0x0001 represents 0.0001 cd/m2 and 0xFFFF + * represents 6.5535 cd/m2. + */ __u16 min_display_mastering_luminance; + /** + * @max_cll: Max Content Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ __u16 max_cll; + /** + * @max_fall: Max Frame Average Light Level. + * This value is coded as an unsigned 16-bit value in units of 1 cd/m2, + * where 0x0001 represents 1 cd/m2 and 0xFFFF represents 65535 cd/m2. + */ __u16 max_fall; }; +/** + * struct hdr_output_metadata - HDR output metadata + * + * Metadata Information to be passed from userspace + */ struct hdr_output_metadata { + /** + * @metadata_type: Static_Metadata_Descriptor_ID. + */ __u32 metadata_type; + /** + * @hdmi_metadata_type1: HDR Metadata Infoframe. + */ union { struct hdr_metadata_infoframe hdmi_metadata_type1; }; -- cgit v1.2.3-59-g8ed1b From 1e390478cfb527e34c9ab89ba57212cb05c33c51 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 5 Jun 2019 10:46:05 +0200 Subject: gpu: host1x: Increase maximum DMA segment size Recent versions of the DMA API debug code have started to warn about violations of the maximum DMA segment size. This is because the segment size defaults to 64 KiB, which can easily be exceeded in large buffer allocations such as used in DRM/KMS for framebuffers. Technically the Tegra SMMU and ARM SMMU don't have a maximum segment size (they map individual pages irrespective of whether they are contiguous or not), so the choice of 4 MiB is a bit arbitrary here. The maximum segment size is a 32-bit unsigned integer, though, so we can't set it to the correct maximum size, which would be the size of the aperture. 
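The diff below applies a common driver-side pattern: back dev->dma_parms with storage owned by the driver, then raise the maximum segment size. As a standalone sketch (struct my_device and my_device_setup_dma() are hypothetical stand-ins for struct host1x_device and host1x_device_add()):

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/sizes.h>

struct my_device {
	struct device dev;
	struct device_dma_parameters dma_parms;
};

static void my_device_setup_dma(struct my_device *mdev)
{
	/* dma_set_max_seg_size() stores the limit in dev->dma_parms,
	 * so that pointer must be populated first or the call fails. */
	mdev->dev.dma_parms = &mdev->dma_parms;
	dma_set_max_seg_size(&mdev->dev, SZ_4M);
}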
Signed-off-by: Thierry Reding --- drivers/gpu/host1x/bus.c | 3 +++ include/linux/host1x.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index 103fffc1904b..c9a637d9417e 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -425,6 +425,9 @@ static int host1x_device_add(struct host1x *host1x, of_dma_configure(&device->dev, host1x->dev->of_node, true); + device->dev.dma_parms = &device->dma_parms; + dma_set_max_seg_size(&device->dev, SZ_4M); + err = host1x_device_parse_dt(device, driver); if (err < 0) { kfree(device); diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 89110d896d72..aef6e2f73802 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -310,6 +310,8 @@ struct host1x_device { struct list_head clients; bool registered; + + struct device_dma_parameters dma_parms; }; static inline struct host1x_device *to_host1x_device(struct device *dev) -- cgit v1.2.3-59-g8ed1b From 2076e5c0451ca943ff8ecc6def7239c84c77e070 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 6 May 2019 16:29:38 -0700 Subject: mm/hmm: update HMM documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the HMM documentation to reflect the latest API and make a few minor wording changes. Cc: John Hubbard Cc: Ira Weiny Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Cc: Andrew Morton Signed-off-by: Ralph Campbell Reviewed-by: Jérôme Glisse Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 141 +++++++++++++++++++++++++---------------------- include/linux/hmm.h | 7 ++- 2 files changed, 78 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 7cdf7282e022..7b6eeda5a7c0 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -10,7 +10,7 @@ of this being specialized struct page for such memory (see sections 5 to 7 of this document). HMM also provides optional helpers for SVM (Share Virtual Memory), i.e., -allowing a device to transparently access program address coherently with +allowing a device to transparently access program addresses coherently with the CPU meaning that any valid pointer on the CPU is also a valid pointer for the device. This is becoming mandatory to simplify the use of advanced heterogeneous computing where GPU, DSP, or FPGA are used to perform various @@ -22,8 +22,8 @@ expose the hardware limitations that are inherent to many platforms. The third section gives an overview of the HMM design. The fourth section explains how CPU page-table mirroring works and the purpose of HMM in this context. The fifth section deals with how device memory is represented inside the kernel. -Finally, the last section presents a new migration helper that allows lever- -aging the device DMA engine. +Finally, the last section presents a new migration helper that allows +leveraging the device DMA engine. .. contents:: :local: @@ -39,20 +39,20 @@ address space. I use shared address space to refer to the opposite situation: i.e., one in which any application memory region can be used by a device transparently. -Split address space happens because device can only access memory allocated -through device specific API. This implies that all memory objects in a program +Split address space happens because devices can only access memory allocated +through a device specific API. 
This implies that all memory objects in a program are not equal from the device point of view which complicates large programs that rely on a wide set of libraries. -Concretely this means that code that wants to leverage devices like GPUs needs -to copy object between generically allocated memory (malloc, mmap private, mmap +Concretely, this means that code that wants to leverage devices like GPUs needs +to copy objects between generically allocated memory (malloc, mmap private, mmap share) and memory allocated through the device driver API (this still ends up with an mmap but of the device file). For flat data sets (array, grid, image, ...) this isn't too hard to achieve but -complex data sets (list, tree, ...) are hard to get right. Duplicating a +for complex data sets (list, tree, ...) it's hard to get right. Duplicating a complex data set needs to re-map all the pointer relations between each of its -elements. This is error prone and program gets harder to debug because of the +elements. This is error prone and programs get harder to debug because of the duplicate data set and addresses. Split address space also means that libraries cannot transparently use data @@ -77,12 +77,12 @@ I/O bus, device memory characteristics I/O buses cripple shared address spaces due to a few limitations. Most I/O buses only allow basic memory access from device to main memory; even cache -coherency is often optional. Access to device memory from CPU is even more +coherency is often optional. Access to device memory from a CPU is even more limited. More often than not, it is not cache coherent. If we only consider the PCIE bus, then a device can access main memory (often through an IOMMU) and be cache coherent with the CPUs. However, it only allows -a limited set of atomic operations from device on main memory. This is worse +a limited set of atomic operations from the device on main memory. This is worse in the other direction: the CPU can only access a limited range of the device memory and cannot perform atomic operations on it. Thus device memory cannot be considered the same as regular memory from the kernel point of view. @@ -93,20 +93,20 @@ The final limitation is latency. Access to main memory from the device has an order of magnitude higher latency than when the device accesses its own memory. Some platforms are developing new I/O buses or additions/modifications to PCIE -to address some of these limitations (OpenCAPI, CCIX). They mainly allow two- -way cache coherency between CPU and device and allow all atomic operations the +to address some of these limitations (OpenCAPI, CCIX). They mainly allow +two-way cache coherency between CPU and device and allow all atomic operations the architecture supports. Sadly, not all platforms are following this trend and some major architectures are left without hardware solutions to these problems. So for shared address space to make sense, not only must we allow devices to access any memory but we must also permit any memory to be migrated to device -memory while device is using it (blocking CPU access while it happens). +memory while the device is using it (blocking CPU access while it happens). Shared address space and migration ================================== -HMM intends to provide two main features. First one is to share the address +HMM intends to provide two main features. 
The first one is to share the address space by duplicating the CPU page table in the device page table so the same address points to the same physical memory for any valid main memory address in the process address space. @@ -121,14 +121,14 @@ why HMM provides helpers to factor out everything that can be while leaving the hardware specific details to the device driver. The second mechanism HMM provides is a new kind of ZONE_DEVICE memory that -allows allocating a struct page for each page of the device memory. Those pages +allows allocating a struct page for each page of device memory. Those pages are special because the CPU cannot map them. However, they allow migrating main memory to device memory using existing migration mechanisms and everything -looks like a page is swapped out to disk from the CPU point of view. Using a -struct page gives the easiest and cleanest integration with existing mm mech- -anisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE +looks like a page that is swapped out to disk from the CPU point of view. Using a +struct page gives the easiest and cleanest integration with existing mm +mechanisms. Here again, HMM only provides helpers, first to hotplug new ZONE_DEVICE memory for the device memory and second to perform migration. Policy decisions -of what and when to migrate things is left to the device driver. +of what and when to migrate is left to the device driver. Note that any CPU access to a device page triggers a page fault and a migration back to main memory. For example, when a page backing a given CPU address A is @@ -136,8 +136,8 @@ migrated from a main memory page to a device page, then any CPU access to address A triggers a page fault and initiates a migration back to main memory. With these two features, HMM not only allows a device to mirror process address -space and keeping both CPU and device page table synchronized, but also lever- -ages device memory by migrating the part of the data set that is actively being +space and keeps both CPU and device page tables synchronized, but also +leverages device memory by migrating the part of the data set that is actively being used by the device. @@ -151,21 +151,28 @@ registration of an hmm_mirror struct:: int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); - int hmm_mirror_register_locked(struct hmm_mirror *mirror, - struct mm_struct *mm); - -The locked variant is to be used when the driver is already holding mmap_sem -of the mm in write mode. The mirror struct has a set of callbacks that are used +The mirror struct has a set of callbacks that are used to propagate CPU page tables:: struct hmm_mirror_ops { + /* release() - release hmm_mirror + * + * @mirror: pointer to struct hmm_mirror + * + * This is called when the mm_struct is being released. The callback + * must ensure that all access to any pages obtained from this mirror + * is halted before the callback returns. All future access should + * fault. + */ + void (*release)(struct hmm_mirror *mirror); + /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update_type: type of update that occurred to the CPU page table - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update + * @update: update information (see struct mmu_notifier_range) + * Return: -EAGAIN if update.blockable false and callback need to + * block, 0 otherwise. 
* * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -176,14 +183,12 @@ to propagate CPU page tables:: * page tables are completely updated (TLBs flushed, etc); this is a * synchronous call. */ - void (*update)(struct hmm_mirror *mirror, - enum hmm_update action, - unsigned long start, - unsigned long end); + int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, + const struct hmm_update *update); }; The device driver must perform the update action to the range (mark range -read only, or fully unmap, ...). The device must be done with the update before +read only, or fully unmap, etc.). The device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can @@ -194,17 +199,18 @@ use either:: The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. -The second one does trigger a page fault on missing or read-only entry if the -write parameter is true. Page faults use the generic mm page fault code path -just like a CPU page fault. +The second one does trigger a page fault on missing or read-only entries if +write access is requested (see below). Page faults use the generic mm page +fault code path just like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each entry in that array corresponds to an address in the virtual range. HMM provides a set of flags to help the driver identify special CPU page table entries. -Locking with the update() callback is the most important aspect the driver must -respect in order to keep things properly synchronized. The usage pattern is:: +Locking within the sync_cpu_device_pagetables() callback is the most important +aspect the driver must respect in order to keep things properly synchronized. +The usage pattern is:: int driver_populate_range(...) { @@ -239,11 +245,11 @@ respect in order to keep things properly synchronized. The usage pattern is:: hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); goto again; } - hmm_mirror_unregister(&range); + hmm_range_unregister(&range); return ret; } take_lock(driver->update); - if (!range.valid) { + if (!hmm_range_valid(&range)) { release_lock(driver->update); up_read(&mm->mmap_sem); goto again; @@ -251,15 +257,15 @@ respect in order to keep things properly synchronized. The usage pattern is:: // Use pfns array content to update device page table - hmm_mirror_unregister(&range); + hmm_range_unregister(&range); release_lock(driver->update); up_read(&mm->mmap_sem); return 0; } The driver->update lock is the same lock that the driver takes inside its -update() callback. That lock must be held before checking the range.valid -field to avoid any race with a concurrent CPU page table update. +sync_cpu_device_pagetables() callback. That lock must be held before calling +hmm_range_valid() to avoid any race with a concurrent CPU page table update. HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing @@ -279,46 +285,47 @@ concurrently). Leverage default_flags and pfn_flags_mask ========================================= -The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows -to set fault or snapshot policy for a whole range instead of having to set them -for each entries in the range. 
+The hmm_range struct has 2 fields, default_flags and pfn_flags_mask, that specify +fault or snapshot policy for the whole range instead of having to set them +for each entry in the pfns array. + +For instance, if the device flags for range.flags are:: -For instance if the device flags for device entries are: - VALID (1 << 63) - WRITE (1 << 62) + range.flags[HMM_PFN_VALID] = (1 << 63); + range.flags[HMM_PFN_WRITE] = (1 << 62); -Now let say that device driver wants to fault with at least read a range then -it does set:: +and the device driver wants pages for a range with at least read permission, +it sets:: range->default_flags = (1 << 63); range->pfn_flags_mask = 0; -and calls hmm_range_fault() as described above. This will fill fault all page +and calls hmm_range_fault() as described above. This will fill fault all pages in the range with at least read permission. -Now let say driver wants to do the same except for one page in the range for -which its want to have write. Now driver set:: +Now let's say the driver wants to do the same except for one page in the range for +which it wants to have write permission. Now driver set:: range->default_flags = (1 << 63); range->pfn_flags_mask = (1 << 62); range->pfns[index_of_write] = (1 << 62); -With this HMM will fault in all page with at least read (ie valid) and for the +With this, HMM will fault in all pages with at least read (i.e., valid) and for the address == range->start + (index_of_write << PAGE_SHIFT) it will fault with -write permission ie if the CPU pte does not have write permission set then HMM +write permission i.e., if the CPU pte does not have write permission set then HMM will call handle_mm_fault(). -Note that HMM will populate the pfns array with write permission for any entry -that have write permission within the CPU pte no matter what are the values set +Note that HMM will populate the pfns array with write permission for any page +that is mapped with CPU write permission no matter what values are set in default_flags or pfn_flags_mask. Represent and manage device memory from core kernel point of view ================================================================= -Several different designs were tried to support device memory. First one used -a device specific data structure to keep information about migrated memory and -HMM hooked itself in various places of mm code to handle any access to +Several different designs were tried to support device memory. The first one +used a device specific data structure to keep information about migrated memory +and HMM hooked itself in various places of mm code to handle any access to addresses that were backed by device memory. It turns out that this ended up replicating most of the fields of struct page and also needed many kernel code paths to be updated to understand this new kind of memory. @@ -341,7 +348,7 @@ The hmm_devmem_ops is where most of the important things are:: struct hmm_devmem_ops { void (*free)(struct hmm_devmem *devmem, struct page *page); - int (*fault)(struct hmm_devmem *devmem, + vm_fault_t (*fault)(struct hmm_devmem *devmem, struct vm_area_struct *vma, unsigned long addr, struct page *page, @@ -417,9 +424,9 @@ willing to pay to keep all the code simpler. 
Memory cgroup (memcg) and rss accounting ======================================== -For now device memory is accounted as any regular page in rss counters (either +For now, device memory is accounted as any regular page in rss counters (either anonymous if device page is used for anonymous, file if device page is used for -file backed page or shmem if device page is used for shared memory). This is a +file backed page, or shmem if device page is used for shared memory). This is a deliberate choice to keep existing applications, that might start using device memory without knowing about it, running unimpacted. @@ -439,6 +446,6 @@ get more experience in how device memory is used and its impact on memory resource control. -Note that device memory can never be pinned by device driver nor through GUP +Note that device memory can never be pinned by a device driver nor through GUP and thus such memory is always free upon process exit. Or when last reference is dropped in case of shared memory or file backed memory. diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 044a36d7c3f8..740bb00853f5 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -418,9 +418,10 @@ struct hmm_mirror_ops { * * @mirror: pointer to struct hmm_mirror * - * This is called when the mm_struct is being released. - * The callback should make sure no references to the mirror occur - * after the callback returns. + * This is called when the mm_struct is being released. The callback + * must ensure that all access to any pages obtained from this mirror + * is halted before the callback returns. All future access should + * fault. */ void (*release)(struct hmm_mirror *mirror); -- cgit v1.2.3-59-g8ed1b From 085ea25064a9169eba5f2ed6484c111ab0f3ee79 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 6 May 2019 16:29:39 -0700 Subject: mm/hmm: clean up some coding style and comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no functional changes, just some coding style clean ups and minor comment changes. Cc: John Hubbard Cc: Ira Weiny Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Cc: Andrew Morton Signed-off-by: Ralph Campbell Reviewed-by: Jérôme Glisse Signed-off-by: Jason Gunthorpe --- include/linux/hmm.h | 71 +++++++++++++++++++++++++++-------------------------- mm/hmm.c | 62 ++++++++++++++++++++++++---------------------- 2 files changed, 68 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 740bb00853f5..7007123842ba 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -21,8 +21,8 @@ * * HMM address space mirroring API: * - * Use HMM address space mirroring if you want to mirror range of the CPU page - * table of a process into a device page table. Here, "mirror" means "keep + * Use HMM address space mirroring if you want to mirror a range of the CPU + * page tables of a process into a device page table. Here, "mirror" means "keep * synchronized". Prerequisites: the device must provide the ability to write- * protect its page tables (at PAGE_SIZE granularity), and must be able to * recover from the resulting potential page faults. 
@@ -105,10 +105,11 @@ struct hmm { * HMM_PFN_WRITE: CPU page table has write permission set * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) * - * The driver provide a flags array, if driver valid bit for an entry is bit - * 3 ie (entry & (1 << 3)) is true if entry is valid then driver must provide + * The driver provides a flags array for mapping page protections to device + * PTE bits. If the driver valid bit for an entry is bit 3, + * i.e., (entry & (1 << 3)), then the driver must provide * an array in hmm_range.flags with hmm_range.flags[HMM_PFN_VALID] == 1 << 3. - * Same logic apply to all flags. This is same idea as vm_page_prot in vma + * Same logic apply to all flags. This is the same idea as vm_page_prot in vma * except that this is per device driver rather than per architecture. */ enum hmm_pfn_flag_e { @@ -129,13 +130,13 @@ enum hmm_pfn_flag_e { * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. * - * Driver provide entry value for none entry, error entry and special entry, - * driver can alias (ie use same value for error and special for instance). It - * should not alias none and error or special. + * Driver provides values for none entry, error entry, and special entry. + * Driver can alias (i.e., use same value) error and special, but + * it should not alias none with error or special. * * HMM pfn value returned by hmm_vma_get_pfns() or hmm_vma_fault() will be: * hmm_range.values[HMM_PFN_ERROR] if CPU page table entry is poisonous, - * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table + * hmm_range.values[HMM_PFN_NONE] if there is no CPU page table entry, * hmm_range.values[HMM_PFN_SPECIAL] if CPU page table entry is a special one */ enum hmm_pfn_value_e { @@ -158,6 +159,7 @@ enum hmm_pfn_value_e { * @values: pfn value for some special case (none, special, error, ...) * @default_flags: default flags for the range (write, read, ... see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter + * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT) * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -180,7 +182,7 @@ struct hmm_range { /* * hmm_range_page_shift() - return the page shift for the range * @range: range being queried - * Returns: page shift (page size = 1 << page shift) for the range + * Return: page shift (page size = 1 << page shift) for the range */ static inline unsigned hmm_range_page_shift(const struct hmm_range *range) { @@ -190,7 +192,7 @@ static inline unsigned hmm_range_page_shift(const struct hmm_range *range) /* * hmm_range_page_size() - return the page size for the range * @range: range being queried - * Returns: page size for the range in bytes + * Return: page size for the range in bytes */ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) { @@ -201,7 +203,7 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on * @timeout: time out for wait in ms (ie abort wait after that period of time) - * Returns: true if the range is valid, false otherwise. + * Return: true if the range is valid, false otherwise. 
*/ static inline bool hmm_range_wait_until_valid(struct hmm_range *range, unsigned long timeout) @@ -222,7 +224,7 @@ static inline bool hmm_range_wait_until_valid(struct hmm_range *range, /* * hmm_range_valid() - test if a range is valid or not * @range: range - * Returns: true if the range is valid, false otherwise. + * Return: true if the range is valid, false otherwise. */ static inline bool hmm_range_valid(struct hmm_range *range) { @@ -233,7 +235,7 @@ static inline bool hmm_range_valid(struct hmm_range *range) * hmm_device_entry_to_page() - return struct page pointed to by a device entry * @range: range use to decode device entry value * @entry: device entry value to get corresponding struct page from - * Returns: struct page pointer if entry is a valid, NULL otherwise + * Return: struct page pointer if entry is a valid, NULL otherwise * * If the device entry is valid (ie valid flag set) then return the struct page * matching the entry value. Otherwise return NULL. @@ -256,7 +258,7 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang * hmm_device_entry_to_pfn() - return pfn value store in a device entry * @range: range use to decode device entry value * @entry: device entry to extract pfn from - * Returns: pfn value if device entry is valid, -1UL otherwise + * Return: pfn value if device entry is valid, -1UL otherwise */ static inline unsigned long hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) @@ -276,7 +278,7 @@ hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) * hmm_device_entry_from_page() - create a valid device entry for a page * @range: range use to encode HMM pfn value * @page: page for which to create the device entry - * Returns: valid device entry for the page + * Return: valid device entry for the page */ static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, struct page *page) @@ -289,7 +291,7 @@ static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, * hmm_device_entry_from_pfn() - create a valid device entry value from pfn * @range: range use to encode HMM pfn value * @pfn: pfn value for which to create the device entry - * Returns: valid device entry for the pfn + * Return: valid device entry for the pfn */ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, unsigned long pfn) @@ -394,7 +396,7 @@ enum hmm_update_event { }; /* - * struct hmm_update - HMM update informations for callback + * struct hmm_update - HMM update information for callback * * @start: virtual start address of the range to update * @end: virtual end address of the range to update @@ -428,8 +430,8 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update: update informations (see struct hmm_update) - * Returns: -EAGAIN if update.blockable false and callback need to + * @update: update information (see struct hmm_update) + * Return: -EAGAIN if update.blockable false and callback need to * block, 0 otherwise. 
* * This callback ultimately originates from mmu_notifiers when the CPU @@ -468,13 +470,13 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* * hmm_mirror_mm_is_alive() - test if mm is still alive * @mirror: the HMM mm mirror for which we want to lock the mmap_sem - * Returns: false if the mm is dead, true otherwise + * Return: false if the mm is dead, true otherwise * - * This is an optimization it will not accurately always return -EINVAL if the - * mm is dead ie there can be false negative (process is being kill but HMM is - * not yet inform of that). It is only intented to be use to optimize out case - * where driver is about to do something time consuming and it would be better - * to skip it if the mm is dead. + * This is an optimization, it will not always accurately return false if the + * mm is dead; i.e., there can be false negatives (process is being killed but + * HMM is not yet informed of that). It is only intended to be used to optimize + * out cases where the driver is about to do something time consuming and it + * would be better to skip it if the mm is dead. */ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) { @@ -489,7 +491,6 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) return true; } - /* * Please see Documentation/vm/hmm.rst for how to use the range API. */ @@ -562,7 +563,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) ret = hmm_range_fault(range, block); if (ret <= 0) { if (ret == -EBUSY || !ret) { - /* Same as above drop mmap_sem to match old API. */ + /* Same as above, drop mmap_sem to match old API. */ up_read(&range->vma->vm_mm->mmap_sem); ret = -EBUSY; } else if (ret == -EAGAIN) @@ -629,7 +630,7 @@ struct hmm_devmem_ops { * @page: pointer to struct page backing virtual address (unreliable) * @flags: FAULT_FLAG_* (see include/linux/mm.h) * @pmdp: page middle directory - * Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR + * Return: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR * on error * * The callback occurs whenever there is a CPU page fault or GUP on a @@ -637,14 +638,14 @@ struct hmm_devmem_ops { * page back to regular memory (CPU accessible). * * The device driver is free to migrate more than one page from the - * fault() callback as an optimization. However if device decide to - * migrate more than one page it must always priotirize the faulting + * fault() callback as an optimization. However if the device decides + * to migrate more than one page it must always priotirize the faulting * address over the others. * - * The struct page pointer is only given as an hint to allow quick + * The struct page pointer is only given as a hint to allow quick * lookup of internal device driver data. A concurrent migration - * might have already free that page and the virtual address might - * not longer be back by it. So it should not be modified by the + * might have already freed that page and the virtual address might + * no longer be backed by it. So it should not be modified by the * callback. * * Note that mmap semaphore is held in read mode at least when this @@ -671,7 +672,7 @@ struct hmm_devmem_ops { * @ref: per CPU refcount * @page_fault: callback when CPU fault on an unaddressable device page * - * This an helper structure for device drivers that do not wish to implement + * This is a helper structure for device drivers that do not wish to implement * the gory details related to hotplugging new memoy and allocating struct * pages. 
* diff --git a/mm/hmm.c b/mm/hmm.c index c62ae414a3a2..4db5dcf110ba 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -153,9 +153,8 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) /* Wake-up everyone waiting on any range. */ mutex_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) { + list_for_each_entry(range, &hmm->ranges, list) range->valid = false; - } wake_up_all(&hmm->wq); mutex_unlock(&hmm->lock); @@ -166,9 +165,10 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) list_del_init(&mirror->list); if (mirror->ops->release) { /* - * Drop mirrors_sem so callback can wait on any pending - * work that might itself trigger mmu_notifier callback - * and thus would deadlock with us. + * Drop mirrors_sem so the release callback can wait + * on any pending work that might itself trigger a + * mmu_notifier callback and thus would deadlock with + * us. */ up_write(&hmm->mirrors_sem); mirror->ops->release(mirror); @@ -223,11 +223,8 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, int ret; ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); - if (!update.blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); - ret = -EAGAIN; - goto out; - } + if (!update.blockable && ret == -EAGAIN) + break; } up_read(&hmm->mirrors_sem); @@ -271,6 +268,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { * * @mirror: new mirror struct to register * @mm: mm to register against + * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments * * To start mirroring a process address space, the device driver must register * an HMM mirror struct. @@ -298,7 +296,7 @@ EXPORT_SYMBOL(hmm_mirror_register); /* * hmm_mirror_unregister() - unregister a mirror * - * @mirror: new mirror struct to register + * @mirror: mirror struct to unregister * * Stop mirroring a process address space, and cleanup. */ @@ -372,7 +370,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EBUSY after page fault, or page fault error + * Return: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -911,6 +909,7 @@ int hmm_range_register(struct hmm_range *range, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); + struct hmm *hmm; range->valid = false; range->hmm = NULL; @@ -924,28 +923,29 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - range->hmm = hmm_get_or_create(mm); - if (!range->hmm) + hmm = hmm_get_or_create(mm); + if (!hmm) return -EFAULT; /* Check if hmm_mm_destroy() was call. */ - if (range->hmm->mm == NULL || range->hmm->dead) { - hmm_put(range->hmm); + if (hmm->mm == NULL || hmm->dead) { + hmm_put(hmm); return -EFAULT; } - /* Initialize range to track CPU page table update */ - mutex_lock(&range->hmm->lock); + /* Initialize range to track CPU page table updates. */ + mutex_lock(&hmm->lock); - list_add_rcu(&range->list, &range->hmm->ranges); + range->hmm = hmm; + list_add_rcu(&range->list, &hmm->ranges); /* * If there are any concurrent notifiers we have to wait for them for * the range to be valid (see hmm_range_wait_until_valid()). 
*/ - if (!range->hmm->notifiers) + if (!hmm->notifiers) range->valid = true; - mutex_unlock(&range->hmm->lock); + mutex_unlock(&hmm->lock); return 0; } @@ -960,17 +960,19 @@ EXPORT_SYMBOL(hmm_range_register); */ void hmm_range_unregister(struct hmm_range *range) { + struct hmm *hmm = range->hmm; + /* Sanity check this really should not happen. */ - if (range->hmm == NULL || range->end <= range->start) + if (hmm == NULL || range->end <= range->start) return; - mutex_lock(&range->hmm->lock); + mutex_lock(&hmm->lock); list_del_rcu(&range->list); - mutex_unlock(&range->hmm->lock); + mutex_unlock(&hmm->lock); /* Drop reference taken by hmm_range_register() */ range->valid = false; - hmm_put(range->hmm); + hmm_put(hmm); range->hmm = NULL; } EXPORT_SYMBOL(hmm_range_unregister); @@ -978,7 +980,7 @@ EXPORT_SYMBOL(hmm_range_unregister); /* * hmm_range_snapshot() - snapshot CPU page table for a range * @range: range - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid * permission (for instance asking for write and range is read only), * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid * vma or it is illegal to access that range), number of valid pages @@ -1061,7 +1063,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of valid pages in range->pfns[] (from range start + * Return: number of valid pages in range->pfns[] (from range start * address). This may be zero. If the return value is negative, * then one of the following values may be returned: * @@ -1179,7 +1181,7 @@ EXPORT_SYMBOL(hmm_range_fault); * @device: device against to dma map page to * @daddrs: dma address of mapped pages * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been * drop and you need to try again, some other error value otherwise * * Note same usage pattern as hmm_range_fault(). 
@@ -1267,7 +1269,7 @@ EXPORT_SYMBOL(hmm_range_dma_map); * @device: device against which dma map was done * @daddrs: dma address of mapped pages * @dirty: dirty page if it had the write flag set - * Returns: number of page unmapped on success, -EINVAL otherwise + * Return: number of page unmapped on success, -EINVAL otherwise * * Note that caller MUST abide by mmu notifier or use HMM mirror and abide * to the sync_cpu_device_pagetables() callback so that it is safe here to @@ -1390,7 +1392,7 @@ static void hmm_devmem_free(struct page *page, void *data) * @ops: memory event device driver callback (see struct hmm_devmem_ops) * @device: device struct to bind the resource too * @size: size in bytes of the device memory to add - * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise + * Return: pointer to new hmm_devmem struct ERR_PTR otherwise * * This function first finds an empty range of physical address big enough to * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which -- cgit v1.2.3-59-g8ed1b From 6d7c3cde93c1d9ac0b37f78ec3f2ff052159a242 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 22 May 2019 16:52:52 -0300 Subject: mm/hmm: fix use after free with struct hmm in the mmu notifiers mmu_notifier_unregister_no_release() is not a fence and the mmu_notifier system will continue to reference hmm->mn until the srcu grace period expires, resulting in use-after-free races like this:

     CPU0                                   CPU1
                                       __mmu_notifier_invalidate_range_start()
                                         srcu_read_lock
                                           hlist_for_each ()
                                             // mn == hmm->mn
hmm_mirror_unregister()
  hmm_put()
    hmm_free()
      mmu_notifier_unregister_no_release()
        hlist_del_init_rcu(hmm-mn->list)
                                             mn->ops->invalidate_range_start(mn, range);
                                               mm_get_hmm()
                                                 mm->hmm = NULL;
                                                 kfree(hmm)
                                                 mutex_lock(&hmm->lock);

Use SRCU to kfree the hmm memory so that the notifiers can rely on hmm existing. Get the now-safe hmm struct through container_of and directly check kref_get_unless_zero to lock it against free.
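Reduced to a standalone sketch, the pattern the fix uses looks like this; struct foo, foo_free() and foo_notifier_release() are hypothetical stand-ins for struct hmm and its notifier ops:

#include <linux/kref.h>
#include <linux/mmu_notifier.h>
#include <linux/slab.h>

struct foo {
	struct kref kref;
	struct mmu_notifier mmu_notifier;
};

static void foo_free(struct kref *kref)
{
	/* In the actual patch this kfree() is deferred via
	 * mmu_notifier_call_srcu() so notifier callbacks may keep
	 * dereferencing the object until the SRCU grace period ends. */
	kfree(container_of(kref, struct foo, kref));
}

static void foo_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* Recover the owner directly from the notifier instead of
	 * through a back pointer that may already be cleared ... */
	struct foo *foo = container_of(mn, struct foo, mmu_notifier);

	/* ... and bail out if the object is already being freed. */
	if (!kref_get_unless_zero(&foo->kref))
		return;

	/* ... safe to use foo here ... */

	kref_put(&foo->kref, foo_free);
}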
Signed-off-by: Jason Gunthorpe Reviewed-by: Ira Weiny Reviewed-by: John Hubbard Reviewed-by: Ralph Campbell Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 1 + mm/hmm.c | 23 +++++++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7007123842ba..cb01cf1fa3c0 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -93,6 +93,7 @@ struct hmm { struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; wait_queue_head_t wq; + struct rcu_head rcu; long notifiers; bool dead; }; diff --git a/mm/hmm.c b/mm/hmm.c index 826816ab2377..f6956d78e3cb 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -104,6 +104,11 @@ error: return NULL; } +static void hmm_free_rcu(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct hmm, rcu)); +} + static void hmm_free(struct kref *kref) { struct hmm *hmm = container_of(kref, struct hmm, kref); @@ -116,7 +121,7 @@ static void hmm_free(struct kref *kref) mm->hmm = NULL; spin_unlock(&mm->page_table_lock); - kfree(hmm); + mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); } static inline void hmm_put(struct hmm *hmm) @@ -144,10 +149,14 @@ void hmm_mm_destroy(struct mm_struct *mm) static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct hmm *hmm = mm_get_hmm(mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_range *range; + /* Bail out if hmm is in the process of being freed */ + if (!kref_get_unless_zero(&hmm->kref)) + return; + /* Report this HMM as dying. */ hmm->dead = true; @@ -185,13 +194,14 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_update update; struct hmm_range *range; int ret = 0; - VM_BUG_ON(!hmm); + if (!kref_get_unless_zero(&hmm->kref)) + return 0; update.start = nrange->start; update.end = nrange->end; @@ -236,9 +246,10 @@ out: static void hmm_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - VM_BUG_ON(!hmm); + if (!kref_get_unless_zero(&hmm->kref)) + return; mutex_lock(&hmm->lock); hmm->notifiers--; -- cgit v1.2.3-59-g8ed1b From f652e66fcca07e59f207bcca27c5566193feabd5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 9 Jun 2019 23:43:13 +0900 Subject: pinctrl: add include guard to pinctrl-state.h Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinctrl-state.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinctrl-state.h b/include/linux/pinctrl/pinctrl-state.h index a0e785815a64..635d97e9285e 100644 --- a/include/linux/pinctrl/pinctrl-state.h +++ b/include/linux/pinctrl/pinctrl-state.h @@ -3,6 +3,9 @@ * Standard pin control state definitions */ +#ifndef __LINUX_PINCTRL_PINCTRL_STATE_H +#define __LINUX_PINCTRL_PINCTRL_STATE_H + /** * @PINCTRL_STATE_DEFAULT: the state the pinctrl handle shall be put * into as default, usually this means the pins are up and ready to @@ -31,3 +34,5 @@ #define PINCTRL_STATE_INIT "init" #define PINCTRL_STATE_IDLE "idle" #define PINCTRL_STATE_SLEEP "sleep" + +#endif /* 
__LINUX_PINCTRL_PINCTRL_STATE_H */ -- cgit v1.2.3-59-g8ed1b From e63d79d1ffcd2201a2dbff1d7a1184b8f3ec74cf Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Tue, 4 Jun 2019 15:29:22 +0200 Subject: dmaengine: Add Synopsys eDMA IP core driver Add the Synopsys PCIe Endpoint eDMA IP core driver to the kernel. This IP is generally distributed with the Synopsys PCIe Endpoint IP (depending on the use case and licensing agreement). This core driver initializes and configures the eDMA IP using vma-helper functions and the dma-engine subsystem. The driver can be compiled as a built-in or as an external module. To enable it, select the DW_EDMA option in the kernel configuration; it automatically selects the DMA_ENGINE and DMA_VIRTUAL_CHANNELS options it requires. In order to transfer data from point A to B as fast as possible this IP requires a dedicated memory space containing a linked list of elements. All elements of this linked list are contiguous and each one describes a data transfer (source and destination addresses, length and a control variable). For the sake of simplicity, let's assume a memory space for write channel 0 which holds about 42 elements.

+---------+
| Desc #0 |-+
+---------+ |
            V
+----------+
| Chunk #0 |-+
| CB = 1   | |
+----------+ |  +----------+  +-----+  +-----------+  +-----+
             +->| Burst #0 |->| ... |->| Burst #41 |->| llp |
             |  +----------+  +-----+  +-----------+  +-----+
             V
+----------+
| Chunk #1 |-+
| CB = 0   | |
+----------+ |  +-----------+  +-----+  +-----------+  +-----+
             +->| Burst #42 |->| ... |->| Burst #83 |->| llp |
             |  +-----------+  +-----+  +-----------+  +-----+
             V
+----------+
| Chunk #2 |-+
| CB = 1   | |
+----------+ |  +-----------+  +-----+  +------------+  +-----+
             +->| Burst #84 |->| ... |->| Burst #125 |->| llp |
             |  +-----------+  +-----+  +------------+  +-----+
             V
+----------+
| Chunk #3 |-+
| CB = 0   | |
+----------+ |  +------------+  +-----+  +------------+  +-----+
             +->| Burst #126 |->| ... |->| Burst #129 |->| llp |
                +------------+  +-----+  +------------+  +-----+

Legend:
- Linked list, also known as a Chunk.
- Linked list element, also known as a Burst.
- CB, also known as the Change Bit, is a control bit (typically toggled) that allows the current linked list to be easily identified and differentiated from the previous or the next one.
- LLP is a special element that indicates the end of the linked list element stream and also informs that the next CB should be toggled.

On the last Burst of each Chunk (Burst #41, Burst #83, Burst #125, and Burst #129) some flags are set in its control variable (the RIE and LIE bits) that trigger a "done" interrupt. In the interrupt callback it is decided whether to recycle the linked list memory space by writing a new set of Burst elements (if there are still Chunks to transfer) or to consider the transfer completed (if there are no Chunks left to transfer). In scatter-gather transfer mode, the client submits a scatter-gather list of n (in this case 130) elements, which is divided into multiple Chunks; each Chunk has a limited number of Bursts (in this case 42). After all Bursts of a Chunk are transferred, an interrupt is triggered, which allows the dedicated linked list memory to be recycled with the information for the next Chunk and its associated Bursts, and the whole cycle repeats. In cyclic transfer mode, the client submits a buffer pointer, its length and a number of repetitions; in this case each Burst corresponds directly to one repetition.
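For orientation, this is roughly how a dmaengine client would drive a scatter-gather transfer like the one described above, using the generic dmaengine API. The channel name "tx", the already-prepared sgl/nents, and the omitted dmaengine_slave_config() step are assumptions for the sketch, not part of this patch; the eDMA core splits the resulting sg list into Chunks and Bursts internally:

#include <linux/dmaengine.h>
#include <linux/scatterlist.h>

static int submit_sg(struct device *dev, struct scatterlist *sgl, int nents)
{
	struct dma_async_tx_descriptor *desc;
	struct dma_chan *chan;
	dma_cookie_t cookie;

	/* Ask the dmaengine framework for the channel named "tx". */
	chan = dma_request_chan(dev, "tx");
	if (IS_ERR(chan))
		return PTR_ERR(chan);

	/* Map the sg list onto a hardware descriptor chain. */
	desc = dmaengine_prep_slave_sg(chan, sgl, nents, DMA_MEM_TO_DEV,
				       DMA_PREP_INTERRUPT);
	if (!desc) {
		dma_release_channel(chan);
		return -EIO;
	}

	/* Queue the descriptor and kick off the transfer. */
	cookie = dmaengine_submit(desc);
	dma_async_issue_pending(chan);
	return dma_submit_error(cookie);
}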
Each Burst describes a data transfer from point A (source) to point B
(destination) with a length that can range from 1 byte up to 4 GB.
Since the dedicated memory space where the linked list resides is
limited, the n Burst elements are organized into several Chunks, which
are used later to recycle the dedicated memory space and initiate a new
sequence of data transfers. The whole transfer is considered complete
once all Bursts have been transferred.

Currently this IP has a well-known register map, which includes support
for legacy and unroll modes. Legacy mode is the version of this
register map that has a multiplexer register for switching between the
registers of all write and read channels, while unroll mode repeats the
registers of all write and read channels with an offset between them.
This register map is called v0.

The IP team is creating a new register map better suited to the latest
PCIe features, which will very likely change the register map; that
version will be called v1. As soon as this new version is released by
the IP team, support for it will be added to this driver.

Logically, patches 1, 2 and 3 should be squashed into a single patch,
but they were split into these 3 patch files for ease of review.

Signed-off-by: Gustavo Pimentel
Cc: Vinod Koul
Cc: Dan Williams
Cc: Andy Shevchenko
Cc: Russell King
Cc: Joao Pinto
Signed-off-by: Vinod Koul
---
 drivers/dma/Kconfig                |   2 +
 drivers/dma/Makefile               |   1 +
 drivers/dma/dw-edma/Kconfig        |   9 +
 drivers/dma/dw-edma/Makefile       |   4 +
 drivers/dma/dw-edma/dw-edma-core.c | 936 +++++++++++++++++++++++++++++++
 drivers/dma/dw-edma/dw-edma-core.h | 165 ++++++
 include/linux/dma/edma.h           |  47 ++
 7 files changed, 1164 insertions(+)
 create mode 100644 drivers/dma/dw-edma/Kconfig
 create mode 100644 drivers/dma/dw-edma/Makefile
 create mode 100644 drivers/dma/dw-edma/dw-edma-core.c
 create mode 100644 drivers/dma/dw-edma/dw-edma-core.h
 create mode 100644 include/linux/dma/edma.h

(limited to 'include/linux')

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index eaf78f4e07ce..76859aa2688c 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -665,6 +665,8 @@ source "drivers/dma/qcom/Kconfig"

 source "drivers/dma/dw/Kconfig"

+source "drivers/dma/dw-edma/Kconfig"
+
 source "drivers/dma/hsu/Kconfig"

 source "drivers/dma/sh/Kconfig"
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 6126e1c3a875..5bddf6f8790f 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_DMA_SUN4I) += sun4i-dma.o
 obj-$(CONFIG_DMA_SUN6I) += sun6i-dma.o
 obj-$(CONFIG_DW_AXI_DMAC) += dw-axi-dmac/
 obj-$(CONFIG_DW_DMAC_CORE) += dw/
+obj-$(CONFIG_DW_EDMA) += dw-edma/
 obj-$(CONFIG_EP93XX_DMA) += ep93xx_dma.o
 obj-$(CONFIG_FSL_DMA) += fsldma.o
 obj-$(CONFIG_FSL_EDMA) += fsl-edma.o fsl-edma-common.o
diff --git a/drivers/dma/dw-edma/Kconfig b/drivers/dma/dw-edma/Kconfig
new file mode 100644
index 000000000000..3016bed63589
--- /dev/null
+++ b/drivers/dma/dw-edma/Kconfig
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+
+config DW_EDMA
+	tristate "Synopsys DesignWare eDMA controller driver"
+	select DMA_ENGINE
+	select DMA_VIRTUAL_CHANNELS
+	help
+	  Support the Synopsys DesignWare eDMA controller, normally
+	  implemented on endpoints SoCs.
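For context between the files: the chip-level API that a glue driver
uses to hand an eDMA instance to this core is declared further below in
include/linux/dma/edma.h. A minimal sketch of such glue (hypothetical
code, assuming only the dw_edma_chip structure and dw_edma_probe() from
this patch; the real PCIe glue driver is a separate patch):

/* Hypothetical glue: register one eDMA instance with the core. */
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/dma/edma.h>

static int example_edma_attach(struct device *dev, int irq)
{
	struct dw_edma_chip *chip;

	chip = devm_kzalloc(dev, sizeof(*chip), GFP_KERNEL);
	if (!chip)
		return -ENOMEM;

	chip->dev = dev;	/* device providing the eDMA registers */
	chip->id = 0;		/* instance ID */
	chip->irq = irq;	/* base IRQ line */
	/* chip->dw (struct dw_edma) must be set up by the glue with the
	 * register, linked-list and data regions plus nr_irqs before
	 * probing; that wiring is omitted here. */

	return dw_edma_probe(chip);
}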
diff --git a/drivers/dma/dw-edma/Makefile b/drivers/dma/dw-edma/Makefile new file mode 100644 index 000000000000..322401089891 --- /dev/null +++ b/drivers/dma/dw-edma/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_DW_EDMA) += dw-edma.o +dw-edma-objs := dw-edma-core.o diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c new file mode 100644 index 000000000000..c9d032f49dc3 --- /dev/null +++ b/drivers/dma/dw-edma/dw-edma-core.c @@ -0,0 +1,936 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates. + * Synopsys DesignWare eDMA core driver + * + * Author: Gustavo Pimentel + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dw-edma-core.h" +#include "../dmaengine.h" +#include "../virt-dma.h" + +static inline +struct device *dchan2dev(struct dma_chan *dchan) +{ + return &dchan->dev->device; +} + +static inline +struct device *chan2dev(struct dw_edma_chan *chan) +{ + return &chan->vc.chan.dev->device; +} + +static inline +struct dw_edma_desc *vd2dw_edma_desc(struct virt_dma_desc *vd) +{ + return container_of(vd, struct dw_edma_desc, vd); +} + +static struct dw_edma_burst *dw_edma_alloc_burst(struct dw_edma_chunk *chunk) +{ + struct dw_edma_burst *burst; + + burst = kzalloc(sizeof(*burst), GFP_NOWAIT); + if (unlikely(!burst)) + return NULL; + + INIT_LIST_HEAD(&burst->list); + if (chunk->burst) { + /* Create and add new element into the linked list */ + chunk->bursts_alloc++; + list_add_tail(&burst->list, &chunk->burst->list); + } else { + /* List head */ + chunk->bursts_alloc = 0; + chunk->burst = burst; + } + + return burst; +} + +static struct dw_edma_chunk *dw_edma_alloc_chunk(struct dw_edma_desc *desc) +{ + struct dw_edma_chan *chan = desc->chan; + struct dw_edma *dw = chan->chip->dw; + struct dw_edma_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_NOWAIT); + if (unlikely(!chunk)) + return NULL; + + INIT_LIST_HEAD(&chunk->list); + chunk->chan = chan; + /* Toggling change bit (CB) in each chunk, this is a mechanism to + * inform the eDMA HW block that this is a new linked list ready + * to be consumed. 
+ * - Odd chunks originate CB equal to 0 + * - Even chunks originate CB equal to 1 + */ + chunk->cb = !(desc->chunks_alloc % 2); + chunk->ll_region.paddr = dw->ll_region.paddr + chan->ll_off; + chunk->ll_region.vaddr = dw->ll_region.vaddr + chan->ll_off; + + if (desc->chunk) { + /* Create and add new element into the linked list */ + desc->chunks_alloc++; + list_add_tail(&chunk->list, &desc->chunk->list); + if (!dw_edma_alloc_burst(chunk)) { + kfree(chunk); + return NULL; + } + } else { + /* List head */ + chunk->burst = NULL; + desc->chunks_alloc = 0; + desc->chunk = chunk; + } + + return chunk; +} + +static struct dw_edma_desc *dw_edma_alloc_desc(struct dw_edma_chan *chan) +{ + struct dw_edma_desc *desc; + + desc = kzalloc(sizeof(*desc), GFP_NOWAIT); + if (unlikely(!desc)) + return NULL; + + desc->chan = chan; + if (!dw_edma_alloc_chunk(desc)) { + kfree(desc); + return NULL; + } + + return desc; +} + +static void dw_edma_free_burst(struct dw_edma_chunk *chunk) +{ + struct dw_edma_burst *child, *_next; + + /* Remove all the list elements */ + list_for_each_entry_safe(child, _next, &chunk->burst->list, list) { + list_del(&child->list); + kfree(child); + chunk->bursts_alloc--; + } + + /* Remove the list head */ + kfree(child); + chunk->burst = NULL; +} + +static void dw_edma_free_chunk(struct dw_edma_desc *desc) +{ + struct dw_edma_chunk *child, *_next; + + if (!desc->chunk) + return; + + /* Remove all the list elements */ + list_for_each_entry_safe(child, _next, &desc->chunk->list, list) { + dw_edma_free_burst(child); + list_del(&child->list); + kfree(child); + desc->chunks_alloc--; + } + + /* Remove the list head */ + kfree(child); + desc->chunk = NULL; +} + +static void dw_edma_free_desc(struct dw_edma_desc *desc) +{ + dw_edma_free_chunk(desc); + kfree(desc); +} + +static void vchan_free_desc(struct virt_dma_desc *vdesc) +{ + dw_edma_free_desc(vd2dw_edma_desc(vdesc)); +} + +static void dw_edma_start_transfer(struct dw_edma_chan *chan) +{ + struct dw_edma_chunk *child; + struct dw_edma_desc *desc; + struct virt_dma_desc *vd; + + vd = vchan_next_desc(&chan->vc); + if (!vd) + return; + + desc = vd2dw_edma_desc(vd); + if (!desc) + return; + + child = list_first_entry_or_null(&desc->chunk->list, + struct dw_edma_chunk, list); + if (!child) + return; + + dw_edma_v0_core_start(child, !desc->xfer_sz); + desc->xfer_sz += child->ll_region.sz; + dw_edma_free_burst(child); + list_del(&child->list); + kfree(child); + desc->chunks_alloc--; +} + +static int dw_edma_device_config(struct dma_chan *dchan, + struct dma_slave_config *config) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + + memcpy(&chan->config, config, sizeof(*config)); + chan->configured = true; + + return 0; +} + +static int dw_edma_device_pause(struct dma_chan *dchan) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + int err = 0; + + if (!chan->configured) + err = -EPERM; + else if (chan->status != EDMA_ST_BUSY) + err = -EPERM; + else if (chan->request != EDMA_REQ_NONE) + err = -EPERM; + else + chan->request = EDMA_REQ_PAUSE; + + return err; +} + +static int dw_edma_device_resume(struct dma_chan *dchan) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + int err = 0; + + if (!chan->configured) { + err = -EPERM; + } else if (chan->status != EDMA_ST_PAUSE) { + err = -EPERM; + } else if (chan->request != EDMA_REQ_NONE) { + err = -EPERM; + } else { + chan->status = EDMA_ST_BUSY; + dw_edma_start_transfer(chan); + } + + return err; +} + +static int dw_edma_device_terminate_all(struct dma_chan *dchan) +{ + 
struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + int err = 0; + LIST_HEAD(head); + + if (!chan->configured) { + /* Do nothing */ + } else if (chan->status == EDMA_ST_PAUSE) { + chan->status = EDMA_ST_IDLE; + chan->configured = false; + } else if (chan->status == EDMA_ST_IDLE) { + chan->configured = false; + } else if (dw_edma_v0_core_ch_status(chan) == DMA_COMPLETE) { + /* + * The channel is in a false BUSY state, probably didn't + * receive or lost an interrupt + */ + chan->status = EDMA_ST_IDLE; + chan->configured = false; + } else if (chan->request > EDMA_REQ_PAUSE) { + err = -EPERM; + } else { + chan->request = EDMA_REQ_STOP; + } + + return err; +} + +static void dw_edma_device_issue_pending(struct dma_chan *dchan) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + unsigned long flags; + + spin_lock_irqsave(&chan->vc.lock, flags); + if (chan->configured && chan->request == EDMA_REQ_NONE && + chan->status == EDMA_ST_IDLE && vchan_issue_pending(&chan->vc)) { + chan->status = EDMA_ST_BUSY; + dw_edma_start_transfer(chan); + } + spin_unlock_irqrestore(&chan->vc.lock, flags); +} + +static enum dma_status +dw_edma_device_tx_status(struct dma_chan *dchan, dma_cookie_t cookie, + struct dma_tx_state *txstate) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + struct dw_edma_desc *desc; + struct virt_dma_desc *vd; + unsigned long flags; + enum dma_status ret; + u32 residue = 0; + + ret = dma_cookie_status(dchan, cookie, txstate); + if (ret == DMA_COMPLETE) + return ret; + + if (ret == DMA_IN_PROGRESS && chan->status == EDMA_ST_PAUSE) + ret = DMA_PAUSED; + + if (!txstate) + goto ret_residue; + + spin_lock_irqsave(&chan->vc.lock, flags); + vd = vchan_find_desc(&chan->vc, cookie); + if (vd) { + desc = vd2dw_edma_desc(vd); + if (desc) + residue = desc->alloc_sz - desc->xfer_sz; + } + spin_unlock_irqrestore(&chan->vc.lock, flags); + +ret_residue: + dma_set_residue(txstate, residue); + + return ret; +} + +static struct dma_async_tx_descriptor * +dw_edma_device_transfer(struct dw_edma_transfer *xfer) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(xfer->dchan); + enum dma_transfer_direction direction = xfer->direction; + phys_addr_t src_addr, dst_addr; + struct scatterlist *sg = NULL; + struct dw_edma_chunk *chunk; + struct dw_edma_burst *burst; + struct dw_edma_desc *desc; + u32 cnt; + int i; + + if ((direction == DMA_MEM_TO_DEV && chan->dir == EDMA_DIR_WRITE) || + (direction == DMA_DEV_TO_MEM && chan->dir == EDMA_DIR_READ)) + return NULL; + + if (xfer->cyclic) { + if (!xfer->xfer.cyclic.len || !xfer->xfer.cyclic.cnt) + return NULL; + } else { + if (xfer->xfer.sg.len < 1) + return NULL; + } + + if (!chan->configured) + return NULL; + + desc = dw_edma_alloc_desc(chan); + if (unlikely(!desc)) + goto err_alloc; + + chunk = dw_edma_alloc_chunk(desc); + if (unlikely(!chunk)) + goto err_alloc; + + src_addr = chan->config.src_addr; + dst_addr = chan->config.dst_addr; + + if (xfer->cyclic) { + cnt = xfer->xfer.cyclic.cnt; + } else { + cnt = xfer->xfer.sg.len; + sg = xfer->xfer.sg.sgl; + } + + for (i = 0; i < cnt; i++) { + if (!xfer->cyclic && !sg) + break; + + if (chunk->bursts_alloc == chan->ll_max) { + chunk = dw_edma_alloc_chunk(desc); + if (unlikely(!chunk)) + goto err_alloc; + } + + burst = dw_edma_alloc_burst(chunk); + if (unlikely(!burst)) + goto err_alloc; + + if (xfer->cyclic) + burst->sz = xfer->xfer.cyclic.len; + else + burst->sz = sg_dma_len(sg); + + chunk->ll_region.sz += burst->sz; + desc->alloc_sz += burst->sz; + + if (direction == DMA_DEV_TO_MEM) { + burst->sar 
= src_addr; + if (xfer->cyclic) { + burst->dar = xfer->xfer.cyclic.paddr; + } else { + burst->dar = sg_dma_address(sg); + /* Unlike the typical assumption by other + * drivers/IPs the peripheral memory isn't + * a FIFO memory, in this case, it's a + * linear memory and that why the source + * and destination addresses are increased + * by the same portion (data length) + */ + src_addr += sg_dma_len(sg); + } + } else { + burst->dar = dst_addr; + if (xfer->cyclic) { + burst->sar = xfer->xfer.cyclic.paddr; + } else { + burst->sar = sg_dma_address(sg); + /* Unlike the typical assumption by other + * drivers/IPs the peripheral memory isn't + * a FIFO memory, in this case, it's a + * linear memory and that why the source + * and destination addresses are increased + * by the same portion (data length) + */ + dst_addr += sg_dma_len(sg); + } + } + + if (!xfer->cyclic) + sg = sg_next(sg); + } + + return vchan_tx_prep(&chan->vc, &desc->vd, xfer->flags); + +err_alloc: + if (desc) + dw_edma_free_desc(desc); + + return NULL; +} + +static struct dma_async_tx_descriptor * +dw_edma_device_prep_slave_sg(struct dma_chan *dchan, struct scatterlist *sgl, + unsigned int len, + enum dma_transfer_direction direction, + unsigned long flags, void *context) +{ + struct dw_edma_transfer xfer; + + xfer.dchan = dchan; + xfer.direction = direction; + xfer.xfer.sg.sgl = sgl; + xfer.xfer.sg.len = len; + xfer.flags = flags; + xfer.cyclic = false; + + return dw_edma_device_transfer(&xfer); +} + +static struct dma_async_tx_descriptor * +dw_edma_device_prep_dma_cyclic(struct dma_chan *dchan, dma_addr_t paddr, + size_t len, size_t count, + enum dma_transfer_direction direction, + unsigned long flags) +{ + struct dw_edma_transfer xfer; + + xfer.dchan = dchan; + xfer.direction = direction; + xfer.xfer.cyclic.paddr = paddr; + xfer.xfer.cyclic.len = len; + xfer.xfer.cyclic.cnt = count; + xfer.flags = flags; + xfer.cyclic = true; + + return dw_edma_device_transfer(&xfer); +} + +static void dw_edma_done_interrupt(struct dw_edma_chan *chan) +{ + struct dw_edma_desc *desc; + struct virt_dma_desc *vd; + unsigned long flags; + + dw_edma_v0_core_clear_done_int(chan); + + spin_lock_irqsave(&chan->vc.lock, flags); + vd = vchan_next_desc(&chan->vc); + if (vd) { + switch (chan->request) { + case EDMA_REQ_NONE: + desc = vd2dw_edma_desc(vd); + if (desc->chunks_alloc) { + chan->status = EDMA_ST_BUSY; + dw_edma_start_transfer(chan); + } else { + list_del(&vd->node); + vchan_cookie_complete(vd); + chan->status = EDMA_ST_IDLE; + } + break; + + case EDMA_REQ_STOP: + list_del(&vd->node); + vchan_cookie_complete(vd); + chan->request = EDMA_REQ_NONE; + chan->status = EDMA_ST_IDLE; + break; + + case EDMA_REQ_PAUSE: + chan->request = EDMA_REQ_NONE; + chan->status = EDMA_ST_PAUSE; + break; + + default: + break; + } + } + spin_unlock_irqrestore(&chan->vc.lock, flags); +} + +static void dw_edma_abort_interrupt(struct dw_edma_chan *chan) +{ + struct virt_dma_desc *vd; + unsigned long flags; + + dw_edma_v0_core_clear_abort_int(chan); + + spin_lock_irqsave(&chan->vc.lock, flags); + vd = vchan_next_desc(&chan->vc); + if (vd) { + list_del(&vd->node); + vchan_cookie_complete(vd); + } + spin_unlock_irqrestore(&chan->vc.lock, flags); + chan->request = EDMA_REQ_NONE; + chan->status = EDMA_ST_IDLE; +} + +static irqreturn_t dw_edma_interrupt(int irq, void *data, bool write) +{ + struct dw_edma_irq *dw_irq = data; + struct dw_edma *dw = dw_irq->dw; + unsigned long total, pos, val; + unsigned long off; + u32 mask; + + if (write) { + total = dw->wr_ch_cnt; + off = 0; + 
mask = dw_irq->wr_mask; + } else { + total = dw->rd_ch_cnt; + off = dw->wr_ch_cnt; + mask = dw_irq->rd_mask; + } + + val = dw_edma_v0_core_status_done_int(dw, write ? + EDMA_DIR_WRITE : + EDMA_DIR_READ); + val &= mask; + for_each_set_bit(pos, &val, total) { + struct dw_edma_chan *chan = &dw->chan[pos + off]; + + dw_edma_done_interrupt(chan); + } + + val = dw_edma_v0_core_status_abort_int(dw, write ? + EDMA_DIR_WRITE : + EDMA_DIR_READ); + val &= mask; + for_each_set_bit(pos, &val, total) { + struct dw_edma_chan *chan = &dw->chan[pos + off]; + + dw_edma_abort_interrupt(chan); + } + + return IRQ_HANDLED; +} + +static inline irqreturn_t dw_edma_interrupt_write(int irq, void *data) +{ + return dw_edma_interrupt(irq, data, true); +} + +static inline irqreturn_t dw_edma_interrupt_read(int irq, void *data) +{ + return dw_edma_interrupt(irq, data, false); +} + +static irqreturn_t dw_edma_interrupt_common(int irq, void *data) +{ + dw_edma_interrupt(irq, data, true); + dw_edma_interrupt(irq, data, false); + + return IRQ_HANDLED; +} + +static int dw_edma_alloc_chan_resources(struct dma_chan *dchan) +{ + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + + if (chan->status != EDMA_ST_IDLE) + return -EBUSY; + + pm_runtime_get(chan->chip->dev); + + return 0; +} + +static void dw_edma_free_chan_resources(struct dma_chan *dchan) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(5000); + struct dw_edma_chan *chan = dchan2dw_edma_chan(dchan); + int ret; + + while (time_before(jiffies, timeout)) { + ret = dw_edma_device_terminate_all(dchan); + if (!ret) + break; + + if (time_after_eq(jiffies, timeout)) + return; + + cpu_relax(); + }; + + pm_runtime_put(chan->chip->dev); +} + +static int dw_edma_channel_setup(struct dw_edma_chip *chip, bool write, + u32 wr_alloc, u32 rd_alloc) +{ + struct dw_edma_region *dt_region; + struct device *dev = chip->dev; + struct dw_edma *dw = chip->dw; + struct dw_edma_chan *chan; + size_t ll_chunk, dt_chunk; + struct dw_edma_irq *irq; + struct dma_device *dma; + u32 i, j, cnt, ch_cnt; + u32 alloc, off_alloc; + int err = 0; + u32 pos; + + ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt; + ll_chunk = dw->ll_region.sz; + dt_chunk = dw->dt_region.sz; + + /* Calculate linked list chunk for each channel */ + ll_chunk /= roundup_pow_of_two(ch_cnt); + + /* Calculate linked list chunk for each channel */ + dt_chunk /= roundup_pow_of_two(ch_cnt); + + if (write) { + i = 0; + cnt = dw->wr_ch_cnt; + dma = &dw->wr_edma; + alloc = wr_alloc; + off_alloc = 0; + } else { + i = dw->wr_ch_cnt; + cnt = dw->rd_ch_cnt; + dma = &dw->rd_edma; + alloc = rd_alloc; + off_alloc = wr_alloc; + } + + INIT_LIST_HEAD(&dma->channels); + for (j = 0; (alloc || dw->nr_irqs == 1) && j < cnt; j++, i++) { + chan = &dw->chan[i]; + + dt_region = devm_kzalloc(dev, sizeof(*dt_region), GFP_KERNEL); + if (!dt_region) + return -ENOMEM; + + chan->vc.chan.private = dt_region; + + chan->chip = chip; + chan->id = j; + chan->dir = write ? EDMA_DIR_WRITE : EDMA_DIR_READ; + chan->configured = false; + chan->request = EDMA_REQ_NONE; + chan->status = EDMA_ST_IDLE; + + chan->ll_off = (ll_chunk * i); + chan->ll_max = (ll_chunk / EDMA_LL_SZ) - 1; + + chan->dt_off = (dt_chunk * i); + + dev_vdbg(dev, "L. List:\tChannel %s[%u] off=0x%.8lx, max_cnt=%u\n", + write ? 
"write" : "read", j, + chan->ll_off, chan->ll_max); + + if (dw->nr_irqs == 1) + pos = 0; + else + pos = off_alloc + (j % alloc); + + irq = &dw->irq[pos]; + + if (write) + irq->wr_mask |= BIT(j); + else + irq->rd_mask |= BIT(j); + + irq->dw = dw; + memcpy(&chan->msi, &irq->msi, sizeof(chan->msi)); + + dev_vdbg(dev, "MSI:\t\tChannel %s[%u] addr=0x%.8x%.8x, data=0x%.8x\n", + write ? "write" : "read", j, + chan->msi.address_hi, chan->msi.address_lo, + chan->msi.data); + + chan->vc.desc_free = vchan_free_desc; + vchan_init(&chan->vc, dma); + + dt_region->paddr = dw->dt_region.paddr + chan->dt_off; + dt_region->vaddr = dw->dt_region.vaddr + chan->dt_off; + dt_region->sz = dt_chunk; + + dev_vdbg(dev, "Data:\tChannel %s[%u] off=0x%.8lx\n", + write ? "write" : "read", j, chan->dt_off); + + dw_edma_v0_core_device_config(chan); + } + + /* Set DMA channel capabilities */ + dma_cap_zero(dma->cap_mask); + dma_cap_set(DMA_SLAVE, dma->cap_mask); + dma_cap_set(DMA_CYCLIC, dma->cap_mask); + dma_cap_set(DMA_PRIVATE, dma->cap_mask); + dma->directions = BIT(write ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV); + dma->src_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); + dma->dst_addr_widths = BIT(DMA_SLAVE_BUSWIDTH_4_BYTES); + dma->residue_granularity = DMA_RESIDUE_GRANULARITY_DESCRIPTOR; + dma->chancnt = cnt; + + /* Set DMA channel callbacks */ + dma->dev = chip->dev; + dma->device_alloc_chan_resources = dw_edma_alloc_chan_resources; + dma->device_free_chan_resources = dw_edma_free_chan_resources; + dma->device_config = dw_edma_device_config; + dma->device_pause = dw_edma_device_pause; + dma->device_resume = dw_edma_device_resume; + dma->device_terminate_all = dw_edma_device_terminate_all; + dma->device_issue_pending = dw_edma_device_issue_pending; + dma->device_tx_status = dw_edma_device_tx_status; + dma->device_prep_slave_sg = dw_edma_device_prep_slave_sg; + dma->device_prep_dma_cyclic = dw_edma_device_prep_dma_cyclic; + + dma_set_max_seg_size(dma->dev, U32_MAX); + + /* Register DMA device */ + err = dma_async_device_register(dma); + + return err; +} + +static inline void dw_edma_dec_irq_alloc(int *nr_irqs, u32 *alloc, u16 cnt) +{ + if (*nr_irqs && *alloc < cnt) { + (*alloc)++; + (*nr_irqs)--; + } +} + +static inline void dw_edma_add_irq_mask(u32 *mask, u32 alloc, u16 cnt) +{ + while (*mask * alloc < cnt) + (*mask)++; +} + +static int dw_edma_irq_request(struct dw_edma_chip *chip, + u32 *wr_alloc, u32 *rd_alloc) +{ + struct device *dev = chip->dev; + struct dw_edma *dw = chip->dw; + u32 wr_mask = 1; + u32 rd_mask = 1; + int i, err = 0; + u32 ch_cnt; + + ch_cnt = dw->wr_ch_cnt + dw->rd_ch_cnt; + + if (dw->nr_irqs < 1) + return -EINVAL; + + if (dw->nr_irqs == 1) { + /* Common IRQ shared among all channels */ + err = request_irq(pci_irq_vector(to_pci_dev(dev), 0), + dw_edma_interrupt_common, + IRQF_SHARED, dw->name, &dw->irq[0]); + if (err) { + dw->nr_irqs = 0; + return err; + } + + get_cached_msi_msg(pci_irq_vector(to_pci_dev(dev), 0), + &dw->irq[0].msi); + } else { + /* Distribute IRQs equally among all channels */ + int tmp = dw->nr_irqs; + + while (tmp && (*wr_alloc + *rd_alloc) < ch_cnt) { + dw_edma_dec_irq_alloc(&tmp, wr_alloc, dw->wr_ch_cnt); + dw_edma_dec_irq_alloc(&tmp, rd_alloc, dw->rd_ch_cnt); + } + + dw_edma_add_irq_mask(&wr_mask, *wr_alloc, dw->wr_ch_cnt); + dw_edma_add_irq_mask(&rd_mask, *rd_alloc, dw->rd_ch_cnt); + + for (i = 0; i < (*wr_alloc + *rd_alloc); i++) { + err = request_irq(pci_irq_vector(to_pci_dev(dev), i), + i < *wr_alloc ? 
+ dw_edma_interrupt_write : + dw_edma_interrupt_read, + IRQF_SHARED, dw->name, + &dw->irq[i]); + if (err) { + dw->nr_irqs = i; + return err; + } + + get_cached_msi_msg(pci_irq_vector(to_pci_dev(dev), i), + &dw->irq[i].msi); + } + + dw->nr_irqs = i; + } + + return err; +} + +int dw_edma_probe(struct dw_edma_chip *chip) +{ + struct device *dev = chip->dev; + struct dw_edma *dw = chip->dw; + u32 wr_alloc = 0; + u32 rd_alloc = 0; + int i, err; + + raw_spin_lock_init(&dw->lock); + + /* Find out how many write channels are supported by hardware */ + dw->wr_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_WRITE); + if (!dw->wr_ch_cnt) + return -EINVAL; + + /* Find out how many read channels are supported by hardware */ + dw->rd_ch_cnt = dw_edma_v0_core_ch_count(dw, EDMA_DIR_READ); + if (!dw->rd_ch_cnt) + return -EINVAL; + + dev_vdbg(dev, "Channels:\twrite=%d, read=%d\n", + dw->wr_ch_cnt, dw->rd_ch_cnt); + + /* Allocate channels */ + dw->chan = devm_kcalloc(dev, dw->wr_ch_cnt + dw->rd_ch_cnt, + sizeof(*dw->chan), GFP_KERNEL); + if (!dw->chan) + return -ENOMEM; + + snprintf(dw->name, sizeof(dw->name), "dw-edma-core:%d", chip->id); + + /* Disable eDMA, only to establish the ideal initial conditions */ + dw_edma_v0_core_off(dw); + + /* Request IRQs */ + err = dw_edma_irq_request(chip, &wr_alloc, &rd_alloc); + if (err) + return err; + + /* Setup write channels */ + err = dw_edma_channel_setup(chip, true, wr_alloc, rd_alloc); + if (err) + goto err_irq_free; + + /* Setup read channels */ + err = dw_edma_channel_setup(chip, false, wr_alloc, rd_alloc); + if (err) + goto err_irq_free; + + /* Power management */ + pm_runtime_enable(dev); + + /* Turn debugfs on */ + dw_edma_v0_core_debugfs_on(chip); + + return 0; + +err_irq_free: + for (i = (dw->nr_irqs - 1); i >= 0; i--) + free_irq(pci_irq_vector(to_pci_dev(dev), i), &dw->irq[i]); + + dw->nr_irqs = 0; + + return err; +} +EXPORT_SYMBOL_GPL(dw_edma_probe); + +int dw_edma_remove(struct dw_edma_chip *chip) +{ + struct dw_edma_chan *chan, *_chan; + struct device *dev = chip->dev; + struct dw_edma *dw = chip->dw; + int i; + + /* Disable eDMA */ + dw_edma_v0_core_off(dw); + + /* Free irqs */ + for (i = (dw->nr_irqs - 1); i >= 0; i--) + free_irq(pci_irq_vector(to_pci_dev(dev), i), &dw->irq[i]); + + /* Power management */ + pm_runtime_disable(dev); + + list_for_each_entry_safe(chan, _chan, &dw->wr_edma.channels, + vc.chan.device_node) { + list_del(&chan->vc.chan.device_node); + tasklet_kill(&chan->vc.task); + } + + list_for_each_entry_safe(chan, _chan, &dw->rd_edma.channels, + vc.chan.device_node) { + list_del(&chan->vc.chan.device_node); + tasklet_kill(&chan->vc.task); + } + + /* Deregister eDMA device */ + dma_async_device_unregister(&dw->wr_edma); + dma_async_device_unregister(&dw->rd_edma); + + /* Turn debugfs off */ + dw_edma_v0_core_debugfs_off(); + + return 0; +} +EXPORT_SYMBOL_GPL(dw_edma_remove); + +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("Synopsys DesignWare eDMA controller core driver"); +MODULE_AUTHOR("Gustavo Pimentel "); diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h new file mode 100644 index 000000000000..b6cc90cbc9dc --- /dev/null +++ b/drivers/dma/dw-edma/dw-edma-core.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates. 
+ * Synopsys DesignWare eDMA core driver + * + * Author: Gustavo Pimentel + */ + +#ifndef _DW_EDMA_CORE_H +#define _DW_EDMA_CORE_H + +#include +#include + +#include "../virt-dma.h" + +#define EDMA_LL_SZ 24 + +enum dw_edma_dir { + EDMA_DIR_WRITE = 0, + EDMA_DIR_READ +}; + +enum dw_edma_mode { + EDMA_MODE_LEGACY = 0, + EDMA_MODE_UNROLL +}; + +enum dw_edma_request { + EDMA_REQ_NONE = 0, + EDMA_REQ_STOP, + EDMA_REQ_PAUSE +}; + +enum dw_edma_status { + EDMA_ST_IDLE = 0, + EDMA_ST_PAUSE, + EDMA_ST_BUSY +}; + +struct dw_edma_chan; +struct dw_edma_chunk; + +struct dw_edma_burst { + struct list_head list; + u64 sar; + u64 dar; + u32 sz; +}; + +struct dw_edma_region { + phys_addr_t paddr; + dma_addr_t vaddr; + size_t sz; +}; + +struct dw_edma_chunk { + struct list_head list; + struct dw_edma_chan *chan; + struct dw_edma_burst *burst; + + u32 bursts_alloc; + + u8 cb; + struct dw_edma_region ll_region; /* Linked list */ +}; + +struct dw_edma_desc { + struct virt_dma_desc vd; + struct dw_edma_chan *chan; + struct dw_edma_chunk *chunk; + + u32 chunks_alloc; + + u32 alloc_sz; + u32 xfer_sz; +}; + +struct dw_edma_chan { + struct virt_dma_chan vc; + struct dw_edma_chip *chip; + int id; + enum dw_edma_dir dir; + + off_t ll_off; + u32 ll_max; + + off_t dt_off; + + struct msi_msg msi; + + enum dw_edma_request request; + enum dw_edma_status status; + u8 configured; + + struct dma_slave_config config; +}; + +struct dw_edma_irq { + struct msi_msg msi; + u32 wr_mask; + u32 rd_mask; + struct dw_edma *dw; +}; + +struct dw_edma { + char name[20]; + + struct dma_device wr_edma; + u16 wr_ch_cnt; + + struct dma_device rd_edma; + u16 rd_ch_cnt; + + struct dw_edma_region rg_region; /* Registers */ + struct dw_edma_region ll_region; /* Linked list */ + struct dw_edma_region dt_region; /* Data */ + + struct dw_edma_irq *irq; + int nr_irqs; + + u32 version; + enum dw_edma_mode mode; + + struct dw_edma_chan *chan; + const struct dw_edma_core_ops *ops; + + raw_spinlock_t lock; /* Only for legacy */ +}; + +struct dw_edma_sg { + struct scatterlist *sgl; + unsigned int len; +}; + +struct dw_edma_cyclic { + dma_addr_t paddr; + size_t len; + size_t cnt; +}; + +struct dw_edma_transfer { + struct dma_chan *dchan; + union dw_edma_xfer { + struct dw_edma_sg sg; + struct dw_edma_cyclic cyclic; + } xfer; + enum dma_transfer_direction direction; + unsigned long flags; + bool cyclic; +}; + +static inline +struct dw_edma_chan *vc2dw_edma_chan(struct virt_dma_chan *vc) +{ + return container_of(vc, struct dw_edma_chan, vc); +} + +static inline +struct dw_edma_chan *dchan2dw_edma_chan(struct dma_chan *dchan) +{ + return vc2dw_edma_chan(to_virt_chan(dchan)); +} + +#endif /* _DW_EDMA_CORE_H */ diff --git a/include/linux/dma/edma.h b/include/linux/dma/edma.h new file mode 100644 index 000000000000..cab6e18773da --- /dev/null +++ b/include/linux/dma/edma.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018-2019 Synopsys, Inc. and/or its affiliates. 
+ * Synopsys DesignWare eDMA core driver
+ *
+ * Author: Gustavo Pimentel
+ */
+
+#ifndef _DW_EDMA_H
+#define _DW_EDMA_H
+
+#include
+#include
+
+struct dw_edma;
+
+/**
+ * struct dw_edma_chip - representation of DesignWare eDMA controller hardware
+ * @dev:	struct device of the eDMA controller
+ * @id:		instance ID
+ * @irq:	irq line
+ * @dw:		struct dw_edma that is filed by dw_edma_probe()
+ */
+struct dw_edma_chip {
+	struct device		*dev;
+	int			id;
+	int			irq;
+	struct dw_edma		*dw;
+};
+
+/* Export to the platform drivers */
+#if IS_ENABLED(CONFIG_DW_EDMA)
+int dw_edma_probe(struct dw_edma_chip *chip);
+int dw_edma_remove(struct dw_edma_chip *chip);
+#else
+static inline int dw_edma_probe(struct dw_edma_chip *chip)
+{
+	return -ENODEV;
+}
+
+static inline int dw_edma_remove(struct dw_edma_chip *chip)
+{
+	return 0;
+}
+#endif /* CONFIG_DW_EDMA */
+
+#endif /* _DW_EDMA_H */
-- cgit v1.2.3-59-g8ed1b


From 1f418f46503d72594bbe6407d97fd2ae1ce15ee6 Mon Sep 17 00:00:00 2001
From: Gustavo Pimentel
Date: Tue, 4 Jun 2019 15:29:25 +0200
Subject: PCI: Add Synopsys endpoint EDDA Device ID

Create and add the Synopsys Endpoint EDDA Device ID to the PCI ID list,
since this ID is now being used by two different drivers
(pci_endpoint_test.ko and dw-edma-pcie.ko).

Signed-off-by: Gustavo Pimentel
Acked-by: Bjorn Helgaas
Cc: Kishon Vijay Abraham I
Cc: Bjorn Helgaas
Cc: Lorenzo Pieralisi
Cc: Joao Pinto
Signed-off-by: Vinod Koul
---
 drivers/misc/pci_endpoint_test.c | 2 +-
 include/linux/pci_ids.h          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c
index 7b015f2a1c6f..1f531c1b4f74 100644
--- a/drivers/misc/pci_endpoint_test.c
+++ b/drivers/misc/pci_endpoint_test.c
@@ -804,7 +804,7 @@ static const struct pci_device_id pci_endpoint_test_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA74x) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_DRA72x) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x81c0) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_SYNOPSYS, 0xedda) },
+	{ PCI_DEVICE_DATA(SYNOPSYS, EDDA, NULL) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_TI, PCI_DEVICE_ID_TI_AM654),
 	  .driver_data = (kernel_ulong_t)&am654_data
 	},
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 70e86148cb1e..4aad69fc4d6b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2366,6 +2366,7 @@
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB3		0xabcd
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB3_AXI	0xabce
 #define PCI_DEVICE_ID_SYNOPSYS_HAPSUSB31	0xabcf
+#define PCI_DEVICE_ID_SYNOPSYS_EDDA		0xedda
 
 #define PCI_VENDOR_ID_USR		0x16ec
-- cgit v1.2.3-59-g8ed1b


From 4e23be473e3063a9d3bc06bb0aee89885fffab0e Mon Sep 17 00:00:00 2001
From: Tony Lindgren
Date: Mon, 10 Jun 2019 04:48:05 -0700
Subject: bus: ti-sysc: Add support for module specific reset quirks

Some older interconnect target modules need module-internal clock
toggling quirks to reset properly. We've been doing this in the
platform code earlier, but we need to be able to do it directly in the
ti-sysc driver once we no longer rely on the platform code.

Let's add reset handling for 1-wire, i2c and watchdog. Later on we can
add more modules like msdi and dss as they get tested. For dra7 pcie,
we should be able to just use the rstctrl reset driver when it becomes
available.
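As background for the quirk table entries added below: a module-specific
quirk is keyed off the module's revision register, with both the
detected revision and the table value compared under the entry's mask.
A standalone illustration (not driver code; the revision values are
taken from the i2c entries in the patch):

#include <stdio.h>
#include <stdint.h>

/* Same idea as matching sysc_revision_quirks[] entries: compare the
 * detected revision against a table value under the entry's mask. */
static int quirk_matches(uint32_t detected, uint32_t value, uint32_t mask)
{
	return (detected & mask) == (value & mask);
}

int main(void)
{
	/* omap4-style I2C entry: value 0x5040000a, mask 0xfffff0f0 */
	printf("%d\n", quirk_matches(0x5040000a, 0x5040000a, 0xfffff0f0)); /* 1 */
	printf("%d\n", quirk_matches(0x50400f0f, 0x5040000a, 0xfffff0f0)); /* 1: differs only in masked-out bits */
	printf("%d\n", quirk_matches(0x50430000, 0x5040000a, 0xfffff0f0)); /* 0 */
	return 0;
}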
Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 129 ++++++++++++++++++++++++++++++++-- include/linux/platform_data/ti-sysc.h | 3 + 2 files changed, 127 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index a366ae548ec9..e6deabd8305d 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -71,6 +71,9 @@ static const char * const clock_names[SYSC_MAX_CLOCKS] = { * @name: name if available * @revision: interconnect target module revision * @needs_resume: runtime resume needed on resume from suspend + * @clk_enable_quirk: module specific clock enable quirk + * @clk_disable_quirk: module specific clock disable quirk + * @reset_done_quirk: module specific reset done quirk */ struct sysc { struct device *dev; @@ -94,6 +97,9 @@ struct sysc { unsigned int child_needs_resume:1; unsigned int disable_on_idle:1; struct delayed_work idle_work; + void (*clk_enable_quirk)(struct sysc *sysc); + void (*clk_disable_quirk)(struct sysc *sysc); + void (*reset_done_quirk)(struct sysc *sysc); }; static void sysc_parse_dts_quirks(struct sysc *ddata, struct device_node *np, @@ -760,8 +766,11 @@ static int sysc_ioremap(struct sysc *ddata) ddata->offsets[SYSC_SYSCONFIG], ddata->offsets[SYSC_SYSSTATUS]); + if (size < SZ_1K) + size = SZ_1K; + if ((size + sizeof(u32)) > ddata->module_size) - return -EINVAL; + size = ddata->module_size; } ddata->module_va = devm_ioremap(ddata->dev, @@ -1234,6 +1243,22 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK_EXT_OPT_CLOCK | SYSC_QUIRK_NO_RESET_ON_INIT | SYSC_QUIRK_SWSUP_SIDLE), + /* Quirks that need to be set based on detected module */ + SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, + SYSC_MODULE_QUIRK_HDQ1W), + SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff, + SYSC_MODULE_QUIRK_HDQ1W), + SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000036, 0x000000ff, + SYSC_MODULE_QUIRK_I2C), + SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x0000003c, 0x000000ff, + SYSC_MODULE_QUIRK_I2C), + SYSC_QUIRK("i2c", 0, 0, 0x20, 0x10, 0x00000040, 0x000000ff, + SYSC_MODULE_QUIRK_I2C), + SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0, + SYSC_MODULE_QUIRK_I2C), + SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0, + SYSC_MODULE_QUIRK_WDT), + #ifdef DEBUG SYSC_QUIRK("adc", 0, 0, 0x10, -1, 0x47300001, 0xffffffff, 0), SYSC_QUIRK("atl", 0, 0, -1, -1, 0x0a070100, 0xffffffff, 0), @@ -1247,11 +1272,8 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK("dwc3", 0, 0, 0x10, -1, 0x500a0200, 0xffffffff, 0), SYSC_QUIRK("epwmss", 0, 0, 0x4, -1, 0x47400001, 0xffffffff, 0), SYSC_QUIRK("gpu", 0, 0x1fc00, 0x1fc10, -1, 0, 0, 0), - SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x00000006, 0xffffffff, 0), - SYSC_QUIRK("hdq1w", 0, 0, 0x14, 0x18, 0x0000000a, 0xffffffff, 0), SYSC_QUIRK("hsi", 0, 0, 0x10, 0x14, 0x50043101, 0xffffffff, 0), SYSC_QUIRK("iss", 0, 0, 0x10, -1, 0x40000101, 0xffffffff, 0), - SYSC_QUIRK("i2c", 0, 0, 0x10, 0x90, 0x5040000a, 0xfffff0f0, 0), SYSC_QUIRK("lcdc", 0, 0, 0x54, -1, 0x4f201000, 0xffffffff, 0), SYSC_QUIRK("mcasp", 0, 0, 0x4, -1, 0x44306302, 0xffffffff, 0), SYSC_QUIRK("mcasp", 0, 0, 0x4, -1, 0x44307b02, 0xffffffff, 0), @@ -1287,7 +1309,6 @@ static const struct sysc_revision_quirk sysc_revision_quirks[] = { SYSC_QUIRK("usb_host_hs", 0, 0, 0x10, -1, 0x50700101, 0xffffffff, 0), SYSC_QUIRK("usb_otg_hs", 0, 0x400, 0x404, 0x408, 0x00000050, 0xffffffff, 0), - SYSC_QUIRK("wdt", 0, 0, 0x10, 0x14, 0x502a0500, 0xfffff0f0, 0), 
SYSC_QUIRK("vfpe", 0, 0, 0x104, -1, 0x4d001200, 0xffffffff, 0), #endif }; @@ -1360,6 +1381,94 @@ static void sysc_init_revision_quirks(struct sysc *ddata) } } +/* 1-wire needs module's internal clocks enabled for reset */ +static void sysc_clk_enable_quirk_hdq1w(struct sysc *ddata) +{ + int offset = 0x0c; /* HDQ_CTRL_STATUS */ + u16 val; + + val = sysc_read(ddata, offset); + val |= BIT(5); + sysc_write(ddata, offset, val); +} + +/* I2C needs extra enable bit toggling for reset */ +static void sysc_clk_quirk_i2c(struct sysc *ddata, bool enable) +{ + int offset; + u16 val; + + /* I2C_CON, omap2/3 is different from omap4 and later */ + if ((ddata->revision & 0xffffff00) == 0x001f0000) + offset = 0x24; + else + offset = 0xa4; + + /* I2C_EN */ + val = sysc_read(ddata, offset); + if (enable) + val |= BIT(15); + else + val &= ~BIT(15); + sysc_write(ddata, offset, val); +} + +static void sysc_clk_enable_quirk_i2c(struct sysc *ddata) +{ + sysc_clk_quirk_i2c(ddata, true); +} + +static void sysc_clk_disable_quirk_i2c(struct sysc *ddata) +{ + sysc_clk_quirk_i2c(ddata, false); +} + +/* Watchdog timer needs a disable sequence after reset */ +static void sysc_reset_done_quirk_wdt(struct sysc *ddata) +{ + int wps, spr, error; + u32 val; + + wps = 0x34; + spr = 0x48; + + sysc_write(ddata, spr, 0xaaaa); + error = readl_poll_timeout(ddata->module_va + wps, val, + !(val & 0x10), 100, + MAX_MODULE_SOFTRESET_WAIT); + if (error) + dev_warn(ddata->dev, "wdt disable spr failed\n"); + + sysc_write(ddata, wps, 0x5555); + error = readl_poll_timeout(ddata->module_va + wps, val, + !(val & 0x10), 100, + MAX_MODULE_SOFTRESET_WAIT); + if (error) + dev_warn(ddata->dev, "wdt disable wps failed\n"); +} + +static void sysc_init_module_quirks(struct sysc *ddata) +{ + if (ddata->legacy_mode || !ddata->name) + return; + + if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_HDQ1W) { + ddata->clk_enable_quirk = sysc_clk_enable_quirk_hdq1w; + + return; + } + + if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_I2C) { + ddata->clk_enable_quirk = sysc_clk_enable_quirk_i2c; + ddata->clk_disable_quirk = sysc_clk_disable_quirk_i2c; + + return; + } + + if (ddata->cfg.quirks & SYSC_MODULE_QUIRK_WDT) + ddata->reset_done_quirk = sysc_reset_done_quirk_wdt; +} + static int sysc_clockdomain_init(struct sysc *ddata) { struct ti_sysc_platform_data *pdata = dev_get_platdata(ddata->dev); @@ -1468,10 +1577,16 @@ static int sysc_reset(struct sysc *ddata) else syss_done = ddata->cfg.syss_mask; + if (ddata->clk_disable_quirk) + ddata->clk_disable_quirk(ddata); + sysc_val = sysc_read_sysconfig(ddata); sysc_val |= sysc_mask; sysc_write(ddata, sysc_offset, sysc_val); + if (ddata->clk_enable_quirk) + ddata->clk_enable_quirk(ddata); + /* Poll on reset status */ if (syss_offset >= 0) { error = readx_poll_timeout(sysc_read_sysstatus, ddata, rstval, @@ -1485,6 +1600,9 @@ static int sysc_reset(struct sysc *ddata) 100, MAX_MODULE_SOFTRESET_WAIT); } + if (ddata->reset_done_quirk) + ddata->reset_done_quirk(ddata); + return error; } @@ -1531,6 +1649,7 @@ static int sysc_init_module(struct sysc *ddata) ddata->revision = sysc_read_revision(ddata); sysc_init_revision_quirks(ddata); + sysc_init_module_quirks(ddata); if (ddata->legacy_mode) { error = sysc_legacy_init(ddata); diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index 8822e99ff813..0c587d4fc718 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -47,6 +47,9 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_MODULE_QUIRK_HDQ1W 
BIT(17)
+#define SYSC_MODULE_QUIRK_I2C		BIT(16)
+#define SYSC_MODULE_QUIRK_WDT		BIT(15)
 #define SYSS_QUIRK_RESETDONE_INVERTED	BIT(14)
 #define SYSC_QUIRK_SWSUP_MSTANDBY	BIT(13)
 #define SYSC_QUIRK_SWSUP_SIDLE_ACT	BIT(12)
-- cgit v1.2.3-59-g8ed1b


From e36acfe6c86d13eec62321e1e86a1ce287e52e7d Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Thu, 23 May 2019 09:41:19 -0300
Subject: mm/hmm: Use hmm_mirror not mm as an argument for hmm_range_register

Ralph observes that hmm_range_register() can only be called by a driver
while a mirror is registered. Make this clear in the API by passing in
the mirror structure as a parameter.

This also simplifies understanding the lifetime model for struct hmm,
as the hmm pointer must be valid as part of a registered mirror, so all
we need in hmm_range_register() is a simple kref_get.

Suggested-by: Ralph Campbell
Signed-off-by: Jason Gunthorpe
Reviewed-by: John Hubbard
Reviewed-by: Ralph Campbell
Reviewed-by: Ira Weiny
Reviewed-by: Christoph Hellwig
Tested-by: Philip Yang
---
 drivers/gpu/drm/nouveau/nouveau_svm.c |  2 +-
 include/linux/hmm.h                   |  7 ++++---
 mm/hmm.c                              | 13 ++++---------
 3 files changed, 9 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 93ed43c413f0..8c92374afcf2 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
 		range.values = nouveau_svm_pfn_values;
 		range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
 again:
-		ret = hmm_vma_fault(&range, true);
+		ret = hmm_vma_fault(&svmm->mirror, &range, true);
 		if (ret == 0) {
 			mutex_lock(&svmm->mutex);
 			if (!hmm_vma_range_done(&range)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index cb01cf1fa3c0..1fba6979adf4 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -496,7 +496,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
 * Please see Documentation/vm/hmm.rst for how to use the range API.
 */
 int hmm_range_register(struct hmm_range *range,
-		       struct mm_struct *mm,
+		       struct hmm_mirror *mirror,
 		       unsigned long start,
 		       unsigned long end,
 		       unsigned page_shift);
@@ -532,7 +532,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range)
 }
 
 /* This is a temporary helper to avoid merge conflict between trees.
*/ -static inline int hmm_vma_fault(struct hmm_range *range, bool block) +static inline int hmm_vma_fault(struct hmm_mirror *mirror, + struct hmm_range *range, bool block) { long ret; @@ -545,7 +546,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->default_flags = 0; range->pfn_flags_mask = -1UL; - ret = hmm_range_register(range, range->vma->vm_mm, + ret = hmm_range_register(range, mirror, range->start, range->end, PAGE_SHIFT); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index f6956d78e3cb..22a97ada108b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -914,13 +914,13 @@ static void hmm_pfns_clear(struct hmm_range *range, * Track updates to the CPU page table see include/linux/hmm.h */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); - struct hmm *hmm; + struct hmm *hmm = mirror->hmm; range->valid = false; range->hmm = NULL; @@ -934,20 +934,15 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - hmm = hmm_get_or_create(mm); - if (!hmm) - return -EFAULT; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) { - hmm_put(hmm); + if (hmm->mm == NULL || hmm->dead) return -EFAULT; - } /* Initialize range to track CPU page table updates. */ mutex_lock(&hmm->lock); range->hmm = hmm; + kref_get(&hmm->kref); list_add_rcu(&range->list, &hmm->ranges); /* -- cgit v1.2.3-59-g8ed1b From c8a53b2db0aec40d8b217936e1b7f3d840c50390 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 May 2019 10:36:46 -0300 Subject: mm/hmm: Hold a mmgrab from hmm to mm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So long as a struct hmm pointer exists, so should the struct mm it is linked too. Hold the mmgrab() as soon as a hmm is created, and mmdrop() it once the hmm refcount goes to zero. Since mmdrop() (ie a 0 kref on struct mm) is now impossible with a !NULL mm->hmm delete the hmm_hmm_destroy(). Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse Reviewed-by: John Hubbard Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 3 --- kernel/fork.c | 1 - mm/hmm.c | 22 ++++------------------ 3 files changed, 4 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 1fba6979adf4..1d97b6d62c5b 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -577,14 +577,11 @@ static inline int hmm_vma_fault(struct hmm_mirror *mirror, } /* Below are for HMM internal use only! Not to be used by device driver! 
*/ -void hmm_mm_destroy(struct mm_struct *mm); - static inline void hmm_mm_init(struct mm_struct *mm) { mm->hmm = NULL; } #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/kernel/fork.c b/kernel/fork.c index 75675b9bf6df..c704c3cedee7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -673,7 +673,6 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); diff --git a/mm/hmm.c b/mm/hmm.c index 22a97ada108b..080b17a2e87e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +74,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) hmm->notifiers = 0; hmm->dead = false; hmm->mm = mm; + mmgrab(hmm->mm); spin_lock(&mm->page_table_lock); if (!mm->hmm) @@ -100,6 +102,7 @@ error_mm: mm->hmm = NULL; spin_unlock(&mm->page_table_lock); error: + mmdrop(hmm->mm); kfree(hmm); return NULL; } @@ -121,6 +124,7 @@ static void hmm_free(struct kref *kref) mm->hmm = NULL; spin_unlock(&mm->page_table_lock); + mmdrop(hmm->mm); mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); } @@ -129,24 +133,6 @@ static inline void hmm_put(struct hmm *hmm) kref_put(&hmm->kref, hmm_free); } -void hmm_mm_destroy(struct mm_struct *mm) -{ - struct hmm *hmm; - - spin_lock(&mm->page_table_lock); - hmm = mm_get_hmm(mm); - mm->hmm = NULL; - if (hmm) { - hmm->mm = NULL; - hmm->dead = true; - spin_unlock(&mm->page_table_lock); - hmm_put(hmm); - return; - } - - spin_unlock(&mm->page_table_lock); -} - static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); -- cgit v1.2.3-59-g8ed1b From 78b99577b3934e3e787fe0c52aa1b59442c8bbb5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 10 Jun 2019 00:09:53 +0900 Subject: pinctrl: remove unused pin_is_valid() This function was used by pin_request() to pointlessly double-check the pin validity, and it was the only user ever. Since commit d2f6a1c6fb0e ("pinctrl: remove double pin validity check."), no one has ever used it. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- drivers/pinctrl/core.c | 23 ----------------------- include/linux/pinctrl/pinctrl.h | 10 ---------- 2 files changed, 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c index 04787eefe2a2..e745788fa36f 100644 --- a/drivers/pinctrl/core.c +++ b/drivers/pinctrl/core.c @@ -178,29 +178,6 @@ const char *pin_get_name(struct pinctrl_dev *pctldev, const unsigned pin) return desc->name; } -/** - * pin_is_valid() - check if pin exists on controller - * @pctldev: the pin control device to check the pin on - * @pin: pin to check, use the local pin controller index number - * - * This tells us whether a certain pin exist on a certain pin controller or - * not. Pin lists may be sparse, so some pins may not exist. 
- */
-bool pin_is_valid(struct pinctrl_dev *pctldev, int pin)
-{
-	struct pin_desc *pindesc;
-
-	if (pin < 0)
-		return false;
-
-	mutex_lock(&pctldev->mutex);
-	pindesc = pin_desc_get(pctldev, pin);
-	mutex_unlock(&pctldev->mutex);
-
-	return pindesc != NULL;
-}
-EXPORT_SYMBOL_GPL(pin_is_valid);
-
 /* Deletes a range of pin descriptors */
 static void pinctrl_free_pindescs(struct pinctrl_dev *pctldev,
				   const struct pinctrl_pin_desc *pins,
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 2744113f1024..36a79fe7b84f 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -172,7 +172,6 @@ extern struct pinctrl_dev *devm_pinctrl_register(struct device *dev,
 extern void devm_pinctrl_unregister(struct device *dev,
				    struct pinctrl_dev *pctldev);
 
-extern bool pin_is_valid(struct pinctrl_dev *pctldev, int pin);
 extern void pinctrl_add_gpio_range(struct pinctrl_dev *pctldev,
				   struct pinctrl_gpio_range *range);
 extern void pinctrl_add_gpio_ranges(struct pinctrl_dev *pctldev,
@@ -203,15 +202,6 @@ struct pinctrl_dev *of_pinctrl_get(struct device_node *np)
 extern const char *pinctrl_dev_get_name(struct pinctrl_dev *pctldev);
 extern const char *pinctrl_dev_get_devname(struct pinctrl_dev *pctldev);
 extern void *pinctrl_dev_get_drvdata(struct pinctrl_dev *pctldev);
-#else
-
-struct pinctrl_dev;
-
-/* Sufficiently stupid default functions when pinctrl is not in use */
-static inline bool pin_is_valid(struct pinctrl_dev *pctldev, int pin)
-{
-	return pin >= 0;
-}
 #endif /* !CONFIG_PINCTRL */
-- cgit v1.2.3-59-g8ed1b


From 0b673b6486998061b0489b09447ebe8452da0146 Mon Sep 17 00:00:00 2001
From: Florian Fainelli
Date: Wed, 8 May 2019 11:46:34 -0700
Subject: firmware: arm_scmi: fetch and store sensor scale

In preparation for dealing with scales within the SCMI HWMON driver,
fetch and store the sensor unit scale into the scmi_sensor_info
structure. In order to simplify computations for the upper layer, take
care of sign-extending the scale to a full 8-bit signed value.
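The scale field is only 5 bits wide in the message, so negative scales
need the upper bits filled in before they can be used as an s8. A
standalone sketch of the sign extension performed below (illustration
only; the example input values are assumptions, not from the SCMI
specification):

#include <stdio.h>
#include <stdint.h>

#define SENSOR_SCALE_SIGN	(1u << 4)	/* BIT(4) */
#define SENSOR_SCALE_EXTEND	(0x7u << 5)	/* GENMASK(7, 5) */

/* Extend a 5-bit two's-complement field to a full signed 8-bit value,
 * mirroring the logic added to scmi_sensor_description_get() below. */
static int8_t scale_to_s8(uint8_t scale5)
{
	if (scale5 & SENSOR_SCALE_SIGN)
		scale5 |= SENSOR_SCALE_EXTEND;
	return (int8_t)scale5;
}

int main(void)
{
	printf("%d\n", scale_to_s8(0x1d));	/* 0b11101 -> -3 */
	printf("%d\n", scale_to_s8(0x02));	/* -> 2 */
	return 0;
}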
Reviewed-by: Guenter Roeck
Signed-off-by: Florian Fainelli
[sudeep.holla: update bitfield values as per specification]
Signed-off-by: Sudeep Holla
---
 drivers/firmware/arm_scmi/sensors.c | 6 ++++++
 include/linux/scmi_protocol.h       | 1 +
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index c00287b5f2c2..0e94ab56f679 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -34,6 +34,8 @@ struct scmi_msg_resp_sensor_description {
 		__le32 attributes_high;
 #define SENSOR_TYPE(x)		((x) & 0xff)
 #define SENSOR_SCALE(x)		(((x) >> 11) & 0x1f)
+#define SENSOR_SCALE_SIGN	BIT(4)
+#define SENSOR_SCALE_EXTEND	GENMASK(7, 5)
 #define SENSOR_UPDATE_SCALE(x)	(((x) >> 22) & 0x1f)
 #define SENSOR_UPDATE_BASE(x)	(((x) >> 27) & 0x1f)
 		u8 name[SCMI_MAX_STR_SIZE];
@@ -140,6 +142,10 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
 			s = &si->sensors[desc_index + cnt];
 			s->id = le32_to_cpu(buf->desc[cnt].id);
 			s->type = SENSOR_TYPE(attrh);
+			s->scale = SENSOR_SCALE(attrh);
+			/* Sign extend to a full s8 */
+			if (s->scale & SENSOR_SCALE_SIGN)
+				s->scale |= SENSOR_SCALE_EXTEND;
 			strlcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE);
 		}
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index 3105055c00a7..9ff2e9357e9a 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -144,6 +144,7 @@ struct scmi_power_ops {
 struct scmi_sensor_info {
 	u32 id;
 	u8 type;
+	s8 scale;
 	char name[SCMI_MAX_STR_SIZE];
 };
-- cgit v1.2.3-59-g8ed1b


From 81f4458c9c6998fcd37c427d16d76d4dba65d015 Mon Sep 17 00:00:00 2001
From: Tero Kristo
Date: Tue, 28 May 2019 16:10:24 +0300
Subject: firmware: ti_sci: extend clock identifiers from u8 to u32

Future SoCs are going to have more than 255 device clocks in certain
cases, so the API must be extended to support this. The support is done
as a backwards-compatible extension: the new u32 clock identifier
fields are only used if the existing u8 clock identifier is set to 255;
in all other cases the existing u8 clock identifier is used. As the
size of the messages sent / received is not verified for existing
devices / old firmware, growing the messages at the end is also fine,
and for the same reason there is no need to depend on the ABI version.

Acked-by: Santosh Shilimkar
Signed-off-by: Tero Kristo
---
 drivers/firmware/ti_sci.c              | 115 ++++++++++++++++++++---------
 drivers/firmware/ti_sci.h              |  63 ++++++++++----
 include/linux/soc/ti/ti_sci_protocol.h |  28 ++++----
 3 files changed, 150 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c
index ef93406ace1b..b417cef35769 100644
--- a/drivers/firmware/ti_sci.c
+++ b/drivers/firmware/ti_sci.c
@@ -916,7 +916,7 @@ static int ti_sci_cmd_get_device_resets(const struct ti_sci_handle *handle,
 * Return: 0 if all went well, else returns appropriate error value.
*/ static int ti_sci_set_clock_state(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, + u32 dev_id, u32 clk_id, u32 flags, u8 state) { struct ti_sci_info *info; @@ -944,7 +944,12 @@ static int ti_sci_set_clock_state(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_set_clock_state *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } req->request_state = state; ret = ti_sci_do_xfer(info, xfer); @@ -976,7 +981,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_get_clock_state(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, + u32 dev_id, u32 clk_id, u8 *programmed_state, u8 *current_state) { struct ti_sci_info *info; @@ -1007,7 +1012,12 @@ static int ti_sci_cmd_get_clock_state(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_get_clock_state *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { @@ -1047,8 +1057,8 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_get_clock(const struct ti_sci_handle *handle, u32 dev_id, - u8 clk_id, bool needs_ssc, bool can_change_freq, - bool enable_input_term) + u32 clk_id, bool needs_ssc, + bool can_change_freq, bool enable_input_term) { u32 flags = 0; @@ -1073,7 +1083,7 @@ static int ti_sci_cmd_get_clock(const struct ti_sci_handle *handle, u32 dev_id, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_idle_clock(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id) + u32 dev_id, u32 clk_id) { return ti_sci_set_clock_state(handle, dev_id, clk_id, 0, MSG_CLOCK_SW_STATE_UNREQ); @@ -1092,7 +1102,7 @@ static int ti_sci_cmd_idle_clock(const struct ti_sci_handle *handle, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_put_clock(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id) + u32 dev_id, u32 clk_id) { return ti_sci_set_clock_state(handle, dev_id, clk_id, 0, MSG_CLOCK_SW_STATE_AUTO); @@ -1110,7 +1120,7 @@ static int ti_sci_cmd_put_clock(const struct ti_sci_handle *handle, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_is_auto(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, bool *req_state) + u32 dev_id, u32 clk_id, bool *req_state) { u8 state = 0; int ret; @@ -1139,7 +1149,7 @@ static int ti_sci_cmd_clk_is_auto(const struct ti_sci_handle *handle, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_is_on(const struct ti_sci_handle *handle, u32 dev_id, - u8 clk_id, bool *req_state, bool *curr_state) + u32 clk_id, bool *req_state, bool *curr_state) { u8 c_state = 0, r_state = 0; int ret; @@ -1172,7 +1182,7 @@ static int ti_sci_cmd_clk_is_on(const struct ti_sci_handle *handle, u32 dev_id, * Return: 0 if all went well, else returns appropriate error value. 
*/ static int ti_sci_cmd_clk_is_off(const struct ti_sci_handle *handle, u32 dev_id, - u8 clk_id, bool *req_state, bool *curr_state) + u32 clk_id, bool *req_state, bool *curr_state) { u8 c_state = 0, r_state = 0; int ret; @@ -1204,7 +1214,7 @@ static int ti_sci_cmd_clk_is_off(const struct ti_sci_handle *handle, u32 dev_id, * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_set_parent(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u8 parent_id) + u32 dev_id, u32 clk_id, u32 parent_id) { struct ti_sci_info *info; struct ti_sci_msg_req_set_clock_parent *req; @@ -1231,8 +1241,18 @@ static int ti_sci_cmd_clk_set_parent(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_set_clock_parent *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; - req->parent_id = parent_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } + if (parent_id < 255) { + req->parent_id = parent_id; + } else { + req->parent_id = 255; + req->parent_id_32 = parent_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { @@ -1262,7 +1282,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u8 *parent_id) + u32 dev_id, u32 clk_id, u32 *parent_id) { struct ti_sci_info *info; struct ti_sci_msg_req_get_clock_parent *req; @@ -1289,7 +1309,12 @@ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_get_clock_parent *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { @@ -1299,10 +1324,14 @@ static int ti_sci_cmd_clk_get_parent(const struct ti_sci_handle *handle, resp = (struct ti_sci_msg_resp_get_clock_parent *)xfer->xfer_buf; - if (!ti_sci_is_response_ack(resp)) + if (!ti_sci_is_response_ack(resp)) { ret = -ENODEV; - else - *parent_id = resp->parent_id; + } else { + if (resp->parent_id < 255) + *parent_id = resp->parent_id; + else + *parent_id = resp->parent_id_32; + } fail: ti_sci_put_one_xfer(&info->minfo, xfer); @@ -1322,8 +1351,8 @@ fail: * Return: 0 if all went well, else returns appropriate error value. 
*/ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, - u8 *num_parents) + u32 dev_id, u32 clk_id, + u32 *num_parents) { struct ti_sci_info *info; struct ti_sci_msg_req_get_clock_num_parents *req; @@ -1350,7 +1379,12 @@ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_get_clock_num_parents *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { @@ -1360,10 +1394,14 @@ static int ti_sci_cmd_clk_get_num_parents(const struct ti_sci_handle *handle, resp = (struct ti_sci_msg_resp_get_clock_num_parents *)xfer->xfer_buf; - if (!ti_sci_is_response_ack(resp)) + if (!ti_sci_is_response_ack(resp)) { ret = -ENODEV; - else - *num_parents = resp->num_parents; + } else { + if (resp->num_parents < 255) + *num_parents = resp->num_parents; + else + *num_parents = resp->num_parents_32; + } fail: ti_sci_put_one_xfer(&info->minfo, xfer); @@ -1391,7 +1429,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_get_match_freq(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u64 min_freq, + u32 dev_id, u32 clk_id, u64 min_freq, u64 target_freq, u64 max_freq, u64 *match_freq) { @@ -1420,7 +1458,12 @@ static int ti_sci_cmd_clk_get_match_freq(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_query_clock_freq *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } req->min_freq_hz = min_freq; req->target_freq_hz = target_freq; req->max_freq_hz = max_freq; @@ -1463,7 +1506,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_set_freq(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u64 min_freq, + u32 dev_id, u32 clk_id, u64 min_freq, u64 target_freq, u64 max_freq) { struct ti_sci_info *info; @@ -1491,7 +1534,12 @@ static int ti_sci_cmd_clk_set_freq(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_set_clock_freq *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } req->min_freq_hz = min_freq; req->target_freq_hz = target_freq; req->max_freq_hz = max_freq; @@ -1524,7 +1572,7 @@ fail: * Return: 0 if all went well, else returns appropriate error value. */ static int ti_sci_cmd_clk_get_freq(const struct ti_sci_handle *handle, - u32 dev_id, u8 clk_id, u64 *freq) + u32 dev_id, u32 clk_id, u64 *freq) { struct ti_sci_info *info; struct ti_sci_msg_req_get_clock_freq *req; @@ -1551,7 +1599,12 @@ static int ti_sci_cmd_clk_get_freq(const struct ti_sci_handle *handle, } req = (struct ti_sci_msg_req_get_clock_freq *)xfer->xfer_buf; req->dev_id = dev_id; - req->clk_id = clk_id; + if (clk_id < 255) { + req->clk_id = clk_id; + } else { + req->clk_id = 255; + req->clk_id_32 = clk_id; + } ret = ti_sci_do_xfer(info, xfer); if (ret) { diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 4983827151bf..ad0b47981b87 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -202,7 +202,8 @@ struct ti_sci_msg_req_set_device_resets { * @dev_id: Device identifier this request is for * @clk_id: Clock identifier for the device for this request. 
* Each device has its own set of clock inputs. This indexes - * which clock input to modify. + * which clock input to modify. Set to 255 if clock ID is + * greater than or equal to 255. * @request_state: Request the state for the clock to be set to. * MSG_CLOCK_SW_STATE_UNREQ: The IP does not require this clock, * it can be disabled, regardless of the state of the device @@ -213,6 +214,9 @@ struct ti_sci_msg_req_set_device_resets { * being required by the device.(default) * MSG_CLOCK_SW_STATE_REQ: Configure the clock to be enabled, * regardless of the state of the device. + * @clk_id_32: Clock identifier for the device for this request. + * Only to be used if the clock ID is greater than or equal to + * 255. * * Normally, all required clocks are managed by TISCI entity, this is used * only for specific control *IF* required. Auto managed state is @@ -234,6 +238,7 @@ struct ti_sci_msg_req_set_clock_state { #define MSG_CLOCK_SW_STATE_AUTO 1 #define MSG_CLOCK_SW_STATE_REQ 2 u8 request_state; + u32 clk_id_32; } __packed; /** @@ -242,7 +247,11 @@ struct ti_sci_msg_req_set_clock_state { * @dev_id: Device identifier this request is for * @clk_id: Clock identifier for the device for this request. * Each device has its own set of clock inputs. This indexes - * which clock input to get state of. + * which clock input to get state of. Set to 255 if the clock + * ID is greater than or equal to 255. + * @clk_id_32: Clock identifier for the device for this request. + * Only to be used if the clock ID is greater than or equal to + * 255. * * Request type is TI_SCI_MSG_GET_CLOCK_STATE, response is state * of the clock */ struct ti_sci_msg_req_get_clock_state { struct ti_sci_msg_hdr hdr; u32 dev_id; u8 clk_id; + u32 clk_id_32; } __packed; /** @@ -278,9 +288,13 @@ struct ti_sci_msg_resp_get_clock_state { * @dev_id: Device identifier this request is for * @clk_id: Clock identifier for the device for this request. * Each device has its own set of clock inputs. This indexes - * which clock input to modify. + * which clock input to modify. Set to 255 if clock ID is + * greater than or equal to 255. * @parent_id: The new clock parent is selectable by an index via this - * parameter. + * parameter. Set to 255 if parent ID is greater than or + * equal to 255. + * @clk_id_32: Clock identifier if @clk_id field is 255. + * @parent_id_32: Parent identifier if @parent_id is 255. * * Request type is TI_SCI_MSG_SET_CLOCK_PARENT, response is generic * ACK / NACK message. @@ -290,6 +304,8 @@ struct ti_sci_msg_req_set_clock_parent { u32 dev_id; u8 clk_id; u8 parent_id; + u32 clk_id_32; + u32 parent_id_32; } __packed; /** @@ -298,7 +314,10 @@ struct ti_sci_msg_req_set_clock_parent { * @dev_id: Device identifier this request is for * @clk_id: Clock identifier for the device for this request. * Each device has its own set of clock inputs. This indexes - * which clock input to get the parent for. + * which clock input to get the parent for. If this field + * contains 255, the actual clock identifier is stored in + * @clk_id_32. + * @clk_id_32: Clock identifier if the @clk_id field contains 255. * * Request type is TI_SCI_MSG_GET_CLOCK_PARENT, response is parent information */ @@ -306,25 +325,32 @@ struct ti_sci_msg_req_get_clock_parent { struct ti_sci_msg_hdr hdr; u32 dev_id; u8 clk_id; + u32 clk_id_32; } __packed; /** * struct ti_sci_msg_resp_get_clock_parent - Response with clock parent * @hdr: Generic Header - * @parent_id: The current clock parent. 
If set to 255, the current parent + * ID can be found in the @parent_id_32 field. + * @parent_id_32: Current clock parent if @parent_id field is set to + * 255. * * Response to TI_SCI_MSG_GET_CLOCK_PARENT. */ struct ti_sci_msg_resp_get_clock_parent { struct ti_sci_msg_hdr hdr; u8 parent_id; + u32 parent_id_32; } __packed; /** * struct ti_sci_msg_req_get_clock_num_parents - Request to get clock parents * @hdr: Generic header * @dev_id: Device identifier this request is for - * @clk_id: Clock identifier for the device for this request. + * @clk_id: Clock identifier for the device for this request. Set to + * 255 if clock ID is greater than or equal to 255. + * @clk_id_32: Clock identifier if the @clk_id field contains 255. * * This request provides information about how many clock parent options * are available for a given clock to a device. This is typically used @@ -337,18 +363,24 @@ struct ti_sci_msg_req_get_clock_num_parents { struct ti_sci_msg_hdr hdr; u32 dev_id; u8 clk_id; + u32 clk_id_32; } __packed; /** * struct ti_sci_msg_resp_get_clock_num_parents - Response for get clk parents * @hdr: Generic header - * @num_parents: Number of clock parents + * @num_parents: Number of clock parents. If set to 255, the actual + * number of parents is stored in the @num_parents_32 + * field instead. + * @num_parents_32: Number of clock parents if @num_parents field is + * set to 255. * * Response to TI_SCI_MSG_GET_NUM_CLOCK_PARENTS */ struct ti_sci_msg_resp_get_clock_num_parents { struct ti_sci_msg_hdr hdr; u8 num_parents; + u32 num_parents_32; } __packed; /** @@ -363,7 +395,9 @@ struct ti_sci_msg_resp_get_clock_num_parents { * @max_freq_hz: The maximum allowable frequency in Hz. This is the maximum * allowable programmed frequency and does not account for clock * tolerances and jitter. - * @clk_id: Clock identifier for the device for this request. + * @clk_id: Clock identifier for the device for this request. Set to + * 255 if clock identifier is greater than or equal to 255. + * @clk_id_32: Clock identifier if @clk_id is set to 255. * * NOTE: Normally clock frequency management is automatically done by TISCI * entity. In case of specific requests, TISCI evaluates capability to achieve @@ -380,6 +414,7 @@ struct ti_sci_msg_req_query_clock_freq { u64 target_freq_hz; u64 max_freq_hz; u8 clk_id; + u32 clk_id_32; } __packed; /** @@ -407,7 +442,9 @@ struct ti_sci_msg_resp_query_clock_freq { * @max_freq_hz: The maximum allowable frequency in Hz. This is the maximum * allowable programmed frequency and does not account for clock * tolerances and jitter. - * @clk_id: Clock identifier for the device for this request. + * @clk_id: Clock identifier for the device for this request. Set to + * 255 if clock ID is greater than or equal to 255. + * @clk_id_32: Clock identifier if @clk_id field is set to 255. * * NOTE: Normally clock frequency management is automatically done by TISCI * entity. In case of specific requests, TISCI evaluates capability to achieve @@ -436,13 +473,16 @@ struct ti_sci_msg_req_set_clock_freq { u64 target_freq_hz; u64 max_freq_hz; u8 clk_id; + u32 clk_id_32; } __packed; /** * struct ti_sci_msg_req_get_clock_freq - Request to get the clock frequency * @hdr: Generic Header * @dev_id: Device identifier this request is for - * @clk_id: Clock identifier for the device for this request. + * @clk_id: Clock identifier for the device for this request. Set to + * 255 if clock ID is greater than or equal to 255. + * @clk_id_32: Clock identifier if @clk_id field is set to 255. 
* * NOTE: Normally clock frequency management is automatically done by TISCI * entity. In some cases, clock frequencies are configured by host. @@ -454,6 +494,7 @@ struct ti_sci_msg_req_get_clock_freq { struct ti_sci_msg_hdr hdr; u32 dev_id; u8 clk_id; + u32 clk_id_32; } __packed; /** diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 568722a041bf..406e6717d252 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -166,29 +166,29 @@ struct ti_sci_dev_ops { * managed by driver for that purpose. */ struct ti_sci_clk_ops { - int (*get_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*get_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid, bool needs_ssc, bool can_change_freq, bool enable_input_term); - int (*idle_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid); - int (*put_clock)(const struct ti_sci_handle *handle, u32 did, u8 cid); - int (*is_auto)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*idle_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid); + int (*put_clock)(const struct ti_sci_handle *handle, u32 did, u32 cid); + int (*is_auto)(const struct ti_sci_handle *handle, u32 did, u32 cid, bool *req_state); - int (*is_on)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*is_on)(const struct ti_sci_handle *handle, u32 did, u32 cid, bool *req_state, bool *current_state); - int (*is_off)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*is_off)(const struct ti_sci_handle *handle, u32 did, u32 cid, bool *req_state, bool *current_state); - int (*set_parent)(const struct ti_sci_handle *handle, u32 did, u8 cid, - u8 parent_id); - int (*get_parent)(const struct ti_sci_handle *handle, u32 did, u8 cid, - u8 *parent_id); + int (*set_parent)(const struct ti_sci_handle *handle, u32 did, u32 cid, + u32 parent_id); + int (*get_parent)(const struct ti_sci_handle *handle, u32 did, u32 cid, + u32 *parent_id); int (*get_num_parents)(const struct ti_sci_handle *handle, u32 did, - u8 cid, u8 *num_parents); + u32 cid, u32 *num_parents); int (*get_best_match_freq)(const struct ti_sci_handle *handle, u32 did, - u8 cid, u64 min_freq, u64 target_freq, + u32 cid, u64 min_freq, u64 target_freq, u64 max_freq, u64 *match_freq); - int (*set_freq)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*set_freq)(const struct ti_sci_handle *handle, u32 did, u32 cid, u64 min_freq, u64 target_freq, u64 max_freq); - int (*get_freq)(const struct ti_sci_handle *handle, u32 did, u8 cid, + int (*get_freq)(const struct ti_sci_handle *handle, u32 did, u32 cid, u64 *current_freq); }; -- cgit v1.2.3-59-g8ed1b From 5740671e596bdc3986a5391997de194300970201 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 12 Jun 2019 14:28:30 +0100 Subject: dma-fence/reservation: Markup rcu protected access for DEBUG_MUTEXES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark the access to reservation_object.fence as being protected to silence sparse. 
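rcu_access_pointer() fits here because the fence pointer is only tested for NULL, never dereferenced under an RCU read-side critical section: the actual dereference goes through reservation_object_get_list(), which is legitimate while the ww_mutex is still held. The shape of the pattern, sketched outside the kernel with stand-in types and a no-op macro (not the kernel's real definitions):

/* Stand-in for illustration only; the kernel macro also defeats sparse's
 * address-space check without asserting any lock, which is exactly what a
 * pure NULL test wants. */
#define rcu_access_pointer(p) (p)

struct fence_list { unsigned int shared_count, shared_max; };

static void unlock_debug_check(struct fence_list *fence_ptr)
{
	/* Test the RCU-managed pointer without a formal dereference... */
	if (rcu_access_pointer(fence_ptr)) {
		/* ...then dereference via the lock-protected accessor. */
		struct fence_list *fence = fence_ptr;

		fence->shared_max = fence->shared_count;
	}
}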
Signed-off-by: Chris Wilson Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20190612132830.31221-1-chris@chris-wilson.co.uk --- include/linux/reservation.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reservation.h b/include/linux/reservation.h index ee750765cc94..644a22dbe53b 100644 --- a/include/linux/reservation.h +++ b/include/linux/reservation.h @@ -216,8 +216,12 @@ reservation_object_unlock(struct reservation_object *obj) { #ifdef CONFIG_DEBUG_MUTEXES /* Test shared fence slot reservation */ - if (obj->fence) - obj->fence->shared_max = obj->fence->shared_count; + if (rcu_access_pointer(obj->fence)) { + struct reservation_object_list *fence = + reservation_object_get_list(obj); + + fence->shared_max = fence->shared_count; + } #endif ww_mutex_unlock(&obj->lock); } -- cgit v1.2.3-59-g8ed1b From d664c43958e0d9e0b34e23b6f8a8f4cf8ec61a2e Mon Sep 17 00:00:00 2001 From: Enrico Weigelt Date: Wed, 12 Jun 2019 23:59:36 +0200 Subject: gpio: Fix build warnings on undefined struct pinctrl_dev This fixes the warnings: * include/linux/gpio.h:254:11: warning: 'struct pinctrl_dev' declared inside parameter list will not be visible outside of this definition or declaration * include/linux/gpio/driver.h:602:11: warning: 'struct pinctrl_dev' declared inside parameter list will not be visible outside of this definition or declaration Fixes: 78b99577b393 ("pinctrl: remove unused pin_is_valid()") Reported-by: kbuild test robot Signed-off-by: Enrico Weigelt Signed-off-by: Linus Walleij --- include/linux/gpio.h | 1 + include/linux/gpio/driver.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index 39745b8bdd65..40915b461f18 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -106,6 +106,7 @@ void devm_gpio_free(struct device *dev, unsigned int gpio); struct device; struct gpio_chip; +struct pinctrl_dev; static inline bool gpio_is_valid(int number) { diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index a1d273c96016..b58b27c11355 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -590,6 +590,8 @@ void gpiochip_remove_pin_ranges(struct gpio_chip *chip); #else +struct pinctrl_dev; + static inline int gpiochip_add_pin_range(struct gpio_chip *chip, const char *pinctl_name, unsigned int gpio_offset, unsigned int pin_offset, -- cgit v1.2.3-59-g8ed1b From 68608b5e5063dd12942f1118286c6f595d0c4a05 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 10 Jun 2019 12:18:56 +0300 Subject: firmware: ti_sci: Add resource management APIs for ringacc, psi-l and udma Configuration of NAVSS resources, such as rings, UDMAP channels, flows and PSI-L thread management, needs to be done via TISCI. Add the needed structures and functions for NAVSS resource configuration of the following:
Rings from Ring Accelerator
PSI-L thread management
UDMAP tchan, rchan and rflow configuration
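Once ti_sci_setup_ops() below has wired up the three new ops tables, a NAVSS client reaches them through the TI SCI handle. A hypothetical caller pairing a PSI-L source thread to a destination thread could look like this (example_psil_pair is not part of the patch; handle acquisition and error handling are elided):

/* Consumer-side sketch using the new rm_psil_ops table. */
static int example_psil_pair(const struct ti_sci_handle *handle,
			     u32 nav_id, u32 src_thread, u32 dst_thread)
{
	const struct ti_sci_rm_psil_ops *psil = &handle->ops.rm_psil_ops;

	/* Destination thread IDs start at 0x8000 in the thread map. */
	return psil->pair(handle, nav_id, src_thread, dst_thread | 0x8000);
}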
Signed-off-by: Peter Ujfalusi Reviewed-by: Lokesh Vutla Signed-off-by: Tero Kristo Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 488 ++++++++++++++++++++++++ drivers/firmware/ti_sci.h | 675 +++++++++++++++++++++++++++++++++ include/linux/soc/ti/ti_sci_protocol.h | 215 +++++++++++ 3 files changed, 1378 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 36ce11a67235..02fa196428d8 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2004,6 +2004,481 @@ static int ti_sci_cmd_free_event_map(const struct ti_sci_handle *handle, ia_id, vint, global_event, vint_status_bit, 0); } +/** + * ti_sci_cmd_ring_config() - configure RA ring + * @handle: Pointer to TI SCI handle. + * @valid_params: Bitfield defining validity of ring configuration + * parameters + * @nav_id: Device ID of Navigator Subsystem from which the ring is + * allocated + * @index: Ring index + * @addr_lo: The ring base address lo 32 bits + * @addr_hi: The ring base address hi 32 bits + * @count: Number of ring elements + * @mode: The mode of the ring + * @size: The ring element size. + * @order_id: Specifies the ring's bus order ID + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_ring_cfg_req for more info. + */ +static int ti_sci_cmd_ring_config(const struct ti_sci_handle *handle, + u32 valid_params, u16 nav_id, u16 index, + u32 addr_lo, u32 addr_hi, u32 count, + u8 mode, u8 size, u8 order_id) +{ + struct ti_sci_msg_rm_ring_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(info->dev, "RM_RA:Message config failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_ring_cfg_req *)xfer->xfer_buf; + req->valid_params = valid_params; + req->nav_id = nav_id; + req->index = index; + req->addr_lo = addr_lo; + req->addr_hi = addr_hi; + req->count = count; + req->mode = mode; + req->size = size; + req->order_id = order_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(info->dev, "RM_RA:Mbox config send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "RM_RA:config ring %u ret:%d\n", index, ret); + return ret; +} + +/** + * ti_sci_cmd_ring_get_config() - get RA ring configuration + * @handle: Pointer to TI SCI handle. + * @nav_id: Device ID of Navigator Subsystem from which the ring is + * allocated + * @index: Ring index + * @addr_lo: Returns ring's base address lo 32 bits + * @addr_hi: Returns ring's base address hi 32 bits + * @count: Returns number of ring elements + * @mode: Returns mode of the ring + * @size: Returns ring element size + * @order_id: Returns ring's bus order ID + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_ring_get_cfg_req for more info. 
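A caller-side sketch for the ring configuration above, built on the TI_SCI_MSG_VALUE_RM_* valid bits this patch adds to ti_sci_protocol.h (example_ring_cfg is hypothetical):

static int example_ring_cfg(const struct ti_sci_handle *handle, u16 nav_id,
			    u16 index, dma_addr_t base, u32 count,
			    u8 mode, u8 size)
{
	/* Program address, count, mode and element size; skip order_id. */
	return handle->ops.rm_ring_ops.config(handle,
					      TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER,
					      nav_id, index,
					      lower_32_bits(base),
					      upper_32_bits(base),
					      count, mode, size, 0);
}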
+ */ +static int ti_sci_cmd_ring_get_config(const struct ti_sci_handle *handle, + u32 nav_id, u32 index, u8 *mode, + u32 *addr_lo, u32 *addr_hi, + u32 *count, u8 *size, u8 *order_id) +{ + struct ti_sci_msg_rm_ring_get_cfg_resp *resp; + struct ti_sci_msg_rm_ring_get_cfg_req *req; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_RING_GET_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(info->dev, + "RM_RA:Message get config failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_ring_get_cfg_req *)xfer->xfer_buf; + req->nav_id = nav_id; + req->index = index; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(info->dev, "RM_RA:Mbox get config send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_rm_ring_get_cfg_resp *)xfer->xfer_buf; + + if (!ti_sci_is_response_ack(resp)) { + ret = -ENODEV; + } else { + if (mode) + *mode = resp->mode; + if (addr_lo) + *addr_lo = resp->addr_lo; + if (addr_hi) + *addr_hi = resp->addr_hi; + if (count) + *count = resp->count; + if (size) + *size = resp->size; + if (order_id) + *order_id = resp->order_id; + } + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "RM_RA:get config ring %u ret:%d\n", index, ret); + return ret; +} + +/** + * ti_sci_cmd_rm_psil_pair() - Pair PSI-L source to destination thread + * @handle: Pointer to TI SCI handle. + * @nav_id: Device ID of Navigator Subsystem which should be used for + * pairing + * @src_thread: Source PSI-L thread ID + * @dst_thread: Destination PSI-L thread ID + * + * Return: 0 if all went well, else returns appropriate error value. + */ +static int ti_sci_cmd_rm_psil_pair(const struct ti_sci_handle *handle, + u32 nav_id, u32 src_thread, u32 dst_thread) +{ + struct ti_sci_msg_psil_pair *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (!handle) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_PSIL_PAIR, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "RM_PSIL:Message reconfig failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_psil_pair *)xfer->xfer_buf; + req->nav_id = nav_id; + req->src_thread = src_thread; + req->dst_thread = dst_thread; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "RM_PSIL:Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_rm_psil_unpair() - Unpair PSI-L source from destination thread + * @handle: Pointer to TI SCI handle. + * @nav_id: Device ID of Navigator Subsystem which should be used for + * unpairing + * @src_thread: Source PSI-L thread ID + * @dst_thread: Destination PSI-L thread ID + * + * Return: 0 if all went well, else returns appropriate error value. 
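Every TISCI command in this patch repeats the same transfer skeleton around its request struct. Factored out purely for illustration (example_simple_cmd is a hypothetical helper; the driver deliberately open-codes the sequence per command so each request can be filled in place):

static int example_simple_cmd(struct ti_sci_info *info, u16 msg_id,
			      size_t req_len, size_t resp_len,
			      void (*fill_req)(void *req, void *arg), void *arg)
{
	struct ti_sci_msg_hdr *resp;
	struct ti_sci_xfer *xfer;
	int ret;

	/* Reserve a transfer slot; the core prepares the message header. */
	xfer = ti_sci_get_one_xfer(info, msg_id,
				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
				   req_len, resp_len);
	if (IS_ERR(xfer))
		return PTR_ERR(xfer);

	fill_req(xfer->xfer_buf, arg);	/* caller fills its request fields */

	ret = ti_sci_do_xfer(info, xfer);	/* blocking mailbox round trip */
	if (!ret) {
		/* The response reuses the same transfer buffer. */
		resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf;
		ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
	}

	ti_sci_put_one_xfer(&info->minfo, xfer);	/* always free the slot */
	return ret;
}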
+ */ +static int ti_sci_cmd_rm_psil_unpair(const struct ti_sci_handle *handle, + u32 nav_id, u32 src_thread, u32 dst_thread) +{ + struct ti_sci_msg_psil_unpair *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (!handle) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_RM_PSIL_UNPAIR, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "RM_PSIL:Message reconfig failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_psil_unpair *)xfer->xfer_buf; + req->nav_id = nav_id; + req->src_thread = src_thread; + req->dst_thread = dst_thread; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "RM_PSIL:Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_rm_udmap_tx_ch_cfg() - Configure a UDMAP TX channel + * @handle: Pointer to TI SCI handle. + * @params: Pointer to ti_sci_msg_rm_udmap_tx_ch_cfg TX channel config + * structure + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_udmap_tx_ch_cfg and @ti_sci_msg_rm_udmap_tx_ch_cfg_req for + * more info. + */ +static int ti_sci_cmd_rm_udmap_tx_ch_cfg(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_tx_ch_cfg *params) +{ + struct ti_sci_msg_rm_udmap_tx_ch_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_TX_CH_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(info->dev, "Message TX_CH_CFG alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_udmap_tx_ch_cfg_req *)xfer->xfer_buf; + req->valid_params = params->valid_params; + req->nav_id = params->nav_id; + req->index = params->index; + req->tx_pause_on_err = params->tx_pause_on_err; + req->tx_filt_einfo = params->tx_filt_einfo; + req->tx_filt_pswords = params->tx_filt_pswords; + req->tx_atype = params->tx_atype; + req->tx_chan_type = params->tx_chan_type; + req->tx_supr_tdpkt = params->tx_supr_tdpkt; + req->tx_fetch_size = params->tx_fetch_size; + req->tx_credit_count = params->tx_credit_count; + req->txcq_qnum = params->txcq_qnum; + req->tx_priority = params->tx_priority; + req->tx_qos = params->tx_qos; + req->tx_orderid = params->tx_orderid; + req->fdepth = params->fdepth; + req->tx_sched_priority = params->tx_sched_priority; + req->tx_burst_size = params->tx_burst_size; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(info->dev, "Mbox send TX_CH_CFG fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "TX_CH_CFG: chn %u ret:%d\n", params->index, ret); + return ret; +} + +/** + * ti_sci_cmd_rm_udmap_rx_ch_cfg() - Configure a UDMAP RX channel + * @handle: Pointer to TI SCI handle. 
+ * @params: Pointer to ti_sci_msg_rm_udmap_rx_ch_cfg RX channel config + * structure + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_udmap_rx_ch_cfg and @ti_sci_msg_rm_udmap_rx_ch_cfg_req for + * more info. + */ +static int ti_sci_cmd_rm_udmap_rx_ch_cfg(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_rx_ch_cfg *params) +{ + struct ti_sci_msg_rm_udmap_rx_ch_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_RX_CH_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(info->dev, "Message RX_CH_CFG alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_udmap_rx_ch_cfg_req *)xfer->xfer_buf; + req->valid_params = params->valid_params; + req->nav_id = params->nav_id; + req->index = params->index; + req->rx_fetch_size = params->rx_fetch_size; + req->rxcq_qnum = params->rxcq_qnum; + req->rx_priority = params->rx_priority; + req->rx_qos = params->rx_qos; + req->rx_orderid = params->rx_orderid; + req->rx_sched_priority = params->rx_sched_priority; + req->flowid_start = params->flowid_start; + req->flowid_cnt = params->flowid_cnt; + req->rx_pause_on_err = params->rx_pause_on_err; + req->rx_atype = params->rx_atype; + req->rx_chan_type = params->rx_chan_type; + req->rx_ignore_short = params->rx_ignore_short; + req->rx_ignore_long = params->rx_ignore_long; + req->rx_burst_size = params->rx_burst_size; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(info->dev, "Mbox send RX_CH_CFG fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "RX_CH_CFG: chn %u ret:%d\n", params->index, ret); + return ret; +} + +/** + * ti_sci_cmd_rm_udmap_rx_flow_cfg() - Configure UDMAP RX FLOW + * @handle: Pointer to TI SCI handle. + * @params: Pointer to ti_sci_msg_rm_udmap_flow_cfg RX FLOW config + * structure + * + * Return: 0 if all went well, else returns appropriate error value. + * + * See @ti_sci_msg_rm_udmap_flow_cfg and @ti_sci_msg_rm_udmap_flow_cfg_req for + * more info. 
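On the caller side the parameter struct keeps this manageable: only fields whose valid bit is set have to be filled in. A hypothetical sketch steering a flow's packets to a single destination queue (bit positions follow the valid_params documentation added to ti_sci.h below; the patch defines no symbolic names for the individual flow bits in this excerpt, so raw BIT() values are used):

static int example_flow_cfg(const struct ti_sci_handle *handle, u16 nav_id,
			    u16 flow_index, u16 dest_qnum)
{
	struct ti_sci_msg_rm_udmap_flow_cfg cfg = {
		/* bit 5: rx_dest_qnum, bit 14: rx_fdq0_sz0_qnum */
		.valid_params = BIT(5) | BIT(14),
		.nav_id = nav_id,
		.flow_index = flow_index,
		.rx_dest_qnum = dest_qnum,
		.rx_fdq0_sz0_qnum = dest_qnum,	/* illustrative choice only */
	};

	return handle->ops.rm_udmap_ops.rx_flow_cfg(handle, &cfg);
}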
+ */ +static int ti_sci_cmd_rm_udmap_rx_flow_cfg(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_flow_cfg *params) +{ + struct ti_sci_msg_rm_udmap_flow_cfg_req *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_xfer *xfer; + struct ti_sci_info *info; + struct device *dev; + int ret = 0; + + if (IS_ERR_OR_NULL(handle)) + return -EINVAL; + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TISCI_MSG_RM_UDMAP_FLOW_CFG, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "RX_FL_CFG: Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_rm_udmap_flow_cfg_req *)xfer->xfer_buf; + req->valid_params = params->valid_params; + req->nav_id = params->nav_id; + req->flow_index = params->flow_index; + req->rx_einfo_present = params->rx_einfo_present; + req->rx_psinfo_present = params->rx_psinfo_present; + req->rx_error_handling = params->rx_error_handling; + req->rx_desc_type = params->rx_desc_type; + req->rx_sop_offset = params->rx_sop_offset; + req->rx_dest_qnum = params->rx_dest_qnum; + req->rx_src_tag_hi = params->rx_src_tag_hi; + req->rx_src_tag_lo = params->rx_src_tag_lo; + req->rx_dest_tag_hi = params->rx_dest_tag_hi; + req->rx_dest_tag_lo = params->rx_dest_tag_lo; + req->rx_src_tag_hi_sel = params->rx_src_tag_hi_sel; + req->rx_src_tag_lo_sel = params->rx_src_tag_lo_sel; + req->rx_dest_tag_hi_sel = params->rx_dest_tag_hi_sel; + req->rx_dest_tag_lo_sel = params->rx_dest_tag_lo_sel; + req->rx_fdq0_sz0_qnum = params->rx_fdq0_sz0_qnum; + req->rx_fdq1_qnum = params->rx_fdq1_qnum; + req->rx_fdq2_qnum = params->rx_fdq2_qnum; + req->rx_fdq3_qnum = params->rx_fdq3_qnum; + req->rx_ps_location = params->rx_ps_location; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "RX_FL_CFG: Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->xfer_buf; + ret = ti_sci_is_response_ack(resp) ? 
0 : -EINVAL; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + dev_dbg(info->dev, "RX_FL_CFG: %u ret:%d\n", params->flow_index, ret); + return ret; +} + /* * ti_sci_setup_ops() - Setup the operations structures * @info: pointer to TISCI pointer @@ -2016,6 +2491,9 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) struct ti_sci_clk_ops *cops = &ops->clk_ops; struct ti_sci_rm_core_ops *rm_core_ops = &ops->rm_core_ops; struct ti_sci_rm_irq_ops *iops = &ops->rm_irq_ops; + struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops; + struct ti_sci_rm_psil_ops *psilops = &ops->rm_psil_ops; + struct ti_sci_rm_udmap_ops *udmap_ops = &ops->rm_udmap_ops; core_ops->reboot_device = ti_sci_cmd_core_reboot; @@ -2055,6 +2533,16 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) iops->set_event_map = ti_sci_cmd_set_event_map; iops->free_irq = ti_sci_cmd_free_irq; iops->free_event_map = ti_sci_cmd_free_event_map; + + rops->config = ti_sci_cmd_ring_config; + rops->get_config = ti_sci_cmd_ring_get_config; + + psilops->pair = ti_sci_cmd_rm_psil_pair; + psilops->unpair = ti_sci_cmd_rm_psil_unpair; + + udmap_ops->tx_ch_cfg = ti_sci_cmd_rm_udmap_tx_ch_cfg; + udmap_ops->rx_ch_cfg = ti_sci_cmd_rm_udmap_rx_ch_cfg; + udmap_ops->rx_flow_cfg = ti_sci_cmd_rm_udmap_rx_flow_cfg; } /** diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 4983827151bf..2bb81ec7793c 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -42,6 +42,35 @@ #define TI_SCI_MSG_SET_IRQ 0x1000 #define TI_SCI_MSG_FREE_IRQ 0x1001 +/* NAVSS resource management */ +/* Ringacc requests */ +#define TI_SCI_MSG_RM_RING_ALLOCATE 0x1100 +#define TI_SCI_MSG_RM_RING_FREE 0x1101 +#define TI_SCI_MSG_RM_RING_RECONFIG 0x1102 +#define TI_SCI_MSG_RM_RING_RESET 0x1103 +#define TI_SCI_MSG_RM_RING_CFG 0x1110 +#define TI_SCI_MSG_RM_RING_GET_CFG 0x1111 + +/* PSI-L requests */ +#define TI_SCI_MSG_RM_PSIL_PAIR 0x1280 +#define TI_SCI_MSG_RM_PSIL_UNPAIR 0x1281 + +#define TI_SCI_MSG_RM_UDMAP_TX_ALLOC 0x1200 +#define TI_SCI_MSG_RM_UDMAP_TX_FREE 0x1201 +#define TI_SCI_MSG_RM_UDMAP_RX_ALLOC 0x1210 +#define TI_SCI_MSG_RM_UDMAP_RX_FREE 0x1211 +#define TI_SCI_MSG_RM_UDMAP_FLOW_CFG 0x1220 +#define TI_SCI_MSG_RM_UDMAP_OPT_FLOW_CFG 0x1221 + +#define TISCI_MSG_RM_UDMAP_TX_CH_CFG 0x1205 +#define TISCI_MSG_RM_UDMAP_TX_CH_GET_CFG 0x1206 +#define TISCI_MSG_RM_UDMAP_RX_CH_CFG 0x1215 +#define TISCI_MSG_RM_UDMAP_RX_CH_GET_CFG 0x1216 +#define TISCI_MSG_RM_UDMAP_FLOW_CFG 0x1230 +#define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_CFG 0x1231 +#define TISCI_MSG_RM_UDMAP_FLOW_GET_CFG 0x1232 +#define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_GET_CFG 0x1233 + /** * struct ti_sci_msg_hdr - Generic Message Header for All messages and responses * @type: Type of messages: One of TI_SCI_MSG* values @@ -563,4 +592,650 @@ struct ti_sci_msg_req_manage_irq { u8 secondary_host; } __packed; +/** + * struct ti_sci_msg_rm_ring_cfg_req - Configure a Navigator Subsystem ring + * + * Configures the non-real-time registers of a Navigator Subsystem ring. + * @hdr: Generic Header + * @valid_params: Bitfield defining validity of ring configuration parameters. + * The ring configuration fields are not valid, and will not be used for + * ring configuration, if their corresponding valid bit is zero. 
+ * Valid bit usage: + * 0 - Valid bit for @tisci_msg_rm_ring_cfg_req addr_lo + * 1 - Valid bit for @tisci_msg_rm_ring_cfg_req addr_hi + * 2 - Valid bit for @tisci_msg_rm_ring_cfg_req count + * 3 - Valid bit for @tisci_msg_rm_ring_cfg_req mode + * 4 - Valid bit for @tisci_msg_rm_ring_cfg_req size + * 5 - Valid bit for @tisci_msg_rm_ring_cfg_req order_id + * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated + * @index: ring index to be configured. + * @addr_lo: 32 LSBs of ring base address to be programmed into the ring's + * RING_BA_LO register + * @addr_hi: 16 MSBs of ring base address to be programmed into the ring's + * RING_BA_HI register. + * @count: Number of ring elements. Must be even in CREDENTIALS or QM + * modes. + * @mode: Specifies the mode in which the ring is to be configured. + * @size: Specifies encoded ring element size. To calculate the encoded size use + * the formula (log2(size_bytes) - 2), where size_bytes cannot be + * greater than 256. + * @order_id: Specifies the ring's bus order ID. + */ +struct ti_sci_msg_rm_ring_cfg_req { + struct ti_sci_msg_hdr hdr; + u32 valid_params; + u16 nav_id; + u16 index; + u32 addr_lo; + u32 addr_hi; + u32 count; + u8 mode; + u8 size; + u8 order_id; +} __packed; + +/** + * struct ti_sci_msg_rm_ring_get_cfg_req - Get RA ring's configuration + * + * Gets the configuration of the non-real-time register fields of a ring. The + * host, or a supervisor of the host, who owns the ring must be the requesting + * host. The values of the non-real-time registers are returned in + * @ti_sci_msg_rm_ring_get_cfg_resp. + * + * @hdr: Generic Header + * @nav_id: Device ID of Navigator Subsystem from which the ring is allocated + * @index: ring index. + */ +struct ti_sci_msg_rm_ring_get_cfg_req { + struct ti_sci_msg_hdr hdr; + u16 nav_id; + u16 index; +} __packed; + +/** + * struct ti_sci_msg_rm_ring_get_cfg_resp - Ring get configuration response + * + * Response received by host processor after RM has handled + * @ti_sci_msg_rm_ring_get_cfg_req. The response contains the ring's + * non-real-time register values. + * + * @hdr: Generic Header + * @addr_lo: Ring 32 LSBs of base address + * @addr_hi: Ring 16 MSBs of base address. + * @count: Ring number of elements. + * @mode: Ring mode. + * @size: Encoded ring element size + * @order_id: Ring order ID. + */ +struct ti_sci_msg_rm_ring_get_cfg_resp { + struct ti_sci_msg_hdr hdr; + u32 addr_lo; + u32 addr_hi; + u32 count; + u8 mode; + u8 size; + u8 order_id; +} __packed; + +/** + * struct ti_sci_msg_psil_pair - Pairs a PSI-L source thread to a destination + * thread + * @hdr: Generic Header + * @nav_id: SoC Navigator Subsystem device ID whose PSI-L config proxy is + * used to pair the source and destination threads. + * @src_thread: PSI-L source thread ID within the PSI-L System thread map. + * + * UDMAP transmit channels mapped to source threads will have their + * TCHAN_THRD_ID register programmed with the destination thread if the pairing + * is successful. + * + * @dst_thread: PSI-L destination thread ID within the PSI-L System thread map. + * PSI-L destination threads start at index 0x8000. The request is NACK'd if + * the destination thread is not greater than or equal to 0x8000. + * + * UDMAP receive channels mapped to destination threads will have their + * RCHAN_THRD_ID register programmed with the source thread if the pairing + * is successful. + * + * Request type is TI_SCI_MSG_RM_PSIL_PAIR, response is a generic ACK or NACK + * message. 
+ */ +struct ti_sci_msg_psil_pair { + struct ti_sci_msg_hdr hdr; + u32 nav_id; + u32 src_thread; + u32 dst_thread; +} __packed; + +/** + * struct ti_sci_msg_psil_unpair - Unpairs a PSI-L source thread from a + * destination thread + * @hdr: Generic Header + * @nav_id: SoC Navigator Subsystem device ID whose PSI-L config proxy is + * used to unpair the source and destination threads. + * @src_thread: PSI-L source thread ID within the PSI-L System thread map. + * + * UDMAP transmit channels mapped to source threads will have their + * TCHAN_THRD_ID register cleared if the unpairing is successful. + * + * @dst_thread: PSI-L destination thread ID within the PSI-L System thread map. + * PSI-L destination threads start at index 0x8000. The request is NACK'd if + * the destination thread is not greater than or equal to 0x8000. + * + * UDMAP receive channels mapped to destination threads will have their + * RCHAN_THRD_ID register cleared if the unpairing is successful. + * + * Request type is TI_SCI_MSG_RM_PSIL_UNPAIR, response is a generic ACK or NACK + * message. + */ +struct ti_sci_msg_psil_unpair { + struct ti_sci_msg_hdr hdr; + u32 nav_id; + u32 src_thread; + u32 dst_thread; +} __packed; + +/** + * struct ti_sci_msg_udmap_rx_flow_cfg - UDMAP receive flow configuration + * message + * @hdr: Generic Header + * @nav_id: SoC Navigator Subsystem device ID from which the receive flow is + * allocated + * @flow_index: UDMAP receive flow index for non-optional configuration. + * @rx_ch_index: Specifies the index of the receive channel using the flow_index + * @rx_einfo_present: UDMAP receive flow extended packet info present. + * @rx_psinfo_present: UDMAP receive flow PS words present. + * @rx_error_handling: UDMAP receive flow error handling configuration. Valid + * values are TI_SCI_RM_UDMAP_RX_FLOW_ERR_DROP/RETRY. + * @rx_desc_type: UDMAP receive flow descriptor type. It can be one of + * TI_SCI_RM_UDMAP_RX_FLOW_DESC_HOST/MONO. + * @rx_sop_offset: UDMAP receive flow start of packet offset. + * @rx_dest_qnum: UDMAP receive flow destination queue number. + * @rx_ps_location: UDMAP receive flow PS words location. + * 0 - end of packet descriptor + * 1 - Beginning of the data buffer + * @rx_src_tag_hi: UDMAP receive flow source tag high byte constant + * @rx_src_tag_lo: UDMAP receive flow source tag low byte constant + * @rx_dest_tag_hi: UDMAP receive flow destination tag high byte constant + * @rx_dest_tag_lo: UDMAP receive flow destination tag low byte constant + * @rx_src_tag_hi_sel: UDMAP receive flow source tag high byte selector + * @rx_src_tag_lo_sel: UDMAP receive flow source tag low byte selector + * @rx_dest_tag_hi_sel: UDMAP receive flow destination tag high byte selector + * @rx_dest_tag_lo_sel: UDMAP receive flow destination tag low byte selector + * @rx_size_thresh_en: UDMAP receive flow packet size based free buffer queue + * enable. If enabled, the ti_sci_rm_udmap_rx_flow_opt_cfg also needs to be + * configured and sent. + * @rx_fdq0_sz0_qnum: UDMAP receive flow free descriptor queue 0. + * @rx_fdq1_qnum: UDMAP receive flow free descriptor queue 1. + * @rx_fdq2_qnum: UDMAP receive flow free descriptor queue 2. + * @rx_fdq3_qnum: UDMAP receive flow free descriptor queue 3. + * + * For detailed information on the settings, see the UDMAP section of the TRM. 
+ */ +struct ti_sci_msg_udmap_rx_flow_cfg { + struct ti_sci_msg_hdr hdr; + u32 nav_id; + u32 flow_index; + u32 rx_ch_index; + u8 rx_einfo_present; + u8 rx_psinfo_present; + u8 rx_error_handling; + u8 rx_desc_type; + u16 rx_sop_offset; + u16 rx_dest_qnum; + u8 rx_ps_location; + u8 rx_src_tag_hi; + u8 rx_src_tag_lo; + u8 rx_dest_tag_hi; + u8 rx_dest_tag_lo; + u8 rx_src_tag_hi_sel; + u8 rx_src_tag_lo_sel; + u8 rx_dest_tag_hi_sel; + u8 rx_dest_tag_lo_sel; + u8 rx_size_thresh_en; + u16 rx_fdq0_sz0_qnum; + u16 rx_fdq1_qnum; + u16 rx_fdq2_qnum; + u16 rx_fdq3_qnum; +} __packed; + +/** + * struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg - parameters for UDMAP receive + * flow optional configuration + * @hdr: Generic Header + * @nav_id: SoC Navigator Subsystem device ID from which the receive flow is + * allocated + * @flow_index: UDMAP receive flow index for optional configuration. + * @rx_ch_index: Specifies the index of the receive channel using the flow_index + * @rx_size_thresh0: UDMAP receive flow packet size threshold 0. + * @rx_size_thresh1: UDMAP receive flow packet size threshold 1. + * @rx_size_thresh2: UDMAP receive flow packet size threshold 2. + * @rx_fdq0_sz1_qnum: UDMAP receive flow free descriptor queue for size + * threshold 1. + * @rx_fdq0_sz2_qnum: UDMAP receive flow free descriptor queue for size + * threshold 2. + * @rx_fdq0_sz3_qnum: UDMAP receive flow free descriptor queue for size + * threshold 3. + * + * For detailed information on the settings, see the UDMAP section of the TRM. + */ +struct rm_ti_sci_msg_udmap_rx_flow_opt_cfg { + struct ti_sci_msg_hdr hdr; + u32 nav_id; + u32 flow_index; + u32 rx_ch_index; + u16 rx_size_thresh0; + u16 rx_size_thresh1; + u16 rx_size_thresh2; + u16 rx_fdq0_sz1_qnum; + u16 rx_fdq0_sz2_qnum; + u16 rx_fdq0_sz3_qnum; +} __packed; + +/** + * struct ti_sci_msg_rm_udmap_tx_ch_cfg_req - Configure a Navigator Subsystem + * UDMAP transmit channel + * + * Configures the non-real-time registers of a Navigator Subsystem UDMAP + * transmit channel. The channel index must be assigned to the host defined + * in the TISCI header via the RM board configuration resource assignment + * range list. + * + * @hdr: Generic Header + * + * @valid_params: Bitfield defining validity of tx channel configuration + * parameters. The tx channel configuration fields are not valid, and will not + * be used for ch configuration, if their corresponding valid bit is zero. 
+ * Valid bit usage: + * 0 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_pause_on_err + * 1 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_atype + * 2 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_chan_type + * 3 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_fetch_size + * 4 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::txcq_qnum + * 5 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_priority + * 6 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_qos + * 7 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_orderid + * 8 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_sched_priority + * 9 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_filt_einfo + * 10 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_filt_pswords + * 11 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_supr_tdpkt + * 12 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_credit_count + * 13 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::fdepth + * 14 - Valid bit for @ref ti_sci_msg_rm_udmap_tx_ch_cfg::tx_burst_size + * + * @nav_id: SoC device ID of Navigator Subsystem where tx channel is located + * + * @index: UDMAP transmit channel index. + * + * @tx_pause_on_err: UDMAP transmit channel pause on error configuration to + * be programmed into the tx_pause_on_err field of the channel's TCHAN_TCFG + * register. + * + * @tx_filt_einfo: UDMAP transmit channel extended packet information passing + * configuration to be programmed into the tx_filt_einfo field of the + * channel's TCHAN_TCFG register. + * + * @tx_filt_pswords: UDMAP transmit channel protocol specific word passing + * configuration to be programmed into the tx_filt_pswords field of the + * channel's TCHAN_TCFG register. + * + * @tx_atype: UDMAP transmit channel non Ring Accelerator access pointer + * interpretation configuration to be programmed into the tx_atype field of + * the channel's TCHAN_TCFG register. + * + * @tx_chan_type: UDMAP transmit channel functional channel type and work + * passing mechanism configuration to be programmed into the tx_chan_type + * field of the channel's TCHAN_TCFG register. + * + * @tx_supr_tdpkt: UDMAP transmit channel teardown packet generation suppression + * configuration to be programmed into the tx_supr_tdpkt field of the channel's + * TCHAN_TCFG register. + * + * @tx_fetch_size: UDMAP transmit channel number of 32-bit descriptor words to + * fetch configuration to be programmed into the tx_fetch_size field of the + * channel's TCHAN_TCFG register. The user must make sure to set the maximum + * word count that can pass through the channel for any allowed descriptor type. + * + * @tx_credit_count: UDMAP transmit channel transfer request credit count + * configuration to be programmed into the count field of the TCHAN_TCREDIT + * register. Specifies how many credits for complete TRs are available. + * + * @txcq_qnum: UDMAP transmit channel completion queue configuration to be + * programmed into the txcq_qnum field of the TCHAN_TCQ register. The specified + * completion queue must be assigned to the host, or a subordinate of the host, + * requesting configuration of the transmit channel. + * + * @tx_priority: UDMAP transmit channel transmit priority value to be programmed + * into the priority field of the channel's TCHAN_TPRI_CTRL register. + * + * @tx_qos: UDMAP transmit channel transmit qos value to be programmed into the + * qos field of the channel's TCHAN_TPRI_CTRL register. 
+ * + * @tx_orderid: UDMAP transmit channel bus order id value to be programmed into + * the orderid field of the channel's TCHAN_TPRI_CTRL register. + * + * @fdepth: UDMAP transmit channel FIFO depth configuration to be programmed + * into the fdepth field of the TCHAN_TFIFO_DEPTH register. Sets the number of + * Tx FIFO bytes which are allowed to be stored for the channel. Check the UDMAP + * section of the TRM for restrictions regarding this parameter. + * + * @tx_sched_priority: UDMAP transmit channel tx scheduling priority + * configuration to be programmed into the priority field of the channel's + * TCHAN_TST_SCHED register. + * + * @tx_burst_size: UDMAP transmit channel burst size configuration to be + * programmed into the tx_burst_size field of the TCHAN_TCFG register. + */ +struct ti_sci_msg_rm_udmap_tx_ch_cfg_req { + struct ti_sci_msg_hdr hdr; + u32 valid_params; + u16 nav_id; + u16 index; + u8 tx_pause_on_err; + u8 tx_filt_einfo; + u8 tx_filt_pswords; + u8 tx_atype; + u8 tx_chan_type; + u8 tx_supr_tdpkt; + u16 tx_fetch_size; + u8 tx_credit_count; + u16 txcq_qnum; + u8 tx_priority; + u8 tx_qos; + u8 tx_orderid; + u16 fdepth; + u8 tx_sched_priority; + u8 tx_burst_size; +} __packed; + +/** + * struct ti_sci_msg_rm_udmap_rx_ch_cfg_req - Configure a Navigator Subsystem + * UDMAP receive channel + * + * Configures the non-real-time registers of a Navigator Subsystem UDMAP + * receive channel. The channel index must be assigned to the host defined + * in the TISCI header via the RM board configuration resource assignment + * range list. + * + * @hdr: Generic Header + * + * @valid_params: Bitfield defining validity of rx channel configuration + * parameters. + * The rx channel configuration fields are not valid, and will not be used for + * ch configuration, if their corresponding valid bit is zero. + * Valid bit usage: + * 0 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_pause_on_err + * 1 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_atype + * 2 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_chan_type + * 3 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_fetch_size + * 4 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rxcq_qnum + * 5 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_priority + * 6 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_qos + * 7 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_orderid + * 8 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_sched_priority + * 9 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::flowid_start + * 10 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::flowid_cnt + * 11 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_ignore_short + * 12 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_ignore_long + * 14 - Valid bit for @ti_sci_msg_rm_udmap_rx_ch_cfg_req::rx_burst_size + * + * @nav_id: SoC device ID of Navigator Subsystem where rx channel is located + * + * @index: UDMAP receive channel index. + * + * @rx_fetch_size: UDMAP receive channel number of 32-bit descriptor words to + * fetch configuration to be programmed into the rx_fetch_size field of the + * channel's RCHAN_RCFG register. + * + * @rxcq_qnum: UDMAP receive channel completion queue configuration to be + * programmed into the rxcq_qnum field of the RCHAN_RCQ register. + * The specified completion queue must be assigned to the host, or a subordinate + * of the host, requesting configuration of the receive channel. 
+ * + * @rx_priority: UDMAP receive channel receive priority value to be programmed + * into the priority field of the channel's RCHAN_RPRI_CTRL register. + * + * @rx_qos: UDMAP receive channel receive qos value to be programmed into the + * qos field of the channel's RCHAN_RPRI_CTRL register. + * + * @rx_orderid: UDMAP receive channel bus order id value to be programmed into + * the orderid field of the channel's RCHAN_RPRI_CTRL register. + * + * @rx_sched_priority: UDMAP receive channel rx scheduling priority + * configuration to be programmed into the priority field of the channel's + * RCHAN_RST_SCHED register. + * + * @flowid_start: UDMAP receive channel additional flows starting index + * configuration to program into the flow_start field of the RCHAN_RFLOW_RNG + * register. Specifies the starting index for flow IDs the receive channel is to + * make use of beyond the default flow. flowid_start and @ref flowid_cnt must be + * set as valid and configured together. The starting flow ID set by + * @ref flowid_start must be a flow index within the Navigator Subsystem's subset + * of flows beyond the default flows statically mapped to receive channels. + * The additional flows must be assigned to the host, or a subordinate of the + * host, requesting configuration of the receive channel. + * + * @flowid_cnt: UDMAP receive channel additional flows count configuration to + * program into the flowid_cnt field of the RCHAN_RFLOW_RNG register. + * This field specifies how many flow IDs are in the additional contiguous range + * of legal flow IDs for the channel. @ref flowid_start and flowid_cnt must be + * set as valid and configured together. Disabling the valid_params field bit + * for flowid_cnt indicates no flow IDs other than the default are to be + * allocated and used by the receive channel. @ref flowid_start plus flowid_cnt + * cannot be greater than the number of receive flows in the receive channel's + * Navigator Subsystem. The additional flows must be assigned to the host, or a + * subordinate of the host, requesting configuration of the receive channel. + * + * @rx_pause_on_err: UDMAP receive channel pause on error configuration to be + * programmed into the rx_pause_on_err field of the channel's RCHAN_RCFG + * register. + * + * @rx_atype: UDMAP receive channel non Ring Accelerator access pointer + * interpretation configuration to be programmed into the rx_atype field of the + * channel's RCHAN_RCFG register. + * + * @rx_chan_type: UDMAP receive channel functional channel type and work passing + * mechanism configuration to be programmed into the rx_chan_type field of the + * channel's RCHAN_RCFG register. + * + * @rx_ignore_short: UDMAP receive channel short packet treatment configuration + * to be programmed into the rx_ignore_short field of the RCHAN_RCFG register. + * + * @rx_ignore_long: UDMAP receive channel long packet treatment configuration to + * be programmed into the rx_ignore_long field of the RCHAN_RCFG register. + * + * @rx_burst_size: UDMAP receive channel burst size configuration to be + * programmed into the rx_burst_size field of the RCHAN_RCFG register. 
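As a concrete illustration of the flowid_start/flowid_cnt pairing described above, a hypothetical caller attaching four extra flows to a receive channel (bits 9 and 10 per the valid_params list; raw BIT() values again, since the excerpt defines no symbolic names for them):

static int example_rx_extra_flows(const struct ti_sci_handle *handle,
				  u16 nav_id, u16 index, u16 first_flow)
{
	struct ti_sci_msg_rm_udmap_rx_ch_cfg cfg = {
		/* bit 9: flowid_start, bit 10: flowid_cnt */
		.valid_params = BIT(9) | BIT(10),
		.nav_id = nav_id,
		.index = index,
		.flowid_start = first_flow,
		.flowid_cnt = 4,
	};

	return handle->ops.rm_udmap_ops.rx_ch_cfg(handle, &cfg);
}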
+ */ +struct ti_sci_msg_rm_udmap_rx_ch_cfg_req { + struct ti_sci_msg_hdr hdr; + u32 valid_params; + u16 nav_id; + u16 index; + u16 rx_fetch_size; + u16 rxcq_qnum; + u8 rx_priority; + u8 rx_qos; + u8 rx_orderid; + u8 rx_sched_priority; + u16 flowid_start; + u16 flowid_cnt; + u8 rx_pause_on_err; + u8 rx_atype; + u8 rx_chan_type; + u8 rx_ignore_short; + u8 rx_ignore_long; + u8 rx_burst_size; +} __packed; + +/** + * struct ti_sci_msg_rm_udmap_flow_cfg_req - Configure a Navigator Subsystem + * UDMAP receive flow + * + * Configures a Navigator Subsystem UDMAP receive flow's registers. + * Configuration does not include the flow registers which handle size-based + * free descriptor queue routing. + * + * The flow index must be assigned to the host defined in the TISCI header via + * the RM board configuration resource assignment range list. + * + * @hdr: Standard TISCI header + * + * @valid_params: + * Bitfield defining validity of rx flow configuration parameters. The + * rx flow configuration fields are not valid, and will not be used for flow + * configuration, if their corresponding valid bit is zero. Valid bit usage: + * 0 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_einfo_present + * 1 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_psinfo_present + * 2 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_error_handling + * 3 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_desc_type + * 4 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_sop_offset + * 5 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_qnum + * 6 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_hi + * 7 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_lo + * 8 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_hi + * 9 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_lo + * 10 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_hi_sel + * 11 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_src_tag_lo_sel + * 12 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_hi_sel + * 13 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_dest_tag_lo_sel + * 14 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq0_sz0_qnum + * 15 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq1_qnum + * 16 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq2_qnum + * 17 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_fdq3_qnum + * 18 - Valid bit for @tisci_msg_rm_udmap_flow_cfg_req::rx_ps_location + * + * @nav_id: SoC device ID of Navigator Subsystem from which the receive flow is + * allocated + * + * @flow_index: UDMAP receive flow index for non-optional configuration. + * + * @rx_einfo_present: + * UDMAP receive flow extended packet info present configuration to be + * programmed into the rx_einfo_present field of the flow's RFLOW_RFA register. + * + * @rx_psinfo_present: + * UDMAP receive flow PS words present configuration to be programmed into the + * rx_psinfo_present field of the flow's RFLOW_RFA register. + * + * @rx_error_handling: + * UDMAP receive flow error handling configuration to be programmed into the + * rx_error_handling field of the flow's RFLOW_RFA register. + * + * @rx_desc_type: + * UDMAP receive flow descriptor type configuration to be programmed into the + * rx_desc_type field of the flow's RFLOW_RFA register. + * + * @rx_sop_offset: + * UDMAP receive flow start of packet offset configuration to be programmed + * into the rx_sop_offset field of the RFLOW_RFA register. 
See the UDMAP + * section of the TRM for more information on this setting. Valid values for + * this field are 0-255 bytes. + * + * @rx_dest_qnum: + * UDMAP receive flow destination queue configuration to be programmed into the + * rx_dest_qnum field of the flow's RFLOW_RFA register. The specified + * destination queue must be valid within the Navigator Subsystem and must be + * owned by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. + * + * @rx_src_tag_hi: + * UDMAP receive flow source tag high byte constant configuration to be + * programmed into the rx_src_tag_hi field of the flow's RFLOW_RFB register. + * See the UDMAP section of the TRM for more information on this setting. + * + * @rx_src_tag_lo: + * UDMAP receive flow source tag low byte constant configuration to be + * programmed into the rx_src_tag_lo field of the flow's RFLOW_RFB register. + * See the UDMAP section of the TRM for more information on this setting. + * + * @rx_dest_tag_hi: + * UDMAP receive flow destination tag high byte constant configuration to be + * programmed into the rx_dest_tag_hi field of the flow's RFLOW_RFB register. + * See the UDMAP section of the TRM for more information on this setting. + * + * @rx_dest_tag_lo: + * UDMAP receive flow destination tag low byte constant configuration to be + * programmed into the rx_dest_tag_lo field of the flow's RFLOW_RFB register. + * See the UDMAP section of the TRM for more information on this setting. + * + * @rx_src_tag_hi_sel: + * UDMAP receive flow source tag high byte selector configuration to be + * programmed into the rx_src_tag_hi_sel field of the RFLOW_RFC register. See + * the UDMAP section of the TRM for more information on this setting. + * + * @rx_src_tag_lo_sel: + * UDMAP receive flow source tag low byte selector configuration to be + * programmed into the rx_src_tag_lo_sel field of the RFLOW_RFC register. See + * the UDMAP section of the TRM for more information on this setting. + * + * @rx_dest_tag_hi_sel: + * UDMAP receive flow destination tag high byte selector configuration to be + * programmed into the rx_dest_tag_hi_sel field of the RFLOW_RFC register. See + * the UDMAP section of the TRM for more information on this setting. + * + * @rx_dest_tag_lo_sel: + * UDMAP receive flow destination tag low byte selector configuration to be + * programmed into the rx_dest_tag_lo_sel field of the RFLOW_RFC register. See + * the UDMAP section of the TRM for more information on this setting. + * + * @rx_fdq0_sz0_qnum: + * UDMAP receive flow free descriptor queue 0 configuration to be programmed + * into the rx_fdq0_sz0_qnum field of the flow's RFLOW_RFD register. See the + * UDMAP section of the TRM for more information on this setting. The specified + * free queue must be valid within the Navigator Subsystem and must be owned + * by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. + * + * @rx_fdq1_qnum: + * UDMAP receive flow free descriptor queue 1 configuration to be programmed + * into the rx_fdq1_qnum field of the flow's RFLOW_RFD register. See the + * UDMAP section of the TRM for more information on this setting. The specified + * free queue must be valid within the Navigator Subsystem and must be owned + * by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. 
+ * + * @rx_fdq2_qnum: + * UDMAP receive flow free descriptor queue 2 configuration to be programmed + * into the rx_fdq2_qnum field of the flow's RFLOW_RFE register. See the + * UDMAP section of the TRM for more information on this setting. The specified + * free queue must be valid within the Navigator Subsystem and must be owned + * by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. + * + * @rx_fdq3_qnum: + * UDMAP receive flow free descriptor queue 3 configuration to be programmed + * into the rx_fdq3_qnum field of the flow's RFLOW_RFE register. See the + * UDMAP section of the TRM for more information on this setting. The specified + * free queue must be valid within the Navigator Subsystem and must be owned + * by the host, or a subordinate of the host, requesting allocation and + * configuration of the receive flow. + * + * @rx_ps_location: + * UDMAP receive flow PS words location configuration to be programmed into the + * rx_ps_location field of the flow's RFLOW_RFA register. + */ +struct ti_sci_msg_rm_udmap_flow_cfg_req { + struct ti_sci_msg_hdr hdr; + u32 valid_params; + u16 nav_id; + u16 flow_index; + u8 rx_einfo_present; + u8 rx_psinfo_present; + u8 rx_error_handling; + u8 rx_desc_type; + u16 rx_sop_offset; + u16 rx_dest_qnum; + u8 rx_src_tag_hi; + u8 rx_src_tag_lo; + u8 rx_dest_tag_hi; + u8 rx_dest_tag_lo; + u8 rx_src_tag_hi_sel; + u8 rx_src_tag_lo_sel; + u8 rx_dest_tag_hi_sel; + u8 rx_dest_tag_lo_sel; + u16 rx_fdq0_sz0_qnum; + u16 rx_fdq1_qnum; + u16 rx_fdq2_qnum; + u16 rx_fdq3_qnum; + u8 rx_ps_location; +} __packed; + #endif /* __TI_SCI_H */ diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h index 568722a041bf..4fd9bff5806b 100644 --- a/include/linux/soc/ti/ti_sci_protocol.h +++ b/include/linux/soc/ti/ti_sci_protocol.h @@ -241,6 +241,218 @@ struct ti_sci_rm_irq_ops { u16 global_event, u8 vint_status_bit); }; +/* RA config.addr_lo parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID BIT(0) +/* RA config.addr_hi parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID BIT(1) + /* RA config.count parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID BIT(2) +/* RA config.mode parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_MODE_VALID BIT(3) +/* RA config.size parameter is valid for RM ring configure TI_SCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID BIT(4) +/* RA config.order_id parameter is valid for RM ring configure TISCI message */ +#define TI_SCI_MSG_VALUE_RM_RING_ORDER_ID_VALID BIT(5) + +#define TI_SCI_MSG_VALUE_RM_ALL_NO_ORDER \ + (TI_SCI_MSG_VALUE_RM_RING_ADDR_LO_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_ADDR_HI_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_COUNT_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_MODE_VALID | \ + TI_SCI_MSG_VALUE_RM_RING_SIZE_VALID) + +/** + * struct ti_sci_rm_ringacc_ops - Ring Accelerator Management operations + * @config: configure the SoC Navigator Subsystem Ring Accelerator ring + * @get_config: get the SoC Navigator Subsystem Ring Accelerator ring + * configuration + */ +struct ti_sci_rm_ringacc_ops { + int (*config)(const struct ti_sci_handle *handle, + u32 valid_params, u16 nav_id, u16 index, + u32 addr_lo, u32 addr_hi, u32 count, u8 mode, + u8 size, u8 order_id + ); + int (*get_config)(const struct ti_sci_handle *handle, + u32 nav_id, u32 index, u8 
*mode, + u32 *addr_lo, u32 *addr_hi, u32 *count, + u8 *size, u8 *order_id); +}; + +/** + * struct ti_sci_rm_psil_ops - PSI-L thread operations + * @pair: pair PSI-L source thread to a destination thread. + * If the src_thread is mapped to UDMA tchan, the corresponding channel's + * TCHAN_THRD_ID register is updated. + * If the dst_thread is mapped to UDMA rchan, the corresponding channel's + * RCHAN_THRD_ID register is updated. + * @unpair: unpair PSI-L source thread from a destination thread. + * If the src_thread is mapped to UDMA tchan, the corresponding channel's + * TCHAN_THRD_ID register is cleared. + * If the dst_thread is mapped to UDMA rchan, the corresponding channel's + * RCHAN_THRD_ID register is cleared. + */ +struct ti_sci_rm_psil_ops { + int (*pair)(const struct ti_sci_handle *handle, u32 nav_id, + u32 src_thread, u32 dst_thread); + int (*unpair)(const struct ti_sci_handle *handle, u32 nav_id, + u32 src_thread, u32 dst_thread); +}; + +/* UDMAP channel types */ +#define TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR 2 +#define TI_SCI_RM_UDMAP_CHAN_TYPE_PKT_PBRR_SB 3 /* RX only */ +#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_PBRR 10 +#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_PBVR 11 +#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBRR 12 +#define TI_SCI_RM_UDMAP_CHAN_TYPE_3RDP_BCOPY_PBVR 13 + +#define TI_SCI_RM_UDMAP_RX_FLOW_DESC_HOST 0 +#define TI_SCI_RM_UDMAP_RX_FLOW_DESC_MONO 2 + +#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_64_BYTES 1 +#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_128_BYTES 2 +#define TI_SCI_RM_UDMAP_CHAN_BURST_SIZE_256_BYTES 3 + +/* UDMAP TX/RX channel valid_params common declarations */ +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PAUSE_ON_ERR_VALID BIT(0) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ATYPE_VALID BIT(1) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_CHAN_TYPE_VALID BIT(2) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_FETCH_SIZE_VALID BIT(3) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_CQ_QNUM_VALID BIT(4) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_PRIORITY_VALID BIT(5) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_QOS_VALID BIT(6) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_ORDER_ID_VALID BIT(7) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_SCHED_PRIORITY_VALID BIT(8) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_BURST_SIZE_VALID BIT(14) + +/** + * Configures a Navigator Subsystem UDMAP transmit channel + * + * Configures a Navigator Subsystem UDMAP transmit channel registers. + * See @ti_sci_msg_rm_udmap_tx_ch_cfg_req + */ +struct ti_sci_msg_rm_udmap_tx_ch_cfg { + u32 valid_params; +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_EINFO_VALID BIT(9) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FILT_PSWORDS_VALID BIT(10) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_SUPR_TDPKT_VALID BIT(11) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_CREDIT_COUNT_VALID BIT(12) +#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_TX_FDEPTH_VALID BIT(13) + u16 nav_id; + u16 index; + u8 tx_pause_on_err; + u8 tx_filt_einfo; + u8 tx_filt_pswords; + u8 tx_atype; + u8 tx_chan_type; + u8 tx_supr_tdpkt; + u16 tx_fetch_size; + u8 tx_credit_count; + u16 txcq_qnum; + u8 tx_priority; + u8 tx_qos; + u8 tx_orderid; + u16 fdepth; + u8 tx_sched_priority; + u8 tx_burst_size; +}; + +/** + * Configures a Navigator Subsystem UDMAP receive channel + * + * Configures a Navigator Subsystem UDMAP receive channel registers. 
+ * See @ti_sci_msg_rm_udmap_rx_ch_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_rx_ch_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_START_VALID	BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_FLOWID_CNT_VALID	BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_IGNORE_SHORT_VALID	BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_CH_RX_IGNORE_LONG_VALID	BIT(12)
+	u16 nav_id;
+	u16 index;
+	u16 rx_fetch_size;
+	u16 rxcq_qnum;
+	u8 rx_priority;
+	u8 rx_qos;
+	u8 rx_orderid;
+	u8 rx_sched_priority;
+	u16 flowid_start;
+	u16 flowid_cnt;
+	u8 rx_pause_on_err;
+	u8 rx_atype;
+	u8 rx_chan_type;
+	u8 rx_ignore_short;
+	u8 rx_ignore_long;
+	u8 rx_burst_size;
+};
+
+/**
+ * Configures a Navigator Subsystem UDMAP receive flow
+ *
+ * Configures a Navigator Subsystem UDMAP receive flow's registers.
+ * See @ti_sci_msg_rm_udmap_flow_cfg_req
+ */
+struct ti_sci_msg_rm_udmap_flow_cfg {
+	u32 valid_params;
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_EINFO_PRESENT_VALID	BIT(0)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PSINFO_PRESENT_VALID	BIT(1)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_ERROR_HANDLING_VALID	BIT(2)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DESC_TYPE_VALID		BIT(3)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SOP_OFFSET_VALID		BIT(4)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_QNUM_VALID		BIT(5)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_HI_VALID		BIT(6)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_LO_VALID		BIT(7)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_HI_VALID	BIT(8)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_LO_VALID	BIT(9)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_HI_SEL_VALID	BIT(10)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_SRC_TAG_LO_SEL_VALID	BIT(11)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_HI_SEL_VALID	BIT(12)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_DEST_TAG_LO_SEL_VALID	BIT(13)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ0_SZ0_QNUM_VALID	BIT(14)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ1_QNUM_VALID		BIT(15)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ2_QNUM_VALID		BIT(16)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_FDQ3_QNUM_VALID		BIT(17)
+#define TI_SCI_MSG_VALUE_RM_UDMAP_FLOW_PS_LOCATION_VALID	BIT(18)
+	u16 nav_id;
+	u16 flow_index;
+	u8 rx_einfo_present;
+	u8 rx_psinfo_present;
+	u8 rx_error_handling;
+	u8 rx_desc_type;
+	u16 rx_sop_offset;
+	u16 rx_dest_qnum;
+	u8 rx_src_tag_hi;
+	u8 rx_src_tag_lo;
+	u8 rx_dest_tag_hi;
+	u8 rx_dest_tag_lo;
+	u8 rx_src_tag_hi_sel;
+	u8 rx_src_tag_lo_sel;
+	u8 rx_dest_tag_hi_sel;
+	u8 rx_dest_tag_lo_sel;
+	u16 rx_fdq0_sz0_qnum;
+	u16 rx_fdq1_qnum;
+	u16 rx_fdq2_qnum;
+	u16 rx_fdq3_qnum;
+	u8 rx_ps_location;
+};
+
+/**
+ * struct ti_sci_rm_udmap_ops - UDMA Management operations
+ * @tx_ch_cfg: configure SoC Navigator Subsystem UDMA transmit channel.
+ * @rx_ch_cfg: configure SoC Navigator Subsystem UDMA receive channel.
+ * @rx_flow_cfg: configure SoC Navigator Subsystem UDMA receive flow.
+ */ +struct ti_sci_rm_udmap_ops { + int (*tx_ch_cfg)(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_tx_ch_cfg *params); + int (*rx_ch_cfg)(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_rx_ch_cfg *params); + int (*rx_flow_cfg)(const struct ti_sci_handle *handle, + const struct ti_sci_msg_rm_udmap_flow_cfg *params); +}; + /** * struct ti_sci_ops - Function support for TI SCI * @dev_ops: Device specific operations @@ -254,6 +466,9 @@ struct ti_sci_ops { struct ti_sci_clk_ops clk_ops; struct ti_sci_rm_core_ops rm_core_ops; struct ti_sci_rm_irq_ops rm_irq_ops; + struct ti_sci_rm_ringacc_ops rm_ring_ops; + struct ti_sci_rm_psil_ops rm_psil_ops; + struct ti_sci_rm_udmap_ops rm_udmap_ops; }; /** -- cgit v1.2.3-59-g8ed1b From 1e407f337f4015c8ffc56e7cfd70e06b2e9fc9da Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Wed, 5 Jun 2019 17:33:34 -0500 Subject: firmware: ti_sci: Add support for processor control Texas Instrument's System Control Interface (TI-SCI) Message Protocol is used in Texas Instrument's System on Chip (SoC) such as those in K3 family AM654 SoC to communicate between various compute processors with a central system controller entity. The system controller provides various services including the control of other compute processors within the SoC. Extend the TI-SCI protocol support to add various TI-SCI commands to invoke services associated with power and reset control, and boot vector management of the various compute processors from the Linux kernel. Signed-off-by: Suman Anna Signed-off-by: Tero Kristo Signed-off-by: Santosh Shilimkar --- drivers/firmware/ti_sci.c | 350 +++++++++++++++++++++++++++++++++ drivers/firmware/ti_sci.h | 135 +++++++++++++ include/linux/soc/ti/ti_sci_protocol.h | 31 +++ 3 files changed, 516 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/ti_sci.c b/drivers/firmware/ti_sci.c index 02fa196428d8..b47e33e7411f 100644 --- a/drivers/firmware/ti_sci.c +++ b/drivers/firmware/ti_sci.c @@ -2479,6 +2479,348 @@ fail: return ret; } +/** + * ti_sci_cmd_proc_request() - Command to request a physical processor control + * @handle: Pointer to TI SCI handle + * @proc_id: Processor ID this request is for + * + * Return: 0 if all went well, else returns appropriate error value. + */ +static int ti_sci_cmd_proc_request(const struct ti_sci_handle *handle, + u8 proc_id) +{ + struct ti_sci_msg_req_proc_request *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_info *info; + struct ti_sci_xfer *xfer; + struct device *dev; + int ret = 0; + + if (!handle) + return -EINVAL; + if (IS_ERR(handle)) + return PTR_ERR(handle); + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_REQUEST, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_req_proc_request *)xfer->xfer_buf; + req->processor_id = proc_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf; + + ret = ti_sci_is_response_ack(resp) ? 
0 : -ENODEV; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_proc_release() - Command to release a physical processor control + * @handle: Pointer to TI SCI handle + * @proc_id: Processor ID this request is for + * + * Return: 0 if all went well, else returns appropriate error value. + */ +static int ti_sci_cmd_proc_release(const struct ti_sci_handle *handle, + u8 proc_id) +{ + struct ti_sci_msg_req_proc_release *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_info *info; + struct ti_sci_xfer *xfer; + struct device *dev; + int ret = 0; + + if (!handle) + return -EINVAL; + if (IS_ERR(handle)) + return PTR_ERR(handle); + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_RELEASE, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_req_proc_release *)xfer->xfer_buf; + req->processor_id = proc_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf; + + ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_proc_handover() - Command to handover a physical processor + * control to a host in the processor's access + * control list. + * @handle: Pointer to TI SCI handle + * @proc_id: Processor ID this request is for + * @host_id: Host ID to get the control of the processor + * + * Return: 0 if all went well, else returns appropriate error value. + */ +static int ti_sci_cmd_proc_handover(const struct ti_sci_handle *handle, + u8 proc_id, u8 host_id) +{ + struct ti_sci_msg_req_proc_handover *req; + struct ti_sci_msg_hdr *resp; + struct ti_sci_info *info; + struct ti_sci_xfer *xfer; + struct device *dev; + int ret = 0; + + if (!handle) + return -EINVAL; + if (IS_ERR(handle)) + return PTR_ERR(handle); + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_PROC_HANDOVER, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_req_proc_handover *)xfer->xfer_buf; + req->processor_id = proc_id; + req->host_id = host_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf; + + ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV; + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + +/** + * ti_sci_cmd_proc_set_config() - Command to set the processor boot + * configuration flags + * @handle: Pointer to TI SCI handle + * @proc_id: Processor ID this request is for + * @config_flags_set: Configuration flags to be set + * @config_flags_clear: Configuration flags to be cleared. + * + * Return: 0 if all went well, else returns appropriate error value. 
+ */
+static int ti_sci_cmd_proc_set_config(const struct ti_sci_handle *handle,
+				      u8 proc_id, u64 bootvector,
+				      u32 config_flags_set,
+				      u32 config_flags_clear)
+{
+	struct ti_sci_msg_req_set_config *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_CONFIG,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_config *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->bootvector_low = bootvector & TI_SCI_ADDR_LOW_MASK;
+	req->bootvector_high = (bootvector & TI_SCI_ADDR_HIGH_MASK) >>
+				TI_SCI_ADDR_HIGH_SHIFT;
+	req->config_flags_set = config_flags_set;
+	req->config_flags_clear = config_flags_clear;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_set_control() - Command to set the processor boot
+ *				   control flags
+ * @handle:		Pointer to TI SCI handle
+ * @proc_id:		Processor ID this request is for
+ * @control_flags_set:	Control flags to be set
+ * @control_flags_clear: Control flags to be cleared
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */
+static int ti_sci_cmd_proc_set_control(const struct ti_sci_handle *handle,
+				       u8 proc_id, u32 control_flags_set,
+				       u32 control_flags_clear)
+{
+	struct ti_sci_msg_req_set_ctrl *req;
+	struct ti_sci_msg_hdr *resp;
+	struct ti_sci_info *info;
+	struct ti_sci_xfer *xfer;
+	struct device *dev;
+	int ret = 0;
+
+	if (!handle)
+		return -EINVAL;
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	info = handle_to_ti_sci_info(handle);
+	dev = info->dev;
+
+	xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_SET_CTRL,
+				   TI_SCI_FLAG_REQ_ACK_ON_PROCESSED,
+				   sizeof(*req), sizeof(*resp));
+	if (IS_ERR(xfer)) {
+		ret = PTR_ERR(xfer);
+		dev_err(dev, "Message alloc failed(%d)\n", ret);
+		return ret;
+	}
+	req = (struct ti_sci_msg_req_set_ctrl *)xfer->xfer_buf;
+	req->processor_id = proc_id;
+	req->control_flags_set = control_flags_set;
+	req->control_flags_clear = control_flags_clear;
+
+	ret = ti_sci_do_xfer(info, xfer);
+	if (ret) {
+		dev_err(dev, "Mbox send fail %d\n", ret);
+		goto fail;
+	}
+
+	resp = (struct ti_sci_msg_hdr *)xfer->tx_message.buf;
+
+	ret = ti_sci_is_response_ack(resp) ? 0 : -ENODEV;
+
+fail:
+	ti_sci_put_one_xfer(&info->minfo, xfer);
+
+	return ret;
+}
+
+/**
+ * ti_sci_cmd_proc_get_status() - Command to get the processor boot status
+ * @handle:	Pointer to TI SCI handle
+ * @proc_id:	Processor ID this request is for
+ * @bv:		Pointer in which the processor boot vector is returned
+ * @cfg_flags:	Pointer in which the current config flags are returned
+ * @ctrl_flags:	Pointer in which the current control flags are returned
+ * @sts_flags:	Pointer in which the current status flags are returned
+ *
+ * Return: 0 if all went well, else returns appropriate error value.
+ */ +static int ti_sci_cmd_proc_get_status(const struct ti_sci_handle *handle, + u8 proc_id, u64 *bv, u32 *cfg_flags, + u32 *ctrl_flags, u32 *sts_flags) +{ + struct ti_sci_msg_resp_get_status *resp; + struct ti_sci_msg_req_get_status *req; + struct ti_sci_info *info; + struct ti_sci_xfer *xfer; + struct device *dev; + int ret = 0; + + if (!handle) + return -EINVAL; + if (IS_ERR(handle)) + return PTR_ERR(handle); + + info = handle_to_ti_sci_info(handle); + dev = info->dev; + + xfer = ti_sci_get_one_xfer(info, TI_SCI_MSG_GET_STATUS, + TI_SCI_FLAG_REQ_ACK_ON_PROCESSED, + sizeof(*req), sizeof(*resp)); + if (IS_ERR(xfer)) { + ret = PTR_ERR(xfer); + dev_err(dev, "Message alloc failed(%d)\n", ret); + return ret; + } + req = (struct ti_sci_msg_req_get_status *)xfer->xfer_buf; + req->processor_id = proc_id; + + ret = ti_sci_do_xfer(info, xfer); + if (ret) { + dev_err(dev, "Mbox send fail %d\n", ret); + goto fail; + } + + resp = (struct ti_sci_msg_resp_get_status *)xfer->tx_message.buf; + + if (!ti_sci_is_response_ack(resp)) { + ret = -ENODEV; + } else { + *bv = (resp->bootvector_low & TI_SCI_ADDR_LOW_MASK) | + (((u64)resp->bootvector_high << TI_SCI_ADDR_HIGH_SHIFT) & + TI_SCI_ADDR_HIGH_MASK); + *cfg_flags = resp->config_flags; + *ctrl_flags = resp->control_flags; + *sts_flags = resp->status_flags; + } + +fail: + ti_sci_put_one_xfer(&info->minfo, xfer); + + return ret; +} + /* * ti_sci_setup_ops() - Setup the operations structures * @info: pointer to TISCI pointer @@ -2494,6 +2836,7 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) struct ti_sci_rm_ringacc_ops *rops = &ops->rm_ring_ops; struct ti_sci_rm_psil_ops *psilops = &ops->rm_psil_ops; struct ti_sci_rm_udmap_ops *udmap_ops = &ops->rm_udmap_ops; + struct ti_sci_proc_ops *pops = &ops->proc_ops; core_ops->reboot_device = ti_sci_cmd_core_reboot; @@ -2543,6 +2886,13 @@ static void ti_sci_setup_ops(struct ti_sci_info *info) udmap_ops->tx_ch_cfg = ti_sci_cmd_rm_udmap_tx_ch_cfg; udmap_ops->rx_ch_cfg = ti_sci_cmd_rm_udmap_rx_ch_cfg; udmap_ops->rx_flow_cfg = ti_sci_cmd_rm_udmap_rx_flow_cfg; + + pops->request = ti_sci_cmd_proc_request; + pops->release = ti_sci_cmd_proc_release; + pops->handover = ti_sci_cmd_proc_handover; + pops->set_config = ti_sci_cmd_proc_set_config; + pops->set_control = ti_sci_cmd_proc_set_control; + pops->get_status = ti_sci_cmd_proc_get_status; } /** diff --git a/drivers/firmware/ti_sci.h b/drivers/firmware/ti_sci.h index 2bb81ec7793c..662dcffef311 100644 --- a/drivers/firmware/ti_sci.h +++ b/drivers/firmware/ti_sci.h @@ -71,6 +71,14 @@ #define TISCI_MSG_RM_UDMAP_FLOW_GET_CFG 0x1232 #define TISCI_MSG_RM_UDMAP_FLOW_SIZE_THRESH_GET_CFG 0x1233 +/* Processor Control requests */ +#define TI_SCI_MSG_PROC_REQUEST 0xc000 +#define TI_SCI_MSG_PROC_RELEASE 0xc001 +#define TI_SCI_MSG_PROC_HANDOVER 0xc005 +#define TI_SCI_MSG_SET_CONFIG 0xc100 +#define TI_SCI_MSG_SET_CTRL 0xc101 +#define TI_SCI_MSG_GET_STATUS 0xc400 + /** * struct ti_sci_msg_hdr - Generic Message Header for All messages and responses * @type: Type of messages: One of TI_SCI_MSG* values @@ -1238,4 +1246,131 @@ struct ti_sci_msg_rm_udmap_flow_cfg_req { u8 rx_ps_location; } __packed; +/** + * struct ti_sci_msg_req_proc_request - Request a processor + * @hdr: Generic Header + * @processor_id: ID of processor being requested + * + * Request type is TI_SCI_MSG_PROC_REQUEST, response is a generic ACK/NACK + * message. 
+ */
+struct ti_sci_msg_req_proc_request {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_proc_release - Release a processor
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being released
+ *
+ * Request type is TI_SCI_MSG_PROC_RELEASE, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_release {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_proc_handover - Handover a processor to a host
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being handed over
+ * @host_id:		Host ID the control needs to be transferred to
+ *
+ * Request type is TI_SCI_MSG_PROC_HANDOVER, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_proc_handover {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u8 host_id;
+} __packed;
+
+/* Boot Vector masks */
+#define TI_SCI_ADDR_LOW_MASK		GENMASK_ULL(31, 0)
+#define TI_SCI_ADDR_HIGH_MASK		GENMASK_ULL(63, 32)
+#define TI_SCI_ADDR_HIGH_SHIFT		32
+
+/**
+ * struct ti_sci_msg_req_set_config - Set Processor boot configuration
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being configured
+ * @bootvector_low:	Lower 32 bit address (Little Endian) of boot vector
+ * @bootvector_high:	Higher 32 bit address (Little Endian) of boot vector
+ * @config_flags_set:	Optional Processor specific Config Flags to set.
+ *			Setting a bit here implies the corresponding mode
+ *			will be set
+ * @config_flags_clear:	Optional Processor specific Config Flags to clear.
+ *			Setting a bit here implies the corresponding mode
+ *			will be cleared
+ *
+ * Request type is TI_SCI_MSG_SET_CONFIG, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_set_config {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 bootvector_low;
+	u32 bootvector_high;
+	u32 config_flags_set;
+	u32 config_flags_clear;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_set_ctrl - Set Processor boot control flags
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor being configured
+ * @control_flags_set:	Optional Processor specific Control Flags to set.
+ *			Setting a bit here implies the corresponding mode
+ *			will be set
+ * @control_flags_clear: Optional Processor specific Control Flags to clear.
+ *			Setting a bit here implies the corresponding mode
+ *			will be cleared
+ *
+ * Request type is TI_SCI_MSG_SET_CTRL, response is a generic ACK/NACK
+ * message.
+ */
+struct ti_sci_msg_req_set_ctrl {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 control_flags_set;
+	u32 control_flags_clear;
+} __packed;
+
+/**
+ * struct ti_sci_msg_req_get_status - Processor boot status request
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor whose status is being requested
+ *
+ * Request type is TI_SCI_MSG_GET_STATUS, response is an appropriate
+ * message, or NACK in case of inability to satisfy request.
+ */
+struct ti_sci_msg_req_get_status {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+} __packed;
+
+/**
+ * struct ti_sci_msg_resp_get_status - Processor boot status response
+ * @hdr:		Generic Header
+ * @processor_id:	ID of processor whose status is returned
+ * @bootvector_low:	Lower 32 bit address (Little Endian) of boot vector
+ * @bootvector_high:	Higher 32 bit address (Little Endian) of boot vector
+ * @config_flags:	Optional Processor specific Config Flags set currently
+ * @control_flags:	Optional Processor specific Control Flags set currently
+ * @status_flags:	Optional Processor specific Status Flags set currently
+ *
+ * Response structure to a TI_SCI_MSG_GET_STATUS request.
+ */
+struct ti_sci_msg_resp_get_status {
+	struct ti_sci_msg_hdr hdr;
+	u8 processor_id;
+	u32 bootvector_low;
+	u32 bootvector_high;
+	u32 config_flags;
+	u32 control_flags;
+	u32 status_flags;
+} __packed;
+
 #endif /* __TI_SCI_H */
diff --git a/include/linux/soc/ti/ti_sci_protocol.h b/include/linux/soc/ti/ti_sci_protocol.h
index 4fd9bff5806b..7b3762f68df9 100644
--- a/include/linux/soc/ti/ti_sci_protocol.h
+++ b/include/linux/soc/ti/ti_sci_protocol.h
@@ -453,12 +453,42 @@ struct ti_sci_rm_udmap_ops {
 		      const struct ti_sci_msg_rm_udmap_flow_cfg *params);
 };
 
+/**
+ * struct ti_sci_proc_ops - Processor Control operations
+ * @request:	Request control of a physical processor. The requesting host
+ *		should be in the processor access list
+ * @release:	Relinquish control of a physical processor
+ * @handover:	Hand over control of a physical processor to another host
+ *		in the permitted list
+ * @set_config:	Set base configuration of a processor
+ * @set_control: Setup limited control flags in specific cases
+ * @get_status:	Get the state of a physical processor
+ *
+ * NOTE: The following parameters are generic in nature for all these ops:
+ * -handle: Pointer to TI SCI handle as retrieved by ti_sci_get_handle()
+ * -pid: Processor ID
+ * -hid: Host ID
+ */
+struct ti_sci_proc_ops {
+	int (*request)(const struct ti_sci_handle *handle, u8 pid);
+	int (*release)(const struct ti_sci_handle *handle, u8 pid);
+	int (*handover)(const struct ti_sci_handle *handle, u8 pid, u8 hid);
+	int (*set_config)(const struct ti_sci_handle *handle, u8 pid,
+			  u64 boot_vector, u32 cfg_set, u32 cfg_clr);
+	int (*set_control)(const struct ti_sci_handle *handle, u8 pid,
+			   u32 ctrl_set, u32 ctrl_clr);
+	int (*get_status)(const struct ti_sci_handle *handle, u8 pid,
+			  u64 *boot_vector, u32 *cfg_flags, u32 *ctrl_flags,
+			  u32 *status_flags);
+};
+
 /**
  * struct ti_sci_ops - Function support for TI SCI
  * @dev_ops: Device specific operations
  * @clk_ops: Clock specific operations
  * @rm_core_ops: Resource management core operations.
  * @rm_irq_ops: IRQ management specific operations
+ * @proc_ops: Processor Control specific operations
  */
 struct ti_sci_ops {
 	struct ti_sci_core_ops core_ops;
@@ -469,6 +499,7 @@ struct ti_sci_ops {
 	struct ti_sci_rm_ringacc_ops rm_ring_ops;
 	struct ti_sci_rm_psil_ops rm_psil_ops;
 	struct ti_sci_rm_udmap_ops rm_udmap_ops;
+	struct ti_sci_proc_ops proc_ops;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 18c8c0954d15105b02f7d2f556b9eafae426871f Mon Sep 17 00:00:00 2001
From: Wesley Sheng 
Date: Tue, 30 Apr 2019 18:04:29 +0800
Subject: NTB: correct ntb_dev_ops and ntb_dev comment typos

The comments for ntb_dev_ops and ntb_dev incorrectly referred to
ntb_ctx_ops and ntb_device.
Signed-off-by: Wesley Sheng Reviewed-by: Logan Gunthorpe Signed-off-by: Jon Mason --- include/linux/ntb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 56a92e3ae3ae..604abc883741 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -205,7 +205,7 @@ static inline int ntb_ctx_ops_is_valid(const struct ntb_ctx_ops *ops) } /** - * struct ntb_ctx_ops - ntb device operations + * struct ntb_dev_ops - ntb device operations * @port_number: See ntb_port_number(). * @peer_port_count: See ntb_peer_port_count(). * @peer_port_number: See ntb_peer_port_number(). @@ -404,7 +404,7 @@ struct ntb_client { #define drv_ntb_client(__drv) container_of((__drv), struct ntb_client, drv) /** - * struct ntb_device - ntb device + * struct ntb_dev - ntb device * @dev: Linux device object. * @pdev: PCI device entry of the ntb. * @topo: Detected topology of the ntb. -- cgit v1.2.3-59-g8ed1b From d7cc609fb679e11dc2b00cbe6c50cbd37ec4bfa2 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:30:51 -0600 Subject: PCI/MSI: Support allocating virtual MSI interrupts For NTB devices, we want to be able to trigger MSI interrupts through a memory window. In these cases we may want to use more interrupts than the NTB PCI device has available in its MSI-X table. We allow for this by creating a new 'virtual' interrupt. These interrupts are allocated as usual but are not programmed into the MSI-X table (as there may not be space for them). The MSI address and data will then handled through an NTB MSI library introduced later in this series. Signed-off-by: Logan Gunthorpe Acked-by: Bjorn Helgaas Signed-off-by: Jon Mason --- drivers/pci/msi.c | 54 ++++++++++++++++++++++++++++++++++++++++++++--------- include/linux/msi.h | 8 ++++++++ include/linux/pci.h | 9 +++++++++ 3 files changed, 62 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index e039b740fe74..ace978deaf93 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -192,6 +192,9 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) { + if (desc->msi_attrib.is_virtual) + return NULL; + return desc->mask_base + desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; } @@ -206,14 +209,19 @@ static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag) { u32 mask_bits = desc->masked; + void __iomem *desc_addr; if (pci_msi_ignore_mask) return 0; + desc_addr = pci_msix_desc_addr(desc); + if (!desc_addr) + return 0; mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; if (flag) mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT; - writel(mask_bits, pci_msix_desc_addr(desc) + PCI_MSIX_ENTRY_VECTOR_CTRL); + + writel(mask_bits, desc_addr + PCI_MSIX_ENTRY_VECTOR_CTRL); return mask_bits; } @@ -273,6 +281,11 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); + if (!base) { + WARN_ON(1); + return; + } + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR); msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR); msg->data = readl(base + PCI_MSIX_ENTRY_DATA); @@ -303,6 +316,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) } else if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); + if (!base) + goto skip; + writel(msg->address_lo, 
base + PCI_MSIX_ENTRY_LOWER_ADDR); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); writel(msg->data, base + PCI_MSIX_ENTRY_DATA); @@ -327,7 +343,13 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) msg->data); } } + +skip: entry->msg = *msg; + + if (entry->write_msi_msg) + entry->write_msi_msg(entry, entry->write_msi_msg_data); + } void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) @@ -550,6 +572,7 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd) entry->msi_attrib.is_msix = 0; entry->msi_attrib.is_64 = !!(control & PCI_MSI_FLAGS_64BIT); + entry->msi_attrib.is_virtual = 0; entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = !!(control & PCI_MSI_FLAGS_MASKBIT); entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ @@ -674,6 +697,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, struct irq_affinity_desc *curmsk, *masks = NULL; struct msi_desc *entry; int ret, i; + int vec_count = pci_msix_vec_count(dev); if (affd) masks = irq_create_affinity_masks(nvec, affd); @@ -696,6 +720,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.entry_nr = entries[i].entry; else entry->msi_attrib.entry_nr = i; + + entry->msi_attrib.is_virtual = + entry->msi_attrib.entry_nr >= vec_count; + entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base; @@ -714,12 +742,19 @@ static void msix_program_entries(struct pci_dev *dev, { struct msi_desc *entry; int i = 0; + void __iomem *desc_addr; for_each_pci_msi_entry(entry, dev) { if (entries) entries[i++].vector = entry->irq; - entry->masked = readl(pci_msix_desc_addr(entry) + - PCI_MSIX_ENTRY_VECTOR_CTRL); + + desc_addr = pci_msix_desc_addr(entry); + if (desc_addr) + entry->masked = readl(desc_addr + + PCI_MSIX_ENTRY_VECTOR_CTRL); + else + entry->masked = 0; + msix_mask_irq(entry, 1); } } @@ -932,7 +967,7 @@ int pci_msix_vec_count(struct pci_dev *dev) EXPORT_SYMBOL(pci_msix_vec_count); static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, - int nvec, struct irq_affinity *affd) + int nvec, struct irq_affinity *affd, int flags) { int nr_entries; int i, j; @@ -943,7 +978,7 @@ static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, nr_entries = pci_msix_vec_count(dev); if (nr_entries < 0) return nr_entries; - if (nvec > nr_entries) + if (nvec > nr_entries && !(flags & PCI_IRQ_VIRTUAL)) return nr_entries; if (entries) { @@ -1079,7 +1114,8 @@ EXPORT_SYMBOL(pci_enable_msi); static int __pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, - int maxvec, struct irq_affinity *affd) + int maxvec, struct irq_affinity *affd, + int flags) { int rc, nvec = maxvec; @@ -1096,7 +1132,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, return -ENOSPC; } - rc = __pci_enable_msix(dev, entries, nvec, affd); + rc = __pci_enable_msix(dev, entries, nvec, affd, flags); if (rc == 0) return nvec; @@ -1127,7 +1163,7 @@ static int __pci_enable_msix_range(struct pci_dev *dev, int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, int minvec, int maxvec) { - return __pci_enable_msix_range(dev, entries, minvec, maxvec, NULL); + return __pci_enable_msix_range(dev, entries, minvec, maxvec, NULL, 0); } EXPORT_SYMBOL(pci_enable_msix_range); @@ -1167,7 +1203,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs, if (flags & PCI_IRQ_MSIX) { msix_vecs = __pci_enable_msix_range(dev, NULL, min_vecs, - max_vecs, 
affd); + max_vecs, affd, flags); if (msix_vecs > 0) return msix_vecs; } diff --git a/include/linux/msi.h b/include/linux/msi.h index d48e919d55ae..8ad679e9d9c0 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -64,6 +64,10 @@ struct ti_sci_inta_msi_desc { * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor * + * @write_msi_msg: Callback that may be called when the MSI message + * address or data changes + * @write_msi_msg_data: Data parameter for the callback. + * * @masked: [PCI MSI/X] Mask bits * @is_msix: [PCI MSI/X] True if MSI-X * @multiple: [PCI MSI/X] log2 num of messages allocated @@ -90,6 +94,9 @@ struct msi_desc { const void *iommu_cookie; #endif + void (*write_msi_msg)(struct msi_desc *entry, void *data); + void *write_msi_msg_data; + union { /* PCI MSI/X specific data */ struct { @@ -100,6 +107,7 @@ struct msi_desc { u8 multi_cap : 3; u8 maskbit : 1; u8 is_64 : 1; + u8 is_virtual : 1; u16 entry_nr; unsigned default_irq; } msi_attrib; diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..19b5c27c6f63 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1362,6 +1362,15 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode, #define PCI_IRQ_MSI (1 << 1) /* Allow MSI interrupts */ #define PCI_IRQ_MSIX (1 << 2) /* Allow MSI-X interrupts */ #define PCI_IRQ_AFFINITY (1 << 3) /* Auto-assign affinity */ + +/* + * Virtual interrupts allow for more interrupts to be allocated + * than the device has interrupts for. These are not programmed + * into the device's MSI-X table and must be handled by some + * other driver means. + */ +#define PCI_IRQ_VIRTUAL (1 << 4) + #define PCI_IRQ_ALL_TYPES \ (PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX) -- cgit v1.2.3-59-g8ed1b From 246a42c51bc5dd247629f86c87d5e1b7628343c4 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:30:53 -0600 Subject: NTB: Introduce helper functions to calculate logical port number This patch introduces the "Logical Port Number" which is similar to the "Port Number" in that it enumerates the ports in the system. The original (or Physical) "Port Number" can be any number used by the hardware to uniquely identify a port in the system. The "Logical Port Number" enumerates all ports in the system from 0 to the number of ports minus one. For example a system with 5 ports might have the following port numbers which would be enumerated thusly: Port Number: 1 2 5 7 116 Logical Port Number: 0 1 2 3 4 The logical port number is useful when calculating which resources to use for which peers. So we thus define two helper functions: ntb_logical_port_number() and ntb_peer_logical_port_number() which provide the "Logical Port Number" for the local port and any peer respectively. Signed-off-by: Logan Gunthorpe Cc: Dave Jiang Cc: Allen Hubbe Cc: Serge Semin Signed-off-by: Jon Mason --- include/linux/ntb.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 604abc883741..2fadd0352683 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -616,7 +616,6 @@ static inline int ntb_port_number(struct ntb_dev *ntb) return ntb->ops->port_number(ntb); } - /** * ntb_peer_port_count() - get the number of peer device ports * @ntb: NTB device context. 
@@ -653,6 +652,58 @@ static inline int ntb_peer_port_number(struct ntb_dev *ntb, int pidx) return ntb->ops->peer_port_number(ntb, pidx); } +/** + * ntb_logical_port_number() - get the logical port number of the local port + * @ntb: NTB device context. + * + * The Logical Port Number is defined to be a unique number for each + * port starting from zero through to the number of ports minus one. + * This is in contrast to the Port Number where each port can be assigned + * any unique physical number by the hardware. + * + * The logical port number is useful for calculating the resource indexes + * used by peers. + * + * Return: the logical port number or negative value indicating an error + */ +static inline int ntb_logical_port_number(struct ntb_dev *ntb) +{ + int lport = ntb_port_number(ntb); + int pidx; + + if (lport < 0) + return lport; + + for (pidx = 0; pidx < ntb_peer_port_count(ntb); pidx++) + if (lport <= ntb_peer_port_number(ntb, pidx)) + return pidx; + + return pidx; +} + +/** + * ntb_peer_logical_port_number() - get the logical peer port by given index + * @ntb: NTB device context. + * @pidx: Peer port index. + * + * The Logical Port Number is defined to be a unique number for each + * port starting from zero through to the number of ports minus one. + * This is in contrast to the Port Number where each port can be assigned + * any unique physical number by the hardware. + * + * The logical port number is useful for calculating the resource indexes + * used by peers. + * + * Return: the peer's logical port number or negative value indicating an error + */ +static inline int ntb_peer_logical_port_number(struct ntb_dev *ntb, int pidx) +{ + if (ntb_peer_port_number(ntb, pidx) < ntb_port_number(ntb)) + return pidx; + else + return pidx + 1; +} + /** * ntb_peer_port_idx() - get the peer device port index by given port number * @ntb: NTB device context. -- cgit v1.2.3-59-g8ed1b From 5f1b1f065c791de71017502ed3ba46779e231d9b Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 23 May 2019 16:30:54 -0600 Subject: NTB: Introduce functions to calculate multi-port resource index When using multi-ports each port uses resources (dbs, msgs, mws, etc) on every other port. Creating a mapping for these resources such that each port has a corresponding resource on every other port is a bit tricky. Introduce the ntb_peer_resource_idx() function for this purpose. It returns the peer resource number that will correspond with the local peer index on the remote peer. Also, introduce ntb_peer_highest_mw_idx() which will use ntb_peer_resource_idx() but return the MW index starting with the highest index and working down. Signed-off-by: Logan Gunthorpe Cc: Dave Jiang Cc: Allen Hubbe Signed-off-by: Jon Mason --- include/linux/ntb.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ntb.h b/include/linux/ntb.h index 2fadd0352683..bed421b9579b 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -1557,4 +1557,74 @@ static inline int ntb_peer_msg_write(struct ntb_dev *ntb, int pidx, int midx, return ntb->ops->peer_msg_write(ntb, pidx, midx, msg); } +/** + * ntb_peer_resource_idx() - get a resource index for a given peer idx + * @ntb: NTB device context. + * @pidx: Peer port index. + * + * When constructing a graph of peers, each remote peer must use a different + * resource index (mw, doorbell, etc) to communicate with each other + * peer. 
+ *
+ * In a two peer system, this function should always return 0 such that
+ * resource 0 points to the remote peer on both ports.
+ *
+ * In a 5 peer system, this function will return the following matrix:
+ *
+ *	pidx \ port    0      1      2      3      4
+ *	0              0      0      1      2      3
+ *	1              0      1      1      2      3
+ *	2              0      1      2      2      3
+ *	3              0      1      2      3      3
+ *
+ * For example, if this function is used to program the peers' memory
+ * windows, port 0 will program MW 0 on all its peers to point to itself.
+ * Port 1 will program MW 0 in port 0 to point to itself and MW 1 on all
+ * other ports, and so on.
+ *
+ * For the legacy two host case, ntb_port_number() and ntb_peer_port_number()
+ * both return zero and therefore this function will always return zero.
+ * So MW 0 on each host would be programmed to point to the other host.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_resource_idx(struct ntb_dev *ntb, int pidx)
+{
+	int local_port, peer_port;
+
+	if (pidx >= ntb_peer_port_count(ntb))
+		return -EINVAL;
+
+	local_port = ntb_logical_port_number(ntb);
+	peer_port = ntb_peer_logical_port_number(ntb, pidx);
+
+	if (peer_port < local_port)
+		return local_port - 1;
+	else
+		return local_port;
+}
+
+/**
+ * ntb_peer_highest_mw_idx() - get a memory window index for a given peer idx
+ *	using the highest index memory windows first
+ *
+ * @ntb:	NTB device context.
+ * @pidx:	Peer port index.
+ *
+ * Like ntb_peer_resource_idx(), except it returns indexes starting with
+ * last memory window index.
+ *
+ * Return: the resource index to use for that peer.
+ */
+static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx)
+{
+	int ret;
+
+	ret = ntb_peer_resource_idx(ntb, pidx);
+	if (ret < 0)
+		return ret;
+
+	return ntb_mw_count(ntb, pidx) - ret - 1;
+}
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 26b3a37b928457ba2cd98eaf6d7b0feca5a30fa6 Mon Sep 17 00:00:00 2001
From: Logan Gunthorpe 
Date: Thu, 23 May 2019 16:30:56 -0600
Subject: NTB: Introduce MSI library

The NTB MSI library allows passing MSI interrupts across a memory
window. This offers similar functionality to doorbells or messages,
except it will often have much better latency, and the client can
potentially use significantly more remote interrupts than typical
hardware provides for doorbells. (Which can be important in
high-multiport setups.)

The library utilizes one memory window per peer and uses the highest
index memory windows. Before any ntb_msi function may be used, the user
must call ntb_msi_init(). It may then set up and tear down the memory
windows when the link state changes using ntb_msi_setup_mws() and
ntb_msi_clear_mws().

The peer which receives the interrupt must call ntbm_msi_request_irq()
to assign the interrupt handler (this function is functionally
similar to devm_request_irq()) and the returned descriptor must be
transferred to the peer which can use it to trigger the interrupt.
The triggering peer, once having received the descriptor, can
trigger the interrupt by calling ntb_msi_peer_trigger().
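To make the flow concrete, here is a rough sketch of a client using the
library (hypothetical client code written against the API added below;
error handling, link-event wiring, and the out-of-band exchange of the
descriptor between peers are elided):

	/* Receiving side: allocate an IRQ and obtain the descriptor. */
	static irqreturn_t my_client_isr(int irq, void *dev_id)
	{
		/* The peer wrote desc->data into our inbound MW. */
		return IRQ_HANDLED;
	}

	static int my_client_start(struct ntb_dev *ntb,
				   struct ntb_msi_desc *my_desc)
	{
		int rc, irq;

		rc = ntb_msi_init(ntb, NULL);	/* once, at probe time */
		if (rc)
			return rc;

		rc = ntb_msi_setup_mws(ntb);	/* after link up */
		if (rc)
			return rc;

		irq = ntbm_msi_request_irq(ntb, my_client_isr, "my_client",
					   ntb, my_desc);
		if (irq < 0)
			return irq;

		/* *my_desc must now be sent to the peer somehow */
		return 0;
	}

	/* Sending side, once the descriptor has been received: */
	static int my_client_notify(struct ntb_dev *ntb, int peer,
				    struct ntb_msi_desc *peer_desc)
	{
		return ntb_msi_peer_trigger(ntb, peer, peer_desc);
	}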
Signed-off-by: Logan Gunthorpe Cc: Dave Jiang Cc: Allen Hubbe Signed-off-by: Jon Mason --- drivers/ntb/Kconfig | 11 ++ drivers/ntb/Makefile | 3 +- drivers/ntb/msi.c | 415 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/ntb.h | 73 +++++++++ 4 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 drivers/ntb/msi.c (limited to 'include/linux') diff --git a/drivers/ntb/Kconfig b/drivers/ntb/Kconfig index 95944e52fa36..5760764052be 100644 --- a/drivers/ntb/Kconfig +++ b/drivers/ntb/Kconfig @@ -12,6 +12,17 @@ menuconfig NTB if NTB +config NTB_MSI + bool "MSI Interrupt Support" + depends on PCI_MSI + help + Support using MSI interrupt forwarding instead of (or in addition to) + hardware doorbells. MSI interrupts typically offer lower latency + than doorbells and more MSI interrupts can be made available to + clients. However this requires an extra memory window and support + in the hardware driver for creating the MSI interrupts. + + If unsure, say N. source "drivers/ntb/hw/Kconfig" source "drivers/ntb/test/Kconfig" diff --git a/drivers/ntb/Makefile b/drivers/ntb/Makefile index 537226f8e78d..cc27ad2ef150 100644 --- a/drivers/ntb/Makefile +++ b/drivers/ntb/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_NTB) += ntb.o hw/ test/ obj-$(CONFIG_NTB_TRANSPORT) += ntb_transport.o -ntb-y := core.o +ntb-y := core.o +ntb-$(CONFIG_NTB_MSI) += msi.o diff --git a/drivers/ntb/msi.c b/drivers/ntb/msi.c new file mode 100644 index 000000000000..9dddf133658f --- /dev/null +++ b/drivers/ntb/msi.c @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) + +#include +#include +#include +#include +#include + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("0.1"); +MODULE_AUTHOR("Logan Gunthorpe "); +MODULE_DESCRIPTION("NTB MSI Interrupt Library"); + +struct ntb_msi { + u64 base_addr; + u64 end_addr; + + void (*desc_changed)(void *ctx); + + u32 __iomem *peer_mws[]; +}; + +/** + * ntb_msi_init() - Initialize the MSI context + * @ntb: NTB device context + * + * This function must be called before any other ntb_msi function. + * It initializes the context for MSI operations and maps + * the peer memory windows. + * + * This function reserves the last N outbound memory windows (where N + * is the number of peers). + * + * Return: Zero on success, otherwise a negative error number. 
+ */
+int ntb_msi_init(struct ntb_dev *ntb,
+		 void (*desc_changed)(void *ctx))
+{
+	phys_addr_t mw_phys_addr;
+	resource_size_t mw_size;
+	size_t struct_size;
+	int peer_widx;
+	int peers;
+	int ret;
+	int i;
+
+	peers = ntb_peer_port_count(ntb);
+	if (peers <= 0)
+		return -EINVAL;
+
+	struct_size = sizeof(*ntb->msi) + sizeof(*ntb->msi->peer_mws) * peers;
+
+	ntb->msi = devm_kzalloc(&ntb->dev, struct_size, GFP_KERNEL);
+	if (!ntb->msi)
+		return -ENOMEM;
+
+	ntb->msi->desc_changed = desc_changed;
+
+	for (i = 0; i < peers; i++) {
+		peer_widx = ntb_peer_mw_count(ntb) - 1 - i;
+
+		ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr,
+					   &mw_size);
+		if (ret)
+			goto unroll;
+
+		ntb->msi->peer_mws[i] = devm_ioremap(&ntb->dev, mw_phys_addr,
+						     mw_size);
+		if (!ntb->msi->peer_mws[i]) {
+			ret = -EFAULT;
+			goto unroll;
+		}
+	}
+
+	return 0;
+
+unroll:
+	for (i = 0; i < peers; i++)
+		if (ntb->msi->peer_mws[i])
+			devm_iounmap(&ntb->dev, ntb->msi->peer_mws[i]);
+
+	devm_kfree(&ntb->dev, ntb->msi);
+	ntb->msi = NULL;
+	return ret;
+}
+EXPORT_SYMBOL(ntb_msi_init);
+
+/**
+ * ntb_msi_setup_mws() - Initialize the MSI inbound memory windows
+ * @ntb:	NTB device context
+ *
+ * This function sets up the required inbound memory windows. It should be
+ * called from a work function after a link up event.
+ *
+ * Over the entire network, this function will reserve the last N
+ * inbound memory windows for each peer (where N is the number of peers).
+ *
+ * ntb_msi_init() must be called before this function.
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_setup_mws(struct ntb_dev *ntb)
+{
+	struct msi_desc *desc;
+	u64 addr;
+	int peer, peer_widx;
+	resource_size_t addr_align, size_align, size_max;
+	resource_size_t mw_size = SZ_32K;
+	resource_size_t mw_min_size = mw_size;
+	int i;
+	int ret;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	desc = first_msi_entry(&ntb->pdev->dev);
+	addr = desc->msg.address_lo + ((uint64_t)desc->msg.address_hi << 32);
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0)
+			return peer_widx;
+
+		ret = ntb_mw_get_align(ntb, peer, peer_widx, &addr_align,
+				       NULL, NULL);
+		if (ret)
+			return ret;
+
+		addr &= ~(addr_align - 1);
+	}
+
+	for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, peer);
+		if (peer_widx < 0) {
+			ret = peer_widx;
+			goto error_out;
+		}
+
+		ret = ntb_mw_get_align(ntb, peer, peer_widx, NULL,
+				       &size_align, &size_max);
+		if (ret)
+			goto error_out;
+
+		mw_size = round_up(mw_size, size_align);
+		mw_size = max(mw_size, size_max);
+		if (mw_size < mw_min_size)
+			mw_min_size = mw_size;
+
+		ret = ntb_mw_set_trans(ntb, peer, peer_widx,
+				       addr, mw_size);
+		if (ret)
+			goto error_out;
+	}
+
+	ntb->msi->base_addr = addr;
+	ntb->msi->end_addr = addr + mw_min_size;
+
+	return 0;
+
+error_out:
+	for (i = 0; i < peer; i++) {
+		peer_widx = ntb_peer_highest_mw_idx(ntb, i);
+		if (peer_widx < 0)
+			continue;
+
+		ntb_mw_clear_trans(ntb, i, peer_widx);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(ntb_msi_setup_mws);
+
+/**
+ * ntb_msi_clear_mws() - Clear all inbound memory windows
+ * @ntb:	NTB device context
+ *
+ * This function tears down the resources used by ntb_msi_setup_mws().
+ */ +void ntb_msi_clear_mws(struct ntb_dev *ntb) +{ + int peer; + int peer_widx; + + for (peer = 0; peer < ntb_peer_port_count(ntb); peer++) { + peer_widx = ntb_peer_highest_mw_idx(ntb, peer); + if (peer_widx < 0) + continue; + + ntb_mw_clear_trans(ntb, peer, peer_widx); + } +} +EXPORT_SYMBOL(ntb_msi_clear_mws); + +struct ntb_msi_devres { + struct ntb_dev *ntb; + struct msi_desc *entry; + struct ntb_msi_desc *msi_desc; +}; + +static int ntb_msi_set_desc(struct ntb_dev *ntb, struct msi_desc *entry, + struct ntb_msi_desc *msi_desc) +{ + u64 addr; + + addr = entry->msg.address_lo + + ((uint64_t)entry->msg.address_hi << 32); + + if (addr < ntb->msi->base_addr || addr >= ntb->msi->end_addr) { + dev_warn_once(&ntb->dev, + "IRQ %d: MSI Address not within the memory window (%llx, [%llx %llx])\n", + entry->irq, addr, ntb->msi->base_addr, + ntb->msi->end_addr); + return -EFAULT; + } + + msi_desc->addr_offset = addr - ntb->msi->base_addr; + msi_desc->data = entry->msg.data; + + return 0; +} + +static void ntb_msi_write_msg(struct msi_desc *entry, void *data) +{ + struct ntb_msi_devres *dr = data; + + WARN_ON(ntb_msi_set_desc(dr->ntb, entry, dr->msi_desc)); + + if (dr->ntb->msi->desc_changed) + dr->ntb->msi->desc_changed(dr->ntb->ctx); +} + +static void ntbm_msi_callback_release(struct device *dev, void *res) +{ + struct ntb_msi_devres *dr = res; + + dr->entry->write_msi_msg = NULL; + dr->entry->write_msi_msg_data = NULL; +} + +static int ntbm_msi_setup_callback(struct ntb_dev *ntb, struct msi_desc *entry, + struct ntb_msi_desc *msi_desc) +{ + struct ntb_msi_devres *dr; + + dr = devres_alloc(ntbm_msi_callback_release, + sizeof(struct ntb_msi_devres), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + dr->ntb = ntb; + dr->entry = entry; + dr->msi_desc = msi_desc; + + devres_add(&ntb->dev, dr); + + dr->entry->write_msi_msg = ntb_msi_write_msg; + dr->entry->write_msi_msg_data = dr; + + return 0; +} + +/** + * ntbm_msi_request_threaded_irq() - allocate an MSI interrupt + * @ntb: NTB device context + * @handler: Function to be called when the IRQ occurs + * @thread_fn: Function to be called in a threaded interrupt context. NULL + * for clients which handle everything in @handler + * @devname: An ascii name for the claiming device, dev_name(dev) if NULL + * @dev_id: A cookie passed back to the handler function + * + * This function assigns an interrupt handler to an unused + * MSI interrupt and returns the descriptor used to trigger + * it. The descriptor can then be sent to a peer to trigger + * the interrupt. + * + * The interrupt resource is managed with devres so it will + * be automatically freed when the NTB device is torn down. + * + * If an IRQ allocated with this function needs to be freed + * separately, ntbm_free_irq() must be used. + * + * Return: IRQ number assigned on success, otherwise a negative error number. 
+
+/**
+ * ntbm_msi_request_threaded_irq() - allocate an MSI interrupt
+ * @ntb: NTB device context
+ * @handler: Function to be called when the IRQ occurs
+ * @thread_fn: Function to be called in a threaded interrupt context. NULL
+ *	for clients which handle everything in @handler
+ * @name: An ASCII name for the claiming device, dev_name(dev) if NULL
+ * @dev_id: A cookie passed back to the handler function
+ * @msi_desc: MSI descriptor data which triggers the interrupt
+ *
+ * This function assigns an interrupt handler to an unused
+ * MSI interrupt and returns the descriptor used to trigger
+ * it. The descriptor can then be sent to a peer to trigger
+ * the interrupt.
+ *
+ * The interrupt resource is managed with devres so it will
+ * be automatically freed when the NTB device is torn down.
+ *
+ * If an IRQ allocated with this function needs to be freed
+ * separately, ntbm_msi_free_irq() must be used.
+ *
+ * Return: IRQ number assigned on success, otherwise a negative error number.
+ */
+int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler,
+				  irq_handler_t thread_fn,
+				  const char *name, void *dev_id,
+				  struct ntb_msi_desc *msi_desc)
+{
+	struct msi_desc *entry;
+	struct irq_desc *desc;
+	int ret;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	for_each_pci_msi_entry(entry, ntb->pdev) {
+		desc = irq_to_desc(entry->irq);
+		if (desc->action)
+			continue;
+
+		ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler,
+						thread_fn, 0, name, dev_id);
+		if (ret)
+			continue;
+
+		if (ntb_msi_set_desc(ntb, entry, msi_desc)) {
+			devm_free_irq(&ntb->dev, entry->irq, dev_id);
+			continue;
+		}
+
+		ret = ntbm_msi_setup_callback(ntb, entry, msi_desc);
+		if (ret) {
+			devm_free_irq(&ntb->dev, entry->irq, dev_id);
+			return ret;
+		}
+
+		return entry->irq;
+	}
+
+	return -ENODEV;
+}
+EXPORT_SYMBOL(ntbm_msi_request_threaded_irq);
+
+static int ntbm_msi_callback_match(struct device *dev, void *res, void *data)
+{
+	struct ntb_dev *ntb = dev_ntb(dev);
+	struct ntb_msi_devres *dr = res;
+
+	return dr->ntb == ntb && dr->entry == data;
+}
+
+/**
+ * ntbm_msi_free_irq() - free an interrupt
+ * @ntb: NTB device context
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * This function should be used to manually free IRQs allocated with
+ * ntbm_msi_request_[threaded_]irq().
+ */
+void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, void *dev_id)
+{
+	struct msi_desc *entry = irq_get_msi_desc(irq);
+
+	entry->write_msi_msg = NULL;
+	entry->write_msi_msg_data = NULL;
+
+	WARN_ON(devres_destroy(&ntb->dev, ntbm_msi_callback_release,
+			       ntbm_msi_callback_match, entry));
+
+	devm_free_irq(&ntb->dev, irq, dev_id);
+}
+EXPORT_SYMBOL(ntbm_msi_free_irq);
+
+/**
+ * ntb_msi_peer_trigger() - Trigger an interrupt handler on a peer
+ * @ntb: NTB device context
+ * @peer: Peer index
+ * @desc: MSI descriptor data which triggers the interrupt
+ *
+ * This function triggers an interrupt on a peer. It requires
+ * the descriptor structure to have been passed from that peer
+ * by some other means.
+ *
+ * Return: Zero on success, otherwise a negative error number.
+ */
+int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer,
+			 struct ntb_msi_desc *desc)
+{
+	int idx;
+
+	if (!ntb->msi)
+		return -EINVAL;
+
+	idx = desc->addr_offset / sizeof(*ntb->msi->peer_mws[peer]);
+
+	iowrite32(desc->data, &ntb->msi->peer_mws[peer][idx]);
+
+	return 0;
+}
+EXPORT_SYMBOL(ntb_msi_peer_trigger);
+
+/**
+ * ntb_msi_peer_addr() - Get the DMA address to trigger a peer's MSI interrupt
+ * @ntb: NTB device context
+ * @peer: Peer index
+ * @desc: MSI descriptor data which triggers the interrupt
+ * @msi_addr: Physical address to trigger the interrupt
+ *
+ * This function allows using DMA engines to trigger an interrupt
+ * (for example, trigger an interrupt to process the data after
+ * sending it). To trigger the interrupt, write @desc.data to the address
+ * returned in @msi_addr.
+ *
+ * Return: Zero on success, otherwise a negative error number.
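+ *
+ * For example, a client can have its DMA engine append a 32-bit write of
+ * @desc.data to the address returned in @msi_addr at the tail of a
+ * transfer, so the peer only takes the interrupt once the payload has
+ * landed.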
+ */ +int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc, + phys_addr_t *msi_addr) +{ + int peer_widx = ntb_peer_mw_count(ntb) - 1 - peer; + phys_addr_t mw_phys_addr; + int ret; + + ret = ntb_peer_mw_get_addr(ntb, peer_widx, &mw_phys_addr, NULL); + if (ret) + return ret; + + if (msi_addr) + *msi_addr = mw_phys_addr + desc->addr_offset; + + return 0; +} +EXPORT_SYMBOL(ntb_msi_peer_addr); diff --git a/include/linux/ntb.h b/include/linux/ntb.h index bed421b9579b..8c13538aeffe 100644 --- a/include/linux/ntb.h +++ b/include/linux/ntb.h @@ -58,9 +58,11 @@ #include #include +#include struct ntb_client; struct ntb_dev; +struct ntb_msi; struct pci_dev; /** @@ -426,6 +428,10 @@ struct ntb_dev { spinlock_t ctx_lock; /* block unregister until device is fully released */ struct completion released; + +#ifdef CONFIG_NTB_MSI + struct ntb_msi *msi; +#endif }; #define dev_ntb(__dev) container_of((__dev), struct ntb_dev, dev) @@ -1627,4 +1633,71 @@ static inline int ntb_peer_highest_mw_idx(struct ntb_dev *ntb, int pidx) return ntb_mw_count(ntb, pidx) - ret - 1; } +struct ntb_msi_desc { + u32 addr_offset; + u32 data; +}; + +#ifdef CONFIG_NTB_MSI + +int ntb_msi_init(struct ntb_dev *ntb, void (*desc_changed)(void *ctx)); +int ntb_msi_setup_mws(struct ntb_dev *ntb); +void ntb_msi_clear_mws(struct ntb_dev *ntb); +int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, irq_handler_t handler, + irq_handler_t thread_fn, + const char *name, void *dev_id, + struct ntb_msi_desc *msi_desc); +void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, void *dev_id); +int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc); +int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc, + phys_addr_t *msi_addr); + +#else /* not CONFIG_NTB_MSI */ + +static inline int ntb_msi_init(struct ntb_dev *ntb, + void (*desc_changed)(void *ctx)) +{ + return -EOPNOTSUPP; +} +static inline int ntb_msi_setup_mws(struct ntb_dev *ntb) +{ + return -EOPNOTSUPP; +} +static inline void ntb_msi_clear_mws(struct ntb_dev *ntb) {} +static inline int ntbm_msi_request_threaded_irq(struct ntb_dev *ntb, + irq_handler_t handler, + irq_handler_t thread_fn, + const char *name, void *dev_id, + struct ntb_msi_desc *msi_desc) +{ + return -EOPNOTSUPP; +} +static inline void ntbm_msi_free_irq(struct ntb_dev *ntb, unsigned int irq, + void *dev_id) {} +static inline int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc) +{ + return -EOPNOTSUPP; +} +static inline int ntb_msi_peer_addr(struct ntb_dev *ntb, int peer, + struct ntb_msi_desc *desc, + phys_addr_t *msi_addr) +{ + return -EOPNOTSUPP; + +} + +#endif /* CONFIG_NTB_MSI */ + +static inline int ntbm_msi_request_irq(struct ntb_dev *ntb, + irq_handler_t handler, + const char *name, void *dev_id, + struct ntb_msi_desc *msi_desc) +{ + return ntbm_msi_request_threaded_irq(ntb, handler, NULL, name, + dev_id, msi_desc); +} + #endif -- cgit v1.2.3-59-g8ed1b From de76cda215d56256ffcda7ffa538b70f9fb301a7 Mon Sep 17 00:00:00 2001 From: Gustavo Pimentel Date: Tue, 4 Jun 2019 18:24:43 +0200 Subject: PCI: Decode PCIe 32 GT/s link speed PCIe r5.0, sec 7.5.3.18, defines a new 32.0 GT/s bit in the Supported Link Speeds Vector of Link Capabilities 2. Decode this new speed. This does not affect the speed of the link, which should be negotiated automatically by the hardware; it only adds decoding when showing the speed to the user. 
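For reference, decoding the new Link Status value follows the same pattern as the existing speeds. The helper below is a minimal illustrative sketch, not part of the patch; it uses only register names added here or already present in include/uapi/linux/pci_regs.h.

	#include <linux/pci_regs.h>

	/* Map the Current Link Speed field of the Link Status
	 * register to a printable string. */
	static const char *cls_to_string(unsigned short lnksta)
	{
		switch (lnksta & PCI_EXP_LNKSTA_CLS) {
		case PCI_EXP_LNKSTA_CLS_32_0GB: return "32.0 GT/s"; /* new */
		case PCI_EXP_LNKSTA_CLS_16_0GB: return "16.0 GT/s";
		case PCI_EXP_LNKSTA_CLS_8_0GB:  return "8.0 GT/s";
		case PCI_EXP_LNKSTA_CLS_5_0GB:  return "5.0 GT/s";
		case PCI_EXP_LNKSTA_CLS_2_5GB:  return "2.5 GT/s";
		default:                        return "Unknown speed";
		}
	}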
Previously, reading the speed of a link operating at this speed showed "Unknown speed" instead of "32.0 GT/s". Link: https://lore.kernel.org/lkml/92365e3caf0fc559f9ab14bcd053bfc92d4f661c.1559664969.git.gustavo.pimentel@synopsys.com Signed-off-by: Gustavo Pimentel [bhelgaas: changelog] Signed-off-by: Bjorn Helgaas --- drivers/pci/pci-sysfs.c | 3 +++ drivers/pci/pci.c | 4 +++- drivers/pci/probe.c | 2 +- drivers/pci/slot.c | 1 + include/linux/pci.h | 1 + include/uapi/linux/pci_regs.h | 4 ++++ 6 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 6d27475e39b2..d52d30448e41 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -182,6 +182,9 @@ static ssize_t current_link_speed_show(struct device *dev, return -EINVAL; switch (linkstat & PCI_EXP_LNKSTA_CLS) { + case PCI_EXP_LNKSTA_CLS_32_0GB: + speed = "32 GT/s"; + break; case PCI_EXP_LNKSTA_CLS_16_0GB: speed = "16 GT/s"; break; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 8abc843b1615..4729a7c7a9d9 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5621,7 +5621,9 @@ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev) */ pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2); if (lnkcap2) { /* PCIe r3.0-compliant */ - if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB) + if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_32_0GB) + return PCIE_SPEED_32_0GT; + else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB) return PCIE_SPEED_16_0GT; else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) return PCIE_SPEED_8_0GT; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0e8e2c186f50..c5f27c8cd140 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -668,7 +668,7 @@ const unsigned char pcie_link_speed[] = { PCIE_SPEED_5_0GT, /* 2 */ PCIE_SPEED_8_0GT, /* 3 */ PCIE_SPEED_16_0GT, /* 4 */ - PCI_SPEED_UNKNOWN, /* 5 */ + PCIE_SPEED_32_0GT, /* 5 */ PCI_SPEED_UNKNOWN, /* 6 */ PCI_SPEED_UNKNOWN, /* 7 */ PCI_SPEED_UNKNOWN, /* 8 */ diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index f4d92b1afe7b..ae4aa0e1f2f4 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -75,6 +75,7 @@ static const char *pci_bus_speed_strings[] = { "5.0 GT/s PCIe", /* 0x15 */ "8.0 GT/s PCIe", /* 0x16 */ "16.0 GT/s PCIe", /* 0x17 */ + "32.0 GT/s PCIe", /* 0x18 */ }; static ssize_t bus_speed_read(enum pci_bus_speed speed, char *buf) diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..2173e6b75579 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -258,6 +258,7 @@ enum pci_bus_speed { PCIE_SPEED_5_0GT = 0x15, PCIE_SPEED_8_0GT = 0x16, PCIE_SPEED_16_0GT = 0x17, + PCIE_SPEED_32_0GT = 0x18, PCI_SPEED_UNKNOWN = 0xff, }; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 27164769d184..f28e562d7ca8 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -528,6 +528,7 @@ #define PCI_EXP_LNKCAP_SLS_5_0GB 0x00000002 /* LNKCAP2 SLS Vector bit 1 */ #define PCI_EXP_LNKCAP_SLS_8_0GB 0x00000003 /* LNKCAP2 SLS Vector bit 2 */ #define PCI_EXP_LNKCAP_SLS_16_0GB 0x00000004 /* LNKCAP2 SLS Vector bit 3 */ +#define PCI_EXP_LNKCAP_SLS_32_0GB 0x00000005 /* LNKCAP2 SLS Vector bit 4 */ #define PCI_EXP_LNKCAP_MLW 0x000003f0 /* Maximum Link Width */ #define PCI_EXP_LNKCAP_ASPMS 0x00000c00 /* ASPM Support */ #define PCI_EXP_LNKCAP_L0SEL 0x00007000 /* L0s Exit Latency */ @@ -556,6 +557,7 @@ #define PCI_EXP_LNKSTA_CLS_5_0GB 0x0002 /* Current Link Speed 5.0GT/s */ #define PCI_EXP_LNKSTA_CLS_8_0GB 0x0003 /* 
Current Link Speed 8.0GT/s */
 #define PCI_EXP_LNKSTA_CLS_16_0GB 0x0004 /* Current Link Speed 16.0GT/s */
+#define PCI_EXP_LNKSTA_CLS_32_0GB 0x0005 /* Current Link Speed 32.0GT/s */
 #define PCI_EXP_LNKSTA_NLW	0x03f0	/* Negotiated Link Width */
 #define PCI_EXP_LNKSTA_NLW_X1	0x0010	/* Current Link Width x1 */
 #define PCI_EXP_LNKSTA_NLW_X2	0x0020	/* Current Link Width x2 */
@@ -661,6 +663,7 @@
 #define PCI_EXP_LNKCAP2_SLS_5_0GB	0x00000004 /* Supported Speed 5GT/s */
 #define PCI_EXP_LNKCAP2_SLS_8_0GB	0x00000008 /* Supported Speed 8GT/s */
 #define PCI_EXP_LNKCAP2_SLS_16_0GB	0x00000010 /* Supported Speed 16GT/s */
+#define PCI_EXP_LNKCAP2_SLS_32_0GB	0x00000020 /* Supported Speed 32GT/s */
 #define PCI_EXP_LNKCAP2_CROSSLINK	0x00000100 /* Crosslink supported */
 #define PCI_EXP_LNKCTL2		48	/* Link Control 2 */
 #define PCI_EXP_LNKCTL2_TLS		0x000f
@@ -668,6 +671,7 @@
 #define PCI_EXP_LNKCTL2_TLS_5_0GT	0x0002 /* Supported Speed 5GT/s */
 #define PCI_EXP_LNKCTL2_TLS_8_0GT	0x0003 /* Supported Speed 8GT/s */
 #define PCI_EXP_LNKCTL2_TLS_16_0GT	0x0004 /* Supported Speed 16GT/s */
+#define PCI_EXP_LNKCTL2_TLS_32_0GT	0x0005 /* Supported Speed 32GT/s */
 #define PCI_EXP_LNKSTA2		50	/* Link Status 2 */
 #define PCI_CAP_EXP_ENDPOINT_SIZEOF_V2	52	/* v2 endpoints with link end here */
 #define PCI_EXP_SLTCAP2		52	/* Slot Capabilities 2 */
-- 
cgit v1.2.3-59-g8ed1b


From bb2bb903042517b8fb17b2bc21e00512f2dcac01 Mon Sep 17 00:00:00 2001
From: Greg Hackmann
Date: Thu, 13 Jun 2019 15:34:07 -0700
Subject: dma-buf: add DMA_BUF_SET_NAME ioctls

This patch adds complementary DMA_BUF_SET_NAME ioctls, which let
userspace processes attach a free-form name to each buffer.

This information can be extremely helpful for tracking and accounting
shared buffers. For example, on Android, we know what each buffer will
be used for at allocation time: GL, multimedia, camera, etc. The
userspace allocator can use DMA_BUF_SET_NAME to associate that
information with the buffer, so we can later give developers a
breakdown of how much memory they're allocating for graphics, camera,
etc.

Signed-off-by: Greg Hackmann
Signed-off-by: Chenbo Feng
Signed-off-by: Sumit Semwal
Link: https://patchwork.freedesktop.org/patch/msgid/20190613223408.139221-3-fengc@google.com
---
 drivers/dma-buf/dma-buf.c    | 65 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/dma-buf.h      |  5 +++-
 include/uapi/linux/dma-buf.h |  3 ++
 3 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 3612ccededd6..ab96410d1dcd 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -48,8 +48,24 @@ struct dma_buf_list {
 
 static struct dma_buf_list db_list;
 
+static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct dma_buf *dmabuf;
+	char name[DMA_BUF_NAME_LEN];
+	size_t ret = 0;
+
+	dmabuf = dentry->d_fsdata;
+	mutex_lock(&dmabuf->lock);
+	if (dmabuf->name)
+		ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN);
+	mutex_unlock(&dmabuf->lock);
+
+	return dynamic_dname(dentry, buffer, buflen, "/%s:%s",
+			     dentry->d_name.name, ret > 0 ? name : "");
+}
+
 static const struct dentry_operations dma_buf_dentry_ops = {
-	.d_dname = simple_dname,
+	.d_dname = dmabuffs_dname,
 };
 
 static struct vfsmount *dma_buf_mnt;
@@ -301,6 +317,43 @@ out:
 	return events;
 }
 
+/**
+ * dma_buf_set_name - Set a name to a specific dma_buf to track the usage.
+ * The name of the dma-buf buffer can only be set when the dma-buf is not
+ * attached to any devices. It could theoretically support changing the
+ * name of the dma-buf if the same piece of memory is used for multiple
+ * purposes between different devices.
+ *
+ * @dmabuf: [in]	dmabuf buffer that will be renamed.
+ * @buf:    [in]	A piece of userspace memory that contains the name of
+ *			the dma-buf.
+ *
+ * Returns 0 on success. If the dma-buf buffer is already attached to
+ * devices, returns -EBUSY.
+ *
+ */
+static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
+{
+	char *name = strndup_user(buf, DMA_BUF_NAME_LEN);
+	long ret = 0;
+
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	mutex_lock(&dmabuf->lock);
+	if (!list_empty(&dmabuf->attachments)) {
+		ret = -EBUSY;
+		kfree(name);
+		goto out_unlock;
+	}
+	kfree(dmabuf->name);
+	dmabuf->name = name;
+
+out_unlock:
+	mutex_unlock(&dmabuf->lock);
+	return ret;
+}
+
 static long dma_buf_ioctl(struct file *file,
 			  unsigned int cmd, unsigned long arg)
 {
@@ -339,6 +392,10 @@
 		ret = dma_buf_begin_cpu_access(dmabuf, direction);
 
 		return ret;
+
+	case DMA_BUF_SET_NAME:
+		return dma_buf_set_name(dmabuf, (const char __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -380,6 +437,7 @@
 		goto err_alloc_file;
 	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
 	file->private_data = dmabuf;
+	file->f_path.dentry->d_fsdata = dmabuf;
 
 	return file;
 
@@ -1112,12 +1170,13 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 			continue;
 		}
 
-		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n",
+		seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n",
 				buf_obj->size,
 				buf_obj->file->f_flags, buf_obj->file->f_mode,
 				file_count(buf_obj->file),
 				buf_obj->exp_name,
-				file_inode(buf_obj->file)->i_ino);
+				file_inode(buf_obj->file)->i_ino,
+				buf_obj->name ?: "");
 
 		robj = buf_obj->resv;
 		while (true) {
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 8a327566d7f4..01ad5b942a6f 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -280,10 +280,12 @@
  * @file: file pointer used for sharing buffers across, and for refcounting.
  * @attachments: list of dma_buf_attachment that denotes all devices attached.
  * @ops: dma_buf_ops associated with this buffer object.
- * @lock: used internally to serialize list manipulation, attach/detach and vmap/unmap
+ * @lock: used internally to serialize list manipulation, attach/detach and
+ *        vmap/unmap, and accesses to name
  * @vmapping_counter: used internally to refcnt the vmaps
  * @vmap_ptr: the current vmap ptr if vmapping_counter > 0
  * @exp_name: name of the exporter; useful for debugging.
+ * @name: userspace-provided name; useful for accounting and debugging.
  * @owner: pointer to exporter module; used for refcounting when exporter is a
  *         kernel module.
  * @list_node: node for dma_buf accounting and debugging.
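For completeness, the new ioctl is used from userspace as below. This is a hedged sketch: the dmabuf_fd is assumed to come from some exporter (a GEM handle, an ion-style heap, etc.), and "camera0" is an arbitrary example label.

	#include <linux/dma-buf.h>	/* DMA_BUF_SET_NAME */
	#include <sys/ioctl.h>
	#include <stdio.h>

	/* Label a freshly exported buffer before any device attaches;
	 * once attachments exist the ioctl fails with EBUSY. */
	static int label_dmabuf(int dmabuf_fd)
	{
		if (ioctl(dmabuf_fd, DMA_BUF_SET_NAME, "camera0") < 0) {
			perror("DMA_BUF_SET_NAME");
			return -1;
		}

		return 0;
	}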
@@ -311,6 +313,7 @@ struct dma_buf { unsigned vmapping_counter; void *vmap_ptr; const char *exp_name; + const char *name; struct module *owner; struct list_head list_node; void *priv; diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h index d75df5210a4a..dbc7092e04b5 100644 --- a/include/uapi/linux/dma-buf.h +++ b/include/uapi/linux/dma-buf.h @@ -35,7 +35,10 @@ struct dma_buf_sync { #define DMA_BUF_SYNC_VALID_FLAGS_MASK \ (DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END) +#define DMA_BUF_NAME_LEN 32 + #define DMA_BUF_BASE 'b' #define DMA_BUF_IOCTL_SYNC _IOW(DMA_BUF_BASE, 0, struct dma_buf_sync) +#define DMA_BUF_SET_NAME _IOW(DMA_BUF_BASE, 1, const char *) #endif -- cgit v1.2.3-59-g8ed1b From 151f4e2bdc7a04020ae5c533896fb91a16e1f501 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 07:10:36 -0300 Subject: docs: power: convert docs to ReST and rename to *.rst Convert the PM documents to ReST, in order to allow them to build with Sphinx. The conversion is actually: - add blank lines and indentation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Bjorn Helgaas Acked-by: Mark Brown Acked-by: Srivatsa S. Bhat (VMware) --- Documentation/ABI/testing/sysfs-class-powercap | 2 +- Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/cpu-freq/core.txt | 2 +- Documentation/driver-api/pm/devices.rst | 6 +- Documentation/driver-api/usb/power-management.rst | 2 +- Documentation/power/apm-acpi.rst | 36 + Documentation/power/apm-acpi.txt | 32 - Documentation/power/basic-pm-debugging.rst | 269 +++++ Documentation/power/basic-pm-debugging.txt | 254 ----- Documentation/power/charger-manager.rst | 205 ++++ Documentation/power/charger-manager.txt | 200 ---- Documentation/power/drivers-testing.rst | 51 + Documentation/power/drivers-testing.txt | 46 - Documentation/power/energy-model.rst | 147 +++ Documentation/power/energy-model.txt | 144 --- Documentation/power/freezing-of-tasks.rst | 244 +++++ Documentation/power/freezing-of-tasks.txt | 231 ---- Documentation/power/index.rst | 46 + Documentation/power/interface.rst | 79 ++ Documentation/power/interface.txt | 77 -- Documentation/power/opp.rst | 379 +++++++ Documentation/power/opp.txt | 342 ------ Documentation/power/pci.rst | 1135 ++++++++++++++++++++ Documentation/power/pci.txt | 1094 ------------------- Documentation/power/pm_qos_interface.rst | 225 ++++ Documentation/power/pm_qos_interface.txt | 212 ---- Documentation/power/power_supply_class.rst | 282 +++++ Documentation/power/power_supply_class.txt | 231 ---- Documentation/power/powercap/powercap.rst | 257 +++++ Documentation/power/powercap/powercap.txt | 236 ---- Documentation/power/regulator/consumer.rst | 229 ++++ Documentation/power/regulator/consumer.txt | 218 ---- Documentation/power/regulator/design.rst | 38 + Documentation/power/regulator/design.txt | 33 - Documentation/power/regulator/machine.rst | 97 ++ Documentation/power/regulator/machine.txt | 96 -- Documentation/power/regulator/overview.rst | 178 +++ Documentation/power/regulator/overview.txt | 171 --- Documentation/power/regulator/regulator.rst | 32 + Documentation/power/regulator/regulator.txt | 30 - Documentation/power/runtime_pm.rst | 940 ++++++++++++++++ Documentation/power/runtime_pm.txt | 928 ---------------- 
Documentation/power/s2ram.rst | 87 ++ Documentation/power/s2ram.txt | 85 -- Documentation/power/suspend-and-cpuhotplug.rst | 286 +++++ Documentation/power/suspend-and-cpuhotplug.txt | 274 ----- Documentation/power/suspend-and-interrupts.rst | 137 +++ Documentation/power/suspend-and-interrupts.txt | 135 --- Documentation/power/swsusp-and-swap-files.rst | 63 ++ Documentation/power/swsusp-and-swap-files.txt | 60 -- Documentation/power/swsusp-dmcrypt.rst | 140 +++ Documentation/power/swsusp-dmcrypt.txt | 138 --- Documentation/power/swsusp.rst | 501 +++++++++ Documentation/power/swsusp.txt | 446 -------- Documentation/power/tricks.rst | 29 + Documentation/power/tricks.txt | 27 - Documentation/power/userland-swsusp.rst | 191 ++++ Documentation/power/userland-swsusp.txt | 170 --- Documentation/power/video.rst | 213 ++++ Documentation/power/video.txt | 185 ---- Documentation/process/submitting-drivers.rst | 2 +- Documentation/scheduler/sched-energy.txt | 6 +- Documentation/trace/coresight-cpu-debug.txt | 2 +- .../zh_CN/process/submitting-drivers.rst | 2 +- MAINTAINERS | 4 +- arch/x86/Kconfig | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/opp/Kconfig | 2 +- drivers/power/supply/power_supply_core.c | 2 +- include/linux/interrupt.h | 2 +- include/linux/pci.h | 2 +- include/linux/pm.h | 2 +- kernel/power/Kconfig | 6 +- net/wireless/Kconfig | 2 +- 74 files changed, 6544 insertions(+), 6123 deletions(-) create mode 100644 Documentation/power/apm-acpi.rst delete mode 100644 Documentation/power/apm-acpi.txt create mode 100644 Documentation/power/basic-pm-debugging.rst delete mode 100644 Documentation/power/basic-pm-debugging.txt create mode 100644 Documentation/power/charger-manager.rst delete mode 100644 Documentation/power/charger-manager.txt create mode 100644 Documentation/power/drivers-testing.rst delete mode 100644 Documentation/power/drivers-testing.txt create mode 100644 Documentation/power/energy-model.rst delete mode 100644 Documentation/power/energy-model.txt create mode 100644 Documentation/power/freezing-of-tasks.rst delete mode 100644 Documentation/power/freezing-of-tasks.txt create mode 100644 Documentation/power/index.rst create mode 100644 Documentation/power/interface.rst delete mode 100644 Documentation/power/interface.txt create mode 100644 Documentation/power/opp.rst delete mode 100644 Documentation/power/opp.txt create mode 100644 Documentation/power/pci.rst delete mode 100644 Documentation/power/pci.txt create mode 100644 Documentation/power/pm_qos_interface.rst delete mode 100644 Documentation/power/pm_qos_interface.txt create mode 100644 Documentation/power/power_supply_class.rst delete mode 100644 Documentation/power/power_supply_class.txt create mode 100644 Documentation/power/powercap/powercap.rst delete mode 100644 Documentation/power/powercap/powercap.txt create mode 100644 Documentation/power/regulator/consumer.rst delete mode 100644 Documentation/power/regulator/consumer.txt create mode 100644 Documentation/power/regulator/design.rst delete mode 100644 Documentation/power/regulator/design.txt create mode 100644 Documentation/power/regulator/machine.rst delete mode 100644 Documentation/power/regulator/machine.txt create mode 100644 Documentation/power/regulator/overview.rst delete mode 100644 Documentation/power/regulator/overview.txt create mode 100644 Documentation/power/regulator/regulator.rst delete mode 100644 Documentation/power/regulator/regulator.txt create mode 100644 Documentation/power/runtime_pm.rst delete mode 100644 Documentation/power/runtime_pm.txt 
create mode 100644 Documentation/power/s2ram.rst delete mode 100644 Documentation/power/s2ram.txt create mode 100644 Documentation/power/suspend-and-cpuhotplug.rst delete mode 100644 Documentation/power/suspend-and-cpuhotplug.txt create mode 100644 Documentation/power/suspend-and-interrupts.rst delete mode 100644 Documentation/power/suspend-and-interrupts.txt create mode 100644 Documentation/power/swsusp-and-swap-files.rst delete mode 100644 Documentation/power/swsusp-and-swap-files.txt create mode 100644 Documentation/power/swsusp-dmcrypt.rst delete mode 100644 Documentation/power/swsusp-dmcrypt.txt create mode 100644 Documentation/power/swsusp.rst delete mode 100644 Documentation/power/swsusp.txt create mode 100644 Documentation/power/tricks.rst delete mode 100644 Documentation/power/tricks.txt create mode 100644 Documentation/power/userland-swsusp.rst delete mode 100644 Documentation/power/userland-swsusp.txt create mode 100644 Documentation/power/video.rst delete mode 100644 Documentation/power/video.txt (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap index db3b3ff70d84..742dfd966592 100644 --- a/Documentation/ABI/testing/sysfs-class-powercap +++ b/Documentation/ABI/testing/sysfs-class-powercap @@ -5,7 +5,7 @@ Contact: linux-pm@vger.kernel.org Description: The powercap/ class sub directory belongs to the power cap subsystem. Refer to - Documentation/power/powercap/powercap.txt for details. + Documentation/power/powercap/powercap.rst for details. What: /sys/class/powercap/ Date: September 2013 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 138f6664b2e2..7f5ca6e7c4d3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -13,7 +13,7 @@ For ARM64, ONLY "acpi=off", "acpi=on" or "acpi=force" are available - See also Documentation/power/runtime_pm.txt, pci=noacpi + See also Documentation/power/runtime_pm.rst, pci=noacpi acpi_apic_instance= [ACPI, IOAPIC] Format: @@ -223,7 +223,7 @@ acpi_sleep= [HW,ACPI] Sleep options Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, old_ordering, nonvs, sci_force_enable, nobl } - See Documentation/power/video.txt for information on + See Documentation/power/video.rst for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. @@ -4108,7 +4108,7 @@ Specify the offset from the beginning of the partition given by "resume=" at which the swap header is located, in units (needed only for swap files). - See Documentation/power/swsusp-and-swap-files.txt + See Documentation/power/swsusp-and-swap-files.rst resumedelay= [HIBERNATION] Delay (in seconds) to pause before attempting to read the resume files diff --git a/Documentation/cpu-freq/core.txt b/Documentation/cpu-freq/core.txt index 073f128af5a7..55193e680250 100644 --- a/Documentation/cpu-freq/core.txt +++ b/Documentation/cpu-freq/core.txt @@ -95,7 +95,7 @@ flags - flags of the cpufreq driver 3. 
CPUFreq Table Generation with Operating Performance Point (OPP) ================================================================== -For details about OPP, see Documentation/power/opp.txt +For details about OPP, see Documentation/power/opp.rst dev_pm_opp_init_cpufreq_table - This function provides a ready to use conversion routine to translate diff --git a/Documentation/driver-api/pm/devices.rst b/Documentation/driver-api/pm/devices.rst index 30835683616a..f66c7b9126ea 100644 --- a/Documentation/driver-api/pm/devices.rst +++ b/Documentation/driver-api/pm/devices.rst @@ -225,7 +225,7 @@ system-wide transition to a sleep state even though its :c:member:`runtime_auto` flag is clear. For more information about the runtime power management framework, refer to -:file:`Documentation/power/runtime_pm.txt`. +:file:`Documentation/power/runtime_pm.rst`. Calling Drivers to Enter and Leave System Sleep States @@ -728,7 +728,7 @@ it into account in any way. Devices may be defined as IRQ-safe which indicates to the PM core that their runtime PM callbacks may be invoked with disabled interrupts (see -:file:`Documentation/power/runtime_pm.txt` for more information). If an +:file:`Documentation/power/runtime_pm.rst` for more information). If an IRQ-safe device belongs to a PM domain, the runtime PM of the domain will be disallowed, unless the domain itself is defined as IRQ-safe. However, it makes sense to define a PM domain as IRQ-safe only if all the devices in it @@ -795,7 +795,7 @@ so on) and the final state of the device must reflect the "active" runtime PM status in that case. During system-wide resume from a sleep state it's easiest to put devices into -the full-power state, as explained in :file:`Documentation/power/runtime_pm.txt`. +the full-power state, as explained in :file:`Documentation/power/runtime_pm.rst`. [Refer to that document for more information regarding this particular issue as well as for information on the device runtime power management framework in general.] diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst index 4a74cf6f2797..2525c3622cae 100644 --- a/Documentation/driver-api/usb/power-management.rst +++ b/Documentation/driver-api/usb/power-management.rst @@ -46,7 +46,7 @@ device is turned off while the system as a whole remains running, we call it a "dynamic suspend" (also known as a "runtime suspend" or "selective suspend"). This document concentrates mostly on how dynamic PM is implemented in the USB subsystem, although system PM is -covered to some extent (see ``Documentation/power/*.txt`` for more +covered to some extent (see ``Documentation/power/*.rst`` for more information about system PM). System PM support is present only if the kernel was built with diff --git a/Documentation/power/apm-acpi.rst b/Documentation/power/apm-acpi.rst new file mode 100644 index 000000000000..5b90d947126d --- /dev/null +++ b/Documentation/power/apm-acpi.rst @@ -0,0 +1,36 @@ +============ +APM or ACPI? +============ + +If you have a relatively recent x86 mobile, desktop, or server system, +odds are it supports either Advanced Power Management (APM) or +Advanced Configuration and Power Interface (ACPI). ACPI is the newer +of the two technologies and puts power management in the hands of the +operating system, allowing for more intelligent power management than +is possible with BIOS controlled APM. 
+ +The best way to determine which, if either, your system supports is to +build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is +enabled by default). If a working ACPI implementation is found, the +ACPI driver will override and disable APM, otherwise the APM driver +will be used. + +No, sorry, you cannot have both ACPI and APM enabled and running at +once. Some people with broken ACPI or broken APM implementations +would like to use both to get a full set of working features, but you +simply cannot mix and match the two. Only one power management +interface can be in control of the machine at once. Think about it.. + +User-space Daemons +------------------ +Both APM and ACPI rely on user-space daemons, apmd and acpid +respectively, to be completely functional. Obtain both of these +daemons from your Linux distribution or from the Internet (see below) +and be sure that they are started sometime in the system boot process. +Go ahead and start both. If ACPI or APM is not available on your +system the associated daemon will exit gracefully. + + ===== ======================================= + apmd http://ftp.debian.org/pool/main/a/apmd/ + acpid http://acpid.sf.net/ + ===== ======================================= diff --git a/Documentation/power/apm-acpi.txt b/Documentation/power/apm-acpi.txt deleted file mode 100644 index 6cc423d3662e..000000000000 --- a/Documentation/power/apm-acpi.txt +++ /dev/null @@ -1,32 +0,0 @@ -APM or ACPI? ------------- -If you have a relatively recent x86 mobile, desktop, or server system, -odds are it supports either Advanced Power Management (APM) or -Advanced Configuration and Power Interface (ACPI). ACPI is the newer -of the two technologies and puts power management in the hands of the -operating system, allowing for more intelligent power management than -is possible with BIOS controlled APM. - -The best way to determine which, if either, your system supports is to -build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is -enabled by default). If a working ACPI implementation is found, the -ACPI driver will override and disable APM, otherwise the APM driver -will be used. - -No, sorry, you cannot have both ACPI and APM enabled and running at -once. Some people with broken ACPI or broken APM implementations -would like to use both to get a full set of working features, but you -simply cannot mix and match the two. Only one power management -interface can be in control of the machine at once. Think about it.. - -User-space Daemons ------------------- -Both APM and ACPI rely on user-space daemons, apmd and acpid -respectively, to be completely functional. Obtain both of these -daemons from your Linux distribution or from the Internet (see below) -and be sure that they are started sometime in the system boot process. -Go ahead and start both. If ACPI or APM is not available on your -system the associated daemon will exit gracefully. - - apmd: http://ftp.debian.org/pool/main/a/apmd/ - acpid: http://acpid.sf.net/ diff --git a/Documentation/power/basic-pm-debugging.rst b/Documentation/power/basic-pm-debugging.rst new file mode 100644 index 000000000000..69862e759c30 --- /dev/null +++ b/Documentation/power/basic-pm-debugging.rst @@ -0,0 +1,269 @@ +================================= +Debugging hibernation and suspend +================================= + + (C) 2007 Rafael J. Wysocki , GPL + +1. 
Testing hibernation (aka suspend to disk or STD) +=================================================== + +To check if hibernation works, you can try to hibernate in the "reboot" mode:: + + # echo reboot > /sys/power/disk + # echo disk > /sys/power/state + +and the system should create a hibernation image, reboot, resume and get back to +the command prompt where you have started the transition. If that happens, +hibernation is most likely to work correctly. Still, you need to repeat the +test at least a couple of times in a row for confidence. [This is necessary, +because some problems only show up on a second attempt at suspending and +resuming the system.] Moreover, hibernating in the "reboot" and "shutdown" +modes causes the PM core to skip some platform-related callbacks which on ACPI +systems might be necessary to make hibernation work. Thus, if your machine +fails to hibernate or resume in the "reboot" mode, you should try the +"platform" mode:: + + # echo platform > /sys/power/disk + # echo disk > /sys/power/state + +which is the default and recommended mode of hibernation. + +Unfortunately, the "platform" mode of hibernation does not work on some systems +with broken BIOSes. In such cases the "shutdown" mode of hibernation might +work:: + + # echo shutdown > /sys/power/disk + # echo disk > /sys/power/state + +(it is similar to the "reboot" mode, but it requires you to press the power +button to make the system resume). + +If neither "platform" nor "shutdown" hibernation mode works, you will need to +identify what goes wrong. + +a) Test modes of hibernation +---------------------------- + +To find out why hibernation fails on your system, you can use a special testing +facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then, +there is the file /sys/power/pm_test that can be used to make the hibernation +core run in a test mode. There are 5 test modes available: + +freezer + - test the freezing of processes + +devices + - test the freezing of processes and suspending of devices + +platform + - test the freezing of processes, suspending of devices and platform + global control methods [1]_ + +processors + - test the freezing of processes, suspending of devices, platform + global control methods [1]_ and the disabling of nonboot CPUs + +core + - test the freezing of processes, suspending of devices, platform global + control methods\ [1]_, the disabling of nonboot CPUs and suspending + of platform/system devices + +.. [1] + + the platform global control methods are only available on ACPI systems + and are only tested if the hibernation mode is set to "platform" + +To use one of them it is necessary to write the corresponding string to +/sys/power/pm_test (eg. "devices" to test the freezing of processes and +suspending devices) and issue the standard hibernation commands. For example, +to use the "devices" test mode along with the "platform" mode of hibernation, +you should do the following:: + + # echo devices > /sys/power/pm_test + # echo platform > /sys/power/disk + # echo disk > /sys/power/state + +Then, the kernel will try to freeze processes, suspend devices, wait a few +seconds (5 by default, but configurable by the suspend.pm_test_delay module +parameter), resume devices and thaw processes. If "platform" is written to +/sys/power/pm_test , then after suspending devices the kernel will additionally +invoke the global control methods (eg. ACPI global control methods) used to +prepare the platform firmware for hibernation. 
Next, it will wait a +configurable number of seconds and invoke the platform (eg. ACPI) global +methods used to cancel hibernation etc. + +Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal +hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test +contains a space-separated list of all available tests (including "none" that +represents the normal functionality) in which the current test level is +indicated by square brackets. + +Generally, as you can see, each test level is more "invasive" than the previous +one and the "core" level tests the hardware and drivers as deeply as possible +without creating a hibernation image. Obviously, if the "devices" test fails, +the "platform" test will fail as well and so on. Thus, as a rule of thumb, you +should try the test modes starting from "freezer", through "devices", "platform" +and "processors" up to "core" (repeat the test on each level a couple of times +to make sure that any random factors are avoided). + +If the "freezer" test fails, there is a task that cannot be frozen (in that case +it usually is possible to identify the offending task by analysing the output of +dmesg obtained after the failing test). Failure at this level usually means +that there is a problem with the tasks freezer subsystem that should be +reported. + +If the "devices" test fails, most likely there is a driver that cannot suspend +or resume its device (in the latter case the system may hang or become unstable +after the test, so please take that into consideration). To find this driver, +you can carry out a binary search according to the rules: + +- if the test fails, unload a half of the drivers currently loaded and repeat + (that would probably involve rebooting the system, so always note what drivers + have been loaded before the test), +- if the test succeeds, load a half of the drivers you have unloaded most + recently and repeat. + +Once you have found the failing driver (there can be more than just one of +them), you have to unload it every time before hibernation. In that case please +make sure to report the problem with the driver. + +It is also possible that the "devices" test will still fail after you have +unloaded all modules. In that case, you may want to look in your kernel +configuration for the drivers that can be compiled as modules (and test again +with these drivers compiled as modules). You may also try to use some special +kernel command line options such as "noapic", "noacpi" or even "acpi=off". + +If the "platform" test fails, there is a problem with the handling of the +platform (eg. ACPI) firmware on your system. In that case the "platform" mode +of hibernation is not likely to work. You can try the "shutdown" mode, but that +is rather a poor man's workaround. + +If the "processors" test fails, the disabling/enabling of nonboot CPUs does not +work (of course, this only may be an issue on SMP systems) and the problem +should be reported. In that case you can also try to switch the nonboot CPUs +off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and +see if that works. + +If the "core" test fails, which means that suspending of the system/platform +devices has failed (these devices are suspended on one CPU with interrupts off), +the problem is most probably hardware-related and serious, so it should be +reported. + +A failure of any of the "platform", "processors" or "core" tests may cause your +system to hang or become unstable, so please beware. 
Such a failure usually +indicates a serious problem that very well may be related to the hardware, but +please report it anyway. + +b) Testing minimal configuration +-------------------------------- + +If all of the hibernation test modes work, you can boot the system with the +"init=/bin/bash" command line parameter and attempt to hibernate in the +"reboot", "shutdown" and "platform" modes. If that does not work, there +probably is a problem with a driver statically compiled into the kernel and you +can try to compile more drivers as modules, so that they can be tested +individually. Otherwise, there is a problem with a modular driver and you can +find it by loading a half of the modules you normally use and binary searching +in accordance with the algorithm: +- if there are n modules loaded and the attempt to suspend and resume fails, +unload n/2 of the modules and try again (that would probably involve rebooting +the system), +- if there are n modules loaded and the attempt to suspend and resume succeeds, +load n/2 modules more and try again. + +Again, if you find the offending module(s), it(they) must be unloaded every time +before hibernation, and please report the problem with it(them). + +c) Using the "test_resume" hibernation option +--------------------------------------------- + +/sys/power/disk generally tells the kernel what to do after creating a +hibernation image. One of the available options is "test_resume" which +causes the just created image to be used for immediate restoration. Namely, +after doing:: + + # echo test_resume > /sys/power/disk + # echo disk > /sys/power/state + +a hibernation image will be created and a resume from it will be triggered +immediately without involving the platform firmware in any way. + +That test can be used to check if failures to resume from hibernation are +related to bad interactions with the platform firmware. That is, if the above +works every time, but resume from actual hibernation does not work or is +unreliable, the platform firmware may be responsible for the failures. + +On architectures and platforms that support using different kernels to restore +hibernation images (that is, the kernel used to read the image from storage and +load it into memory is different from the one included in the image) or support +kernel address space randomization, it also can be used to check if failures +to resume may be related to the differences between the restore and image +kernels. + +d) Advanced debugging +--------------------- + +In case that hibernation does not work on your system even in the minimal +configuration and compiling more drivers as modules is not practical or some +modules cannot be unloaded, you can use one of the more advanced debugging +techniques to find the problem. First, if there is a serial port in your box, +you can boot the kernel with the 'no_console_suspend' parameter and try to log +kernel messages using the serial console. This may provide you with some +information about the reasons of the suspend (resume) failure. Alternatively, +it may be possible to use a FireWire port for debugging with firescope +(http://v3.sk/~lkundrak/firescope/). On x86 it is also possible to +use the PM_TRACE mechanism documented in Documentation/power/s2ram.rst . + +2. Testing suspend to RAM (STR) +=============================== + +To verify that the STR works, it is generally more convenient to use the s2ram +tool available from http://suspend.sf.net and documented at +http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK). 
+ +Namely, after writing "freezer", "devices", "platform", "processors", or "core" +into /sys/power/pm_test (available if the kernel is compiled with +CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding +to given string. The STR test modes are defined in the same way as for +hibernation, so please refer to Section 1 for more information about them. In +particular, the "core" test allows you to test everything except for the actual +invocation of the platform firmware in order to put the system into the sleep +state. + +Among other things, the testing with the help of /sys/power/pm_test may allow +you to identify drivers that fail to suspend or resume their devices. They +should be unloaded every time before an STR transition. + +Next, you can follow the instructions at S2RAM_LINK to test the system, but if +it does not work "out of the box", you may need to boot it with +"init=/bin/bash" and test s2ram in the minimal configuration. In that case, +you may be able to search for failing drivers by following the procedure +analogous to the one described in section 1. If you find some failing drivers, +you will have to unload them every time before an STR transition (ie. before +you run s2ram), and please report the problems with them. + +There is a debugfs entry which shows the suspend to RAM statistics. Here is an +example of its output:: + + # mount -t debugfs none /sys/kernel/debug + # cat /sys/kernel/debug/suspend_stats + success: 20 + fail: 5 + failed_freeze: 0 + failed_prepare: 0 + failed_suspend: 5 + failed_suspend_noirq: 0 + failed_resume: 0 + failed_resume_noirq: 0 + failures: + last_failed_dev: alarm + adc + last_failed_errno: -16 + -16 + last_failed_step: suspend + suspend + +Field success means the success number of suspend to RAM, and field fail means +the failure number. Others are the failure number of different steps of suspend +to RAM. suspend_stats just lists the last 2 failed devices, error number and +failed step of suspend. diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt deleted file mode 100644 index 708f87f78a75..000000000000 --- a/Documentation/power/basic-pm-debugging.txt +++ /dev/null @@ -1,254 +0,0 @@ -Debugging hibernation and suspend - (C) 2007 Rafael J. Wysocki , GPL - -1. Testing hibernation (aka suspend to disk or STD) - -To check if hibernation works, you can try to hibernate in the "reboot" mode: - -# echo reboot > /sys/power/disk -# echo disk > /sys/power/state - -and the system should create a hibernation image, reboot, resume and get back to -the command prompt where you have started the transition. If that happens, -hibernation is most likely to work correctly. Still, you need to repeat the -test at least a couple of times in a row for confidence. [This is necessary, -because some problems only show up on a second attempt at suspending and -resuming the system.] Moreover, hibernating in the "reboot" and "shutdown" -modes causes the PM core to skip some platform-related callbacks which on ACPI -systems might be necessary to make hibernation work. Thus, if your machine fails -to hibernate or resume in the "reboot" mode, you should try the "platform" mode: - -# echo platform > /sys/power/disk -# echo disk > /sys/power/state - -which is the default and recommended mode of hibernation. - -Unfortunately, the "platform" mode of hibernation does not work on some systems -with broken BIOSes. 
In such cases the "shutdown" mode of hibernation might -work: - -# echo shutdown > /sys/power/disk -# echo disk > /sys/power/state - -(it is similar to the "reboot" mode, but it requires you to press the power -button to make the system resume). - -If neither "platform" nor "shutdown" hibernation mode works, you will need to -identify what goes wrong. - -a) Test modes of hibernation - -To find out why hibernation fails on your system, you can use a special testing -facility available if the kernel is compiled with CONFIG_PM_DEBUG set. Then, -there is the file /sys/power/pm_test that can be used to make the hibernation -core run in a test mode. There are 5 test modes available: - -freezer -- test the freezing of processes - -devices -- test the freezing of processes and suspending of devices - -platform -- test the freezing of processes, suspending of devices and platform - global control methods(*) - -processors -- test the freezing of processes, suspending of devices, platform - global control methods(*) and the disabling of nonboot CPUs - -core -- test the freezing of processes, suspending of devices, platform global - control methods(*), the disabling of nonboot CPUs and suspending of - platform/system devices - -(*) the platform global control methods are only available on ACPI systems - and are only tested if the hibernation mode is set to "platform" - -To use one of them it is necessary to write the corresponding string to -/sys/power/pm_test (eg. "devices" to test the freezing of processes and -suspending devices) and issue the standard hibernation commands. For example, -to use the "devices" test mode along with the "platform" mode of hibernation, -you should do the following: - -# echo devices > /sys/power/pm_test -# echo platform > /sys/power/disk -# echo disk > /sys/power/state - -Then, the kernel will try to freeze processes, suspend devices, wait a few -seconds (5 by default, but configurable by the suspend.pm_test_delay module -parameter), resume devices and thaw processes. If "platform" is written to -/sys/power/pm_test , then after suspending devices the kernel will additionally -invoke the global control methods (eg. ACPI global control methods) used to -prepare the platform firmware for hibernation. Next, it will wait a -configurable number of seconds and invoke the platform (eg. ACPI) global -methods used to cancel hibernation etc. - -Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal -hibernation/suspend operations. Also, when open for reading, /sys/power/pm_test -contains a space-separated list of all available tests (including "none" that -represents the normal functionality) in which the current test level is -indicated by square brackets. - -Generally, as you can see, each test level is more "invasive" than the previous -one and the "core" level tests the hardware and drivers as deeply as possible -without creating a hibernation image. Obviously, if the "devices" test fails, -the "platform" test will fail as well and so on. Thus, as a rule of thumb, you -should try the test modes starting from "freezer", through "devices", "platform" -and "processors" up to "core" (repeat the test on each level a couple of times -to make sure that any random factors are avoided). - -If the "freezer" test fails, there is a task that cannot be frozen (in that case -it usually is possible to identify the offending task by analysing the output of -dmesg obtained after the failing test). 
Failure at this level usually means -that there is a problem with the tasks freezer subsystem that should be -reported. - -If the "devices" test fails, most likely there is a driver that cannot suspend -or resume its device (in the latter case the system may hang or become unstable -after the test, so please take that into consideration). To find this driver, -you can carry out a binary search according to the rules: -- if the test fails, unload a half of the drivers currently loaded and repeat -(that would probably involve rebooting the system, so always note what drivers -have been loaded before the test), -- if the test succeeds, load a half of the drivers you have unloaded most -recently and repeat. - -Once you have found the failing driver (there can be more than just one of -them), you have to unload it every time before hibernation. In that case please -make sure to report the problem with the driver. - -It is also possible that the "devices" test will still fail after you have -unloaded all modules. In that case, you may want to look in your kernel -configuration for the drivers that can be compiled as modules (and test again -with these drivers compiled as modules). You may also try to use some special -kernel command line options such as "noapic", "noacpi" or even "acpi=off". - -If the "platform" test fails, there is a problem with the handling of the -platform (eg. ACPI) firmware on your system. In that case the "platform" mode -of hibernation is not likely to work. You can try the "shutdown" mode, but that -is rather a poor man's workaround. - -If the "processors" test fails, the disabling/enabling of nonboot CPUs does not -work (of course, this only may be an issue on SMP systems) and the problem -should be reported. In that case you can also try to switch the nonboot CPUs -off and on using the /sys/devices/system/cpu/cpu*/online sysfs attributes and -see if that works. - -If the "core" test fails, which means that suspending of the system/platform -devices has failed (these devices are suspended on one CPU with interrupts off), -the problem is most probably hardware-related and serious, so it should be -reported. - -A failure of any of the "platform", "processors" or "core" tests may cause your -system to hang or become unstable, so please beware. Such a failure usually -indicates a serious problem that very well may be related to the hardware, but -please report it anyway. - -b) Testing minimal configuration - -If all of the hibernation test modes work, you can boot the system with the -"init=/bin/bash" command line parameter and attempt to hibernate in the -"reboot", "shutdown" and "platform" modes. If that does not work, there -probably is a problem with a driver statically compiled into the kernel and you -can try to compile more drivers as modules, so that they can be tested -individually. Otherwise, there is a problem with a modular driver and you can -find it by loading a half of the modules you normally use and binary searching -in accordance with the algorithm: -- if there are n modules loaded and the attempt to suspend and resume fails, -unload n/2 of the modules and try again (that would probably involve rebooting -the system), -- if there are n modules loaded and the attempt to suspend and resume succeeds, -load n/2 modules more and try again. - -Again, if you find the offending module(s), it(they) must be unloaded every time -before hibernation, and please report the problem with it(them). 
- -c) Using the "test_resume" hibernation option - -/sys/power/disk generally tells the kernel what to do after creating a -hibernation image. One of the available options is "test_resume" which -causes the just created image to be used for immediate restoration. Namely, -after doing: - -# echo test_resume > /sys/power/disk -# echo disk > /sys/power/state - -a hibernation image will be created and a resume from it will be triggered -immediately without involving the platform firmware in any way. - -That test can be used to check if failures to resume from hibernation are -related to bad interactions with the platform firmware. That is, if the above -works every time, but resume from actual hibernation does not work or is -unreliable, the platform firmware may be responsible for the failures. - -On architectures and platforms that support using different kernels to restore -hibernation images (that is, the kernel used to read the image from storage and -load it into memory is different from the one included in the image) or support -kernel address space randomization, it also can be used to check if failures -to resume may be related to the differences between the restore and image -kernels. - -d) Advanced debugging - -In case that hibernation does not work on your system even in the minimal -configuration and compiling more drivers as modules is not practical or some -modules cannot be unloaded, you can use one of the more advanced debugging -techniques to find the problem. First, if there is a serial port in your box, -you can boot the kernel with the 'no_console_suspend' parameter and try to log -kernel messages using the serial console. This may provide you with some -information about the reasons of the suspend (resume) failure. Alternatively, -it may be possible to use a FireWire port for debugging with firescope -(http://v3.sk/~lkundrak/firescope/). On x86 it is also possible to -use the PM_TRACE mechanism documented in Documentation/power/s2ram.txt . - -2. Testing suspend to RAM (STR) - -To verify that the STR works, it is generally more convenient to use the s2ram -tool available from http://suspend.sf.net and documented at -http://en.opensuse.org/SDB:Suspend_to_RAM (S2RAM_LINK). - -Namely, after writing "freezer", "devices", "platform", "processors", or "core" -into /sys/power/pm_test (available if the kernel is compiled with -CONFIG_PM_DEBUG set) the suspend code will work in the test mode corresponding -to given string. The STR test modes are defined in the same way as for -hibernation, so please refer to Section 1 for more information about them. In -particular, the "core" test allows you to test everything except for the actual -invocation of the platform firmware in order to put the system into the sleep -state. - -Among other things, the testing with the help of /sys/power/pm_test may allow -you to identify drivers that fail to suspend or resume their devices. They -should be unloaded every time before an STR transition. - -Next, you can follow the instructions at S2RAM_LINK to test the system, but if -it does not work "out of the box", you may need to boot it with -"init=/bin/bash" and test s2ram in the minimal configuration. In that case, -you may be able to search for failing drivers by following the procedure -analogous to the one described in section 1. If you find some failing drivers, -you will have to unload them every time before an STR transition (ie. before -you run s2ram), and please report the problems with them. 
-
-There is a debugfs entry which shows the suspend to RAM statistics. Here is an
-example of its output.
-	# mount -t debugfs none /sys/kernel/debug
-	# cat /sys/kernel/debug/suspend_stats
-	success: 20
-	fail: 5
-	failed_freeze: 0
-	failed_prepare: 0
-	failed_suspend: 5
-	failed_suspend_noirq: 0
-	failed_resume: 0
-	failed_resume_noirq: 0
-	failures:
-		last_failed_dev:	alarm
-					adc
-		last_failed_errno:	-16
-					-16
-		last_failed_step:	suspend
-					suspend
-Field success means the success number of suspend to RAM, and field fail means
-the failure number. Others are the failure number of different steps of suspend
-to RAM. suspend_stats just lists the last 2 failed devices, error number and
-failed step of suspend.
diff --git a/Documentation/power/charger-manager.rst b/Documentation/power/charger-manager.rst
new file mode 100644
index 000000000000..84fab9376792
--- /dev/null
+++ b/Documentation/power/charger-manager.rst
@@ -0,0 +1,205 @@
+===============
+Charger Manager
+===============
+
+	(C) 2011 MyungJoo Ham , GPL
+
+Charger Manager provides in-kernel battery charger management for systems
+that require temperature monitoring during the suspend-to-RAM state, where
+each battery may have multiple chargers attached, and where the userland
+wants to look at the aggregated information of the multiple chargers.
+
+Charger Manager is a platform_driver with power-supply-class entries.
+An instance of Charger Manager (a platform-device created with Charger-Manager)
+represents an independent battery with chargers. If there are multiple
+batteries with their own chargers acting independently in a system,
+the system may need multiple instances of Charger Manager.
+
+1. Introduction
+===============
+
+Charger Manager supports the following:
+
+* Support for multiple chargers (e.g., a device with USB, AC, and solar panels)
+  A system may have multiple chargers (or power sources) and some of
+  them may be activated at the same time. Each charger may have its
+  own power-supply-class and each power-supply-class can provide
+  different information about the battery status. This framework
+  aggregates charger-related information from multiple sources and
+  shows combined information as a single power-supply-class.
+
+* Support for in suspend-to-RAM polling (with suspend_again callback)
+  While the battery is being charged and the system is in suspend-to-RAM,
+  we may need to monitor the battery health by looking at the ambient or
+  battery temperature. We can accomplish this by waking up the system
+  periodically. However, such a method unnecessarily wakes up devices,
+  tasks, and user processes that are supposed to be kept suspended just
+  to monitor the battery health. That, in turn, incurs unnecessary power
+  consumption and slows down the charging process. Even worse, such peak
+  power consumption can stop chargers in the middle of charging
+  (external power input < device power consumption), which affects not
+  only the charging time, but also the lifespan of the battery.
+
+  Charger Manager provides a function "cm_suspend_again" that can be
+  used as the suspend_again callback of platform_suspend_ops. If the
+  platform requires tasks other than cm_suspend_again, it may implement
+  its own suspend_again callback that calls cm_suspend_again in the
+  middle (see the sketch below). Normally, the platform will need to
+  resume and suspend some devices that are used by Charger Manager.
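+
+  For illustration only (the foo_* names are made up and not part of any
+  in-tree platform), such a wrapper might look like this::
+
+	#include <linux/suspend.h>
+	#include <linux/power/charger-manager.h>
+
+	/* Hypothetical platform code wrapping cm_suspend_again(). */
+	static bool foo_suspend_again(void)
+	{
+		bool suspend_again;
+
+		foo_resume_monitoring_devices();	/* assumed helper */
+		suspend_again = cm_suspend_again();
+		foo_suspend_monitoring_devices();	/* assumed helper */
+
+		return suspend_again;
+	}
+
+	static const struct platform_suspend_ops foo_suspend_ops = {
+		.valid		= suspend_valid_only_mem,
+		.enter		= foo_suspend_enter,	/* assumed */
+		.suspend_again	= foo_suspend_again,
+	};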
+
+* Support for premature full-battery event handling
+  If the battery voltage drops by "fullbatt_vchkdrop_uV" after
+  "fullbatt_vchkdrop_ms" from the full-battery event, the framework
+  restarts charging. This check is also performed while suspended, by
+  setting the wakeup time accordingly and using suspend_again.
+
+* Support for uevent-notify
+  With the charger-related events, the device sends
+  notification to users with UEVENT.
+
+2. Global Charger-Manager Data related to suspend_again
+=======================================================
+In order to set up Charger Manager with the suspend-again feature
+(in-suspend monitoring), the user should provide charger_global_desc
+with setup_charger_manager(`struct charger_global_desc *`).
+This charger_global_desc data for in-suspend monitoring is global
+as the name suggests. Thus, the user needs to provide it only once even
+if there are multiple batteries. If there are multiple batteries, the
+multiple instances of Charger Manager share the same charger_global_desc
+and it will manage in-suspend monitoring for all instances of Charger Manager.
+
+The user needs to provide all three entries of `struct charger_global_desc`
+properly in order to activate in-suspend monitoring:
+
+`char *rtc_name;`
+	The name of the rtc (e.g., "rtc0") used to wake up the system from
+	suspend for Charger Manager. The alarm interrupt (AIE) of the rtc
+	should be able to wake up the system from suspend. Charger Manager
+	saves and restores the alarm value and uses the previously-defined
+	alarm if it is going to go off earlier than Charger Manager so that
+	Charger Manager does not interfere with previously-defined alarms.
+
+`bool (*rtc_only_wakeup)(void);`
+	This callback should let CM know whether
+	the wakeup-from-suspend is caused only by the alarm of "rtc" in the
+	same struct. If any other wakeup source triggered the
+	wakeup, it should return false. If the "rtc" is the only wakeup
+	reason, it should return true.
+
+`bool assume_timer_stops_in_suspend;`
+	If true, Charger Manager assumes that
+	the timer (CM uses jiffies as the timer) stops during suspend. Then, CM
+	assumes that the suspend duration is the same as the alarm length.
+
+
+3. How to set up suspend_again
+==============================
+Charger Manager provides a function "extern bool cm_suspend_again(void)".
+When cm_suspend_again is called, it monitors every battery. The suspend_ops
+callback of the system's platform_suspend_ops can call the cm_suspend_again
+function to know whether Charger Manager wants to suspend again or not.
+If there are no other devices or tasks that want to use the suspend_again
+feature, the platform_suspend_ops may directly refer to cm_suspend_again
+for its suspend_again callback.
+
+cm_suspend_again() returns true (meaning "I want to suspend again")
+if the system was woken up by Charger Manager and the polling
+(in-suspend monitoring) results in "normal".
+
+4. Charger-Manager Data (struct charger_desc)
+=============================================
+For each battery charged independently from other batteries (if a series of
+batteries are charged by a single charger, they are counted as one independent
+battery), an instance of Charger Manager is attached to it. The following
+elements of struct charger_desc describe the battery:
+
+`char *psy_name;`
+	The power-supply-class name of the battery. Default is
+	"battery" if psy_name is NULL. Users can access the psy entries
+	at "/sys/class/power_supply/[psy_name]/".
+
+`enum polling_modes polling_mode;`
+	CM_POLL_DISABLE:
+		do not poll this battery.
+	CM_POLL_ALWAYS:
+		always poll this battery.
+	CM_POLL_EXTERNAL_POWER_ONLY:
+		poll this battery if and only if an external power
+		source is attached.
+	CM_POLL_CHARGING_ONLY:
+		poll this battery if and only if the battery is being charged.
+
+`unsigned int fullbatt_vchkdrop_ms; / unsigned int fullbatt_vchkdrop_uV;`
+	If both have non-zero values, Charger Manager will check the
+	battery voltage drop fullbatt_vchkdrop_ms after the battery is fully
+	charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger
+	Manager will try to recharge the battery by disabling and enabling
+	chargers. Recharging on the voltage-drop condition alone (without the
+	delay condition) needs to be implemented with hardware interrupts from
+	fuel gauges or charger devices/chips.
+
+`unsigned int fullbatt_uV;`
+	If specified with a non-zero value, Charger Manager assumes
+	that the battery is full (capacity = 100) if the battery is not being
+	charged and the battery voltage is equal to or greater than
+	fullbatt_uV.
+
+`unsigned int polling_interval_ms;`
+	Required polling interval in ms. Charger Manager will poll
+	this battery every polling_interval_ms or more frequently.
+
+`enum data_source battery_present;`
+	CM_BATTERY_PRESENT:
+		assume that the battery exists.
+	CM_NO_BATTERY:
+		assume that the battery does not exist.
+	CM_FUEL_GAUGE:
+		get battery presence information from the fuel gauge.
+	CM_CHARGER_STAT:
+		get battery presence from the chargers.
+
+`char **psy_charger_stat;`
+	An array ending with NULL that has the power-supply-class names of
+	the chargers. Each power-supply-class should provide "PRESENT" (if
+	battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an
+	external power source is attached or not), and "STATUS" (shows whether
+	the battery is {"FULL" or not FULL} or {"FULL", "Charging",
+	"Discharging", "NotCharging"}).
+
+`int num_charger_regulators; / struct regulator_bulk_data *charger_regulators;`
+	Regulators representing the chargers in the form used by the
+	regulator framework's bulk functions.
+
+`char *psy_fuel_gauge;`
+	Power-supply-class name of the fuel gauge.
+
+`int (*temperature_out_of_range)(int *mC); / bool measure_battery_temp;`
+	This callback returns 0 if the temperature is safe for charging,
+	a positive number if it is too hot to charge, and a negative number
+	if it is too cold to charge. Through the variable mC, the callback
+	returns the temperature in 1/1000 of a degree centigrade.
+	The temperature source is either the battery or the ambient
+	environment, according to the value of measure_battery_temp.
+
+
+5. Notify Charger-Manager of charger events: cm_notify_event()
+==============================================================
+If a charger event needs to be delivered to
+Charger Manager, the charger device driver that triggers the event can call
+cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager.
+In the function, psy is the charger driver's power_supply pointer, which is
+associated with Charger-Manager. The parameter "type"
+is the same as the irq's type (enum cm_event_types). The event message "msg" is
+optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS".
+
+6. Other Considerations
+=======================
+
+The system should be configured to wake up on charger/battery-related events
+such as battery-pulled-out, charger-pulled-out, charger-inserted,
+DCIN-over/under-voltage, charger-stopped, and others critical to chargers.
+At least the following should wake up the system from a suspend: +a) charger-on/off b) external-power-in/out c) battery-in/out (while charging) + +It is usually accomplished by configuring the PMIC as a wakeup source. diff --git a/Documentation/power/charger-manager.txt b/Documentation/power/charger-manager.txt deleted file mode 100644 index 9ff1105e58d6..000000000000 --- a/Documentation/power/charger-manager.txt +++ /dev/null @@ -1,200 +0,0 @@ -Charger Manager - (C) 2011 MyungJoo Ham , GPL - -Charger Manager provides in-kernel battery charger management that -requires temperature monitoring during suspend-to-RAM state -and where each battery may have multiple chargers attached and the userland -wants to look at the aggregated information of the multiple chargers. - -Charger Manager is a platform_driver with power-supply-class entries. -An instance of Charger Manager (a platform-device created with Charger-Manager) -represents an independent battery with chargers. If there are multiple -batteries with their own chargers acting independently in a system, -the system may need multiple instances of Charger Manager. - -1. Introduction -=============== - -Charger Manager supports the following: - -* Support for multiple chargers (e.g., a device with USB, AC, and solar panels) - A system may have multiple chargers (or power sources) and some of - they may be activated at the same time. Each charger may have its - own power-supply-class and each power-supply-class can provide - different information about the battery status. This framework - aggregates charger-related information from multiple sources and - shows combined information as a single power-supply-class. - -* Support for in suspend-to-RAM polling (with suspend_again callback) - While the battery is being charged and the system is in suspend-to-RAM, - we may need to monitor the battery health by looking at the ambient or - battery temperature. We can accomplish this by waking up the system - periodically. However, such a method wakes up devices unnecessarily for - monitoring the battery health and tasks, and user processes that are - supposed to be kept suspended. That, in turn, incurs unnecessary power - consumption and slow down charging process. Or even, such peak power - consumption can stop chargers in the middle of charging - (external power input < device power consumption), which not - only affects the charging time, but the lifespan of the battery. - - Charger Manager provides a function "cm_suspend_again" that can be - used as suspend_again callback of platform_suspend_ops. If the platform - requires tasks other than cm_suspend_again, it may implement its own - suspend_again callback that calls cm_suspend_again in the middle. - Normally, the platform will need to resume and suspend some devices - that are used by Charger Manager. - -* Support for premature full-battery event handling - If the battery voltage drops by "fullbatt_vchkdrop_uV" after - "fullbatt_vchkdrop_ms" from the full-battery event, the framework - restarts charging. This check is also performed while suspended by - setting wakeup time accordingly and using suspend_again. - -* Support for uevent-notify - With the charger-related events, the device sends - notification to users with UEVENT. - -2. 
Global Charger-Manager Data related with suspend_again -======================================================== -In order to setup Charger Manager with suspend-again feature -(in-suspend monitoring), the user should provide charger_global_desc -with setup_charger_manager(struct charger_global_desc *). -This charger_global_desc data for in-suspend monitoring is global -as the name suggests. Thus, the user needs to provide only once even -if there are multiple batteries. If there are multiple batteries, the -multiple instances of Charger Manager share the same charger_global_desc -and it will manage in-suspend monitoring for all instances of Charger Manager. - -The user needs to provide all the three entries properly in order to activate -in-suspend monitoring: - -struct charger_global_desc { - -char *rtc_name; - : The name of rtc (e.g., "rtc0") used to wakeup the system from - suspend for Charger Manager. The alarm interrupt (AIE) of the rtc - should be able to wake up the system from suspend. Charger Manager - saves and restores the alarm value and use the previously-defined - alarm if it is going to go off earlier than Charger Manager so that - Charger Manager does not interfere with previously-defined alarms. - -bool (*rtc_only_wakeup)(void); - : This callback should let CM know whether - the wakeup-from-suspend is caused only by the alarm of "rtc" in the - same struct. If there is any other wakeup source triggered the - wakeup, it should return false. If the "rtc" is the only wakeup - reason, it should return true. - -bool assume_timer_stops_in_suspend; - : if true, Charger Manager assumes that - the timer (CM uses jiffies as timer) stops during suspend. Then, CM - assumes that the suspend-duration is same as the alarm length. -}; - -3. How to setup suspend_again -============================= -Charger Manager provides a function "extern bool cm_suspend_again(void)". -When cm_suspend_again is called, it monitors every battery. The suspend_ops -callback of the system's platform_suspend_ops can call cm_suspend_again -function to know whether Charger Manager wants to suspend again or not. -If there are no other devices or tasks that want to use suspend_again -feature, the platform_suspend_ops may directly refer to cm_suspend_again -for its suspend_again callback. - -The cm_suspend_again() returns true (meaning "I want to suspend again") -if the system was woken up by Charger Manager and the polling -(in-suspend monitoring) results in "normal". - -4. Charger-Manager Data (struct charger_desc) -============================================= -For each battery charged independently from other batteries (if a series of -batteries are charged by a single charger, they are counted as one independent -battery), an instance of Charger Manager is attached to it. - -struct charger_desc { - -char *psy_name; - : The power-supply-class name of the battery. Default is - "battery" if psy_name is NULL. Users can access the psy entries - at "/sys/class/power_supply/[psy_name]/". - -enum polling_modes polling_mode; - : CM_POLL_DISABLE: do not poll this battery. - CM_POLL_ALWAYS: always poll this battery. - CM_POLL_EXTERNAL_POWER_ONLY: poll this battery if and only if - an external power source is attached. - CM_POLL_CHARGING_ONLY: poll this battery if and only if the - battery is being charged. 
- -unsigned int fullbatt_vchkdrop_ms; -unsigned int fullbatt_vchkdrop_uV; - : If both have non-zero values, Charger Manager will check the - battery voltage drop fullbatt_vchkdrop_ms after the battery is fully - charged. If the voltage drop is over fullbatt_vchkdrop_uV, Charger - Manager will try to recharge the battery by disabling and enabling - chargers. Recharge with voltage drop condition only (without delay - condition) is needed to be implemented with hardware interrupts from - fuel gauges or charger devices/chips. - -unsigned int fullbatt_uV; - : If specified with a non-zero value, Charger Manager assumes - that the battery is full (capacity = 100) if the battery is not being - charged and the battery voltage is equal to or greater than - fullbatt_uV. - -unsigned int polling_interval_ms; - : Required polling interval in ms. Charger Manager will poll - this battery every polling_interval_ms or more frequently. - -enum data_source battery_present; - : CM_BATTERY_PRESENT: assume that the battery exists. - CM_NO_BATTERY: assume that the battery does not exists. - CM_FUEL_GAUGE: get battery presence information from fuel gauge. - CM_CHARGER_STAT: get battery presence from chargers. - -char **psy_charger_stat; - : An array ending with NULL that has power-supply-class names of - chargers. Each power-supply-class should provide "PRESENT" (if - battery_present is "CM_CHARGER_STAT"), "ONLINE" (shows whether an - external power source is attached or not), and "STATUS" (shows whether - the battery is {"FULL" or not FULL} or {"FULL", "Charging", - "Discharging", "NotCharging"}). - -int num_charger_regulators; -struct regulator_bulk_data *charger_regulators; - : Regulators representing the chargers in the form for - regulator framework's bulk functions. - -char *psy_fuel_gauge; - : Power-supply-class name of the fuel gauge. - -int (*temperature_out_of_range)(int *mC); -bool measure_battery_temp; - : This callback returns 0 if the temperature is safe for charging, - a positive number if it is too hot to charge, and a negative number - if it is too cold to charge. With the variable mC, the callback returns - the temperature in 1/1000 of centigrade. - The source of temperature can be battery or ambient one according to - the value of measure_battery_temp. -}; - -5. Notify Charger-Manager of charger events: cm_notify_event() -========================================================= -If there is an charger event is required to notify -Charger Manager, a charger device driver that triggers the event can call -cm_notify_event(psy, type, msg) to notify the corresponding Charger Manager. -In the function, psy is the charger driver's power_supply pointer, which is -associated with Charger-Manager. The parameter "type" -is the same as irq's type (enum cm_event_types). The event message "msg" is -optional and is effective only if the event type is "UNDESCRIBED" or "OTHERS". - -6. Other Considerations -======================= - -At the charger/battery-related events such as battery-pulled-out, -charger-pulled-out, charger-inserted, DCIN-over/under-voltage, charger-stopped, -and others critical to chargers, the system should be configured to wake up. -At least the following should wake up the system from a suspend: -a) charger-on/off b) external-power-in/out c) battery-in/out (while charging) - -It is usually accomplished by configuring the PMIC as a wakeup source. 
diff --git a/Documentation/power/drivers-testing.rst b/Documentation/power/drivers-testing.rst new file mode 100644 index 000000000000..e53f1999fc39 --- /dev/null +++ b/Documentation/power/drivers-testing.rst @@ -0,0 +1,51 @@ +==================================================== +Testing suspend and resume support in device drivers +==================================================== + + (C) 2007 Rafael J. Wysocki , GPL + +1. Preparing the test system +============================ + +Unfortunately, to effectively test the support for the system-wide suspend and +resume transitions in a driver, it is necessary to suspend and resume a fully +functional system with this driver loaded. Moreover, that should be done +several times, preferably several times in a row, and separately for hibernation +(aka suspend to disk or STD) and suspend to RAM (STR), because each of these +cases involves slightly different operations and different interactions with +the machine's BIOS. + +Of course, for this purpose the test system has to be known to suspend and +resume without the driver being tested. Thus, if possible, you should first +resolve all suspend/resume-related problems in the test system before you start +testing the new driver. Please see Documentation/power/basic-pm-debugging.rst +for more information about the debugging of suspend/resume functionality. + +2. Testing the driver +===================== + +Once you have resolved the suspend/resume-related problems with your test system +without the new driver, you are ready to test it: + +a) Build the driver as a module, load it and try the test modes of hibernation + (see: Documentation/power/basic-pm-debugging.rst, 1). + +b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and + "platform" modes (see: Documentation/power/basic-pm-debugging.rst, 1). + +c) Compile the driver directly into the kernel and try the test modes of + hibernation. + +d) Attempt to hibernate with the driver compiled directly into the kernel + in the "reboot", "shutdown" and "platform" modes. + +e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.rst, + 2). [As far as the STR tests are concerned, it should not matter whether or + not the driver is built as a module.] + +f) Attempt to suspend to RAM using the s2ram tool with the driver loaded + (see: Documentation/power/basic-pm-debugging.rst, 2). + +Each of the above tests should be repeated several times and the STD tests +should be mixed with the STR tests. If any of them fails, the driver cannot be +regarded as suspend/resume-safe. diff --git a/Documentation/power/drivers-testing.txt b/Documentation/power/drivers-testing.txt deleted file mode 100644 index 638afdf4d6b8..000000000000 --- a/Documentation/power/drivers-testing.txt +++ /dev/null @@ -1,46 +0,0 @@ -Testing suspend and resume support in device drivers - (C) 2007 Rafael J. Wysocki , GPL - -1. Preparing the test system - -Unfortunately, to effectively test the support for the system-wide suspend and -resume transitions in a driver, it is necessary to suspend and resume a fully -functional system with this driver loaded. Moreover, that should be done -several times, preferably several times in a row, and separately for hibernation -(aka suspend to disk or STD) and suspend to RAM (STR), because each of these -cases involves slightly different operations and different interactions with -the machine's BIOS. 
-
-Of course, for this purpose the test system has to be known to suspend and
-resume without the driver being tested. Thus, if possible, you should first
-resolve all suspend/resume-related problems in the test system before you start
-testing the new driver. Please see Documentation/power/basic-pm-debugging.txt
-for more information about the debugging of suspend/resume functionality.
-
-2. Testing the driver
-
-Once you have resolved the suspend/resume-related problems with your test system
-without the new driver, you are ready to test it:
-
-a) Build the driver as a module, load it and try the test modes of hibernation
-   (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-b) Load the driver and attempt to hibernate in the "reboot", "shutdown" and
-   "platform" modes (see: Documentation/power/basic-pm-debugging.txt, 1).
-
-c) Compile the driver directly into the kernel and try the test modes of
-   hibernation.
-
-d) Attempt to hibernate with the driver compiled directly into the kernel
-   in the "reboot", "shutdown" and "platform" modes.
-
-e) Try the test modes of suspend (see: Documentation/power/basic-pm-debugging.txt,
-   2). [As far as the STR tests are concerned, it should not matter whether or
-   not the driver is built as a module.]
-
-f) Attempt to suspend to RAM using the s2ram tool with the driver loaded
-   (see: Documentation/power/basic-pm-debugging.txt, 2).
-
-Each of the above tests should be repeated several times and the STD tests
-should be mixed with the STR tests. If any of them fails, the driver cannot be
-regarded as suspend/resume-safe.
diff --git a/Documentation/power/energy-model.rst b/Documentation/power/energy-model.rst
new file mode 100644
index 000000000000..90a345d57ae9
--- /dev/null
+++ b/Documentation/power/energy-model.rst
@@ -0,0 +1,147 @@
+====================
+Energy Model of CPUs
+====================
+
+1. Overview
+-----------
+
+The Energy Model (EM) framework serves as an interface between drivers knowing
+the power consumed by CPUs at various performance levels, and the kernel
+subsystems willing to use that information to make energy-aware decisions.
+
+The source of the information about the power consumed by CPUs can vary greatly
+from one platform to another. These power costs can be estimated using
+devicetree data in some cases. In others, the firmware will know better.
+Alternatively, userspace might be best positioned. And so on. To avoid having
+each and every client subsystem re-implement support for each and every
+possible source of information on its own, the EM framework acts as an
+abstraction layer which standardizes the format of power cost tables in the
+kernel, hence avoiding redundant work.
+
+The figure below depicts an example of drivers (Arm-specific here, but the
+approach is applicable to any architecture) providing power costs to the EM
+framework, and interested clients reading the data from it::
+
+       +---------------+  +-----------------+  +---------------+
+       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
+       +---------------+  +-----------------+  +---------------+
+               |                   | em_pd_energy()    |
+               |                   | em_cpu_get()      |
+               +---------+         |         +---------+
+                         |         |         |
+                         v         v         v
+                        +---------------------+
+                        |    Energy Model     |
+                        |      Framework      |
+                        +---------------------+
+                           ^       ^       ^
+                           |       |       | em_register_perf_domain()
+                +----------+       |       +---------+
+                |                  |                 |
+        +---------------+  +---------------+  +--------------+
+        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
+        +---------------+  +---------------+  +--------------+
+                ^                  ^                 ^
+                |                  |                 |
+        +--------------+   +---------------+  +--------------+
+        | Device Tree  |   |   Firmware    |  |      ?       |
+        +--------------+   +---------------+  +--------------+
+
+The EM framework manages power cost tables per 'performance domain' in the
+system. A performance domain is a group of CPUs whose performance is scaled
+together. Performance domains generally have a 1-to-1 mapping with CPUFreq
+policies. All CPUs in a performance domain are required to have the same
+micro-architecture. CPUs in different performance domains can have different
+micro-architectures.
+
+
+2. Core APIs
+------------
+
+2.1 Config options
+^^^^^^^^^^^^^^^^^^
+
+CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
+
+
+2.2 Registration of performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Drivers are expected to register performance domains into the EM framework by
+calling the following API::
+
+  int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
+			      struct em_data_callback *cb);
+
+Drivers must specify the CPUs of the performance domains using the cpumask
+argument, and provide a callback function returning <frequency, power> tuples
+for each capacity state. The callback function provided by the driver is free
+to fetch data from any relevant location (DT, firmware, ...), and by any means
+deemed necessary. See Section 3. for an example of a driver implementing this
+callback, and kernel/power/energy_model.c for further documentation on this
+API.
+
+
+2.3 Accessing performance domains
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Subsystems interested in the energy model of a CPU can retrieve it using the
+em_cpu_get() API. The energy model tables are allocated once upon creation of
+the performance domains, and kept in memory untouched.
+
+The energy consumed by a performance domain can be estimated using the
+em_pd_energy() API. The estimation is performed assuming that the schedutil
+CPUfreq governor is in use.
+
+More details about the above APIs can be found in include/linux/energy_model.h.
+
+
+3. Example driver
+-----------------
+
+This section provides a simple example of a CPUFreq driver registering a
+performance domain in the Energy Model framework using the (fake) 'foo'
+protocol. The driver implements an est_power() function to be provided to the
+EM framework::
+
+  -> drivers/cpufreq/foo_cpufreq.c
+
+  static int est_power(unsigned long *mW, unsigned long *KHz, int cpu)
+  {
+	long freq, power;
+
+	/* Use the 'foo' protocol to ceil the frequency */
+	freq = foo_get_freq_ceil(cpu, *KHz);
+	if (freq < 0)
+		return freq;
+
+	/* Estimate the power cost for the CPU at the relevant freq. */
+	power = foo_estimate_power(cpu, freq);
+	if (power < 0)
+		return power;
+
+	/* Return the values to the EM framework */
+	*mW = power;
+	*KHz = freq;
+
+	return 0;
+  }
+
+  static int foo_cpufreq_init(struct cpufreq_policy *policy)
+  {
+	struct em_data_callback em_cb = EM_DATA_CB(est_power);
+	int nr_opp, ret;
+
+	/* Do the actual CPUFreq init work ... */
+	ret = do_foo_cpufreq_init(policy);
+	if (ret)
+		return ret;
+
+	/* Find the number of OPPs for this policy */
+	nr_opp = foo_get_nr_opp(policy);
+
+	/* And register the new performance domain */
+	em_register_perf_domain(policy->cpus, nr_opp, &em_cb);
+
+	return 0;
+  }
diff --git a/Documentation/power/energy-model.txt b/Documentation/power/energy-model.txt
deleted file mode 100644
index a2b0ae4c76bd..000000000000
--- a/Documentation/power/energy-model.txt
+++ /dev/null
@@ -1,144 +0,0 @@
- ====================
- Energy Model of CPUs
- ====================
-
-1. Overview
------------
-
-The Energy Model (EM) framework serves as an interface between drivers knowing
-the power consumed by CPUs at various performance levels, and the kernel
-subsystems willing to use that information to make energy-aware decisions.
-
-The source of the information about the power consumed by CPUs can vary greatly
-from one platform to another. These power costs can be estimated using
-devicetree data in some cases. In others, the firmware will know better.
-Alternatively, userspace might be best positioned. And so on. In order to avoid
-each and every client subsystem to re-implement support for each and every
-possible source of information on its own, the EM framework intervenes as an
-abstraction layer which standardizes the format of power cost tables in the
-kernel, hence enabling to avoid redundant work.
-
-The figure below depicts an example of drivers (Arm-specific here, but the
-approach is applicable to any architecture) providing power costs to the EM
-framework, and interested clients reading the data from it.
-
-       +---------------+  +-----------------+  +---------------+
-       | Thermal (IPA) |  | Scheduler (EAS) |  |     Other     |
-       +---------------+  +-----------------+  +---------------+
-               |                   | em_pd_energy()    |
-               |                   | em_cpu_get()      |
-               +---------+         |         +---------+
-                         |         |         |
-                         v         v         v
-                        +---------------------+
-                        |    Energy Model     |
-                        |      Framework      |
-                        +---------------------+
-                           ^       ^       ^
-                           |       |       | em_register_perf_domain()
-                +----------+       |       +---------+
-                |                  |                 |
-        +---------------+  +---------------+  +--------------+
-        |  cpufreq-dt   |  |   arm_scmi    |  |    Other     |
-        +---------------+  +---------------+  +--------------+
-                ^                  ^                 ^
-                |                  |                 |
-        +--------------+   +---------------+  +--------------+
-        | Device Tree  |   |   Firmware    |  |      ?       |
-        +--------------+   +---------------+  +--------------+
-
-The EM framework manages power cost tables per 'performance domain' in the
-system. A performance domain is a group of CPUs whose performance is scaled
-together. Performance domains generally have a 1-to-1 mapping with CPUFreq
-policies. All CPUs in a performance domain are required to have the same
-micro-architecture. CPUs in different performance domains can have different
-micro-architectures.
-
-
-2. Core APIs
-------------
-
-  2.1 Config options
-
-CONFIG_ENERGY_MODEL must be enabled to use the EM framework.
- - - 2.2 Registration of performance domains - -Drivers are expected to register performance domains into the EM framework by -calling the following API: - - int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, - struct em_data_callback *cb); - -Drivers must specify the CPUs of the performance domains using the cpumask -argument, and provide a callback function returning tuples -for each capacity state. The callback function provided by the driver is free -to fetch data from any relevant location (DT, firmware, ...), and by any mean -deemed necessary. See Section 3. for an example of driver implementing this -callback, and kernel/power/energy_model.c for further documentation on this -API. - - - 2.3 Accessing performance domains - -Subsystems interested in the energy model of a CPU can retrieve it using the -em_cpu_get() API. The energy model tables are allocated once upon creation of -the performance domains, and kept in memory untouched. - -The energy consumed by a performance domain can be estimated using the -em_pd_energy() API. The estimation is performed assuming that the schedutil -CPUfreq governor is in use. - -More details about the above APIs can be found in include/linux/energy_model.h. - - -3. Example driver ------------------ - -This section provides a simple example of a CPUFreq driver registering a -performance domain in the Energy Model framework using the (fake) 'foo' -protocol. The driver implements an est_power() function to be provided to the -EM framework. - - -> drivers/cpufreq/foo_cpufreq.c - -01 static int est_power(unsigned long *mW, unsigned long *KHz, int cpu) -02 { -03 long freq, power; -04 -05 /* Use the 'foo' protocol to ceil the frequency */ -06 freq = foo_get_freq_ceil(cpu, *KHz); -07 if (freq < 0); -08 return freq; -09 -10 /* Estimate the power cost for the CPU at the relevant freq. */ -11 power = foo_estimate_power(cpu, freq); -12 if (power < 0); -13 return power; -14 -15 /* Return the values to the EM framework */ -16 *mW = power; -17 *KHz = freq; -18 -19 return 0; -20 } -21 -22 static int foo_cpufreq_init(struct cpufreq_policy *policy) -23 { -24 struct em_data_callback em_cb = EM_DATA_CB(est_power); -25 int nr_opp, ret; -26 -27 /* Do the actual CPUFreq init work ... */ -28 ret = do_foo_cpufreq_init(policy); -29 if (ret) -30 return ret; -31 -32 /* Find the number of OPPs for this policy */ -33 nr_opp = foo_get_nr_opp(policy); -34 -35 /* And register the new performance domain */ -36 em_register_perf_domain(policy->cpus, nr_opp, &em_cb); -37 -38 return 0; -39 } diff --git a/Documentation/power/freezing-of-tasks.rst b/Documentation/power/freezing-of-tasks.rst new file mode 100644 index 000000000000..ef110fe55e82 --- /dev/null +++ b/Documentation/power/freezing-of-tasks.rst @@ -0,0 +1,244 @@ +================= +Freezing of tasks +================= + +(C) 2007 Rafael J. Wysocki , GPL + +I. What is the freezing of tasks? +================================= + +The freezing of tasks is a mechanism by which user space processes and some +kernel threads are controlled during hibernation or system-wide suspend (on some +architectures). + +II. How does it work? +===================== + +There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN +and PF_FREEZER_SKIP (the last one is auxiliary). 
The tasks that have +PF_NOFREEZE unset (all user space processes and some kernel threads) are +regarded as 'freezable' and treated in a special way before the system enters a +suspend state as well as before a hibernation image is created (in what follows +we only consider hibernation, but the description also applies to suspend). + +Namely, as the first step of the hibernation procedure the function +freeze_processes() (defined in kernel/power/process.c) is called. A system-wide +variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate +whether the system is to undergo a freezing operation. And freeze_processes() +sets this variable. After this, it executes try_to_freeze_tasks() that sends a +fake signal to all user space processes, and wakes up all the kernel threads. +All freezable tasks must react to that by calling try_to_freeze(), which +results in a call to __refrigerator() (defined in kernel/freezer.c), which sets +the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes +it loop until PF_FROZEN is cleared for it. Then, we say that the task is +'frozen' and therefore the set of functions handling this mechanism is referred +to as 'the freezer' (these functions are defined in kernel/power/process.c, +kernel/freezer.c & include/linux/freezer.h). User space processes are generally +frozen before kernel threads. + +__refrigerator() must not be called directly. Instead, use the +try_to_freeze() function (defined in include/linux/freezer.h), that checks +if the task is to be frozen and makes the task enter __refrigerator(). + +For user space processes try_to_freeze() is called automatically from the +signal-handling code, but the freezable kernel threads need to call it +explicitly in suitable places or use the wait_event_freezable() or +wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) +that combine interruptible sleep with checking if the task is to be frozen and +calling try_to_freeze(). The main loop of a freezable kernel thread may look +like the following one:: + + set_freezable(); + do { + hub_events(); + wait_event_freezable(khubd_wait, + !list_empty(&hub_event_list) || + kthread_should_stop()); + } while (!kthread_should_stop() || !list_empty(&hub_event_list)); + +(from drivers/usb/core/hub.c::hub_thread()). + +If a freezable kernel thread fails to call try_to_freeze() after the freezer has +initiated a freezing operation, the freezing of tasks will fail and the entire +hibernation operation will be cancelled. For this reason, freezable kernel +threads must call try_to_freeze() somewhere or use one of the +wait_event_freezable() and wait_event_freezable_timeout() macros. + +After the system memory state has been restored from a hibernation image and +devices have been reinitialized, the function thaw_processes() is called in +order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that +have been frozen leave __refrigerator() and continue running. 
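+
+For comparison, below is a minimal sketch (not taken from the kernel tree;
+foo_thread() and foo_do_work() are made-up names) of a freezable kernel
+thread that calls try_to_freeze() explicitly instead of using the
+wait_event_freezable*() helpers::
+
+	static int foo_thread(void *data)
+	{
+		set_freezable();	/* clear PF_NOFREEZE for this thread */
+
+		while (!kthread_should_stop()) {
+			foo_do_work();	/* hypothetical work item */
+
+			/*
+			 * Enter __refrigerator() here if a freezing operation
+			 * is in progress; returns after the thaw.
+			 */
+			try_to_freeze();
+
+			schedule_timeout_interruptible(HZ);
+		}
+
+		return 0;
+	}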
+
+
+Rationale behind the functions dealing with freezing and thawing of tasks
+-------------------------------------------------------------------------
+
+freeze_processes():
+  - freezes only userspace tasks
+
+freeze_kernel_threads():
+  - freezes all tasks (including kernel threads) because we can't freeze
+    kernel threads without freezing userspace tasks
+
+thaw_kernel_threads():
+  - thaws only kernel threads; this is particularly useful if we need to do
+    anything special in between thawing of kernel threads and thawing of
+    userspace tasks, or if we want to postpone the thawing of userspace tasks
+
+thaw_processes():
+  - thaws all tasks (including kernel threads) because we can't thaw userspace
+    tasks without thawing kernel threads
+
+
+III. Which kernel threads are freezable?
+========================================
+
+Kernel threads are not freezable by default. However, a kernel thread may clear
+PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE
+directly is not allowed). From this point on it is regarded as freezable
+and must call try_to_freeze() in a suitable place.
+
+IV. Why do we do that?
+======================
+
+Generally speaking, there are a couple of reasons to use the freezing of tasks:
+
+1. The principal reason is to prevent filesystems from being damaged after
+   hibernation. At the moment we have no simple means of checkpointing
+   filesystems, so if there are any modifications made to filesystem data and/or
+   metadata on disks, we cannot bring them back to the state from before the
+   modifications. At the same time each hibernation image contains some
+   filesystem-related information that must be consistent with the state of the
+   on-disk data and metadata after the system memory state has been restored
+   from the image (otherwise the filesystems will be damaged in a nasty way,
+   usually making them almost impossible to repair). We therefore freeze
+   tasks that might cause the on-disk filesystems' data and metadata to be
+   modified after the hibernation image has been created and before the
+   system is finally powered off. The majority of these are user space
+   processes, but if any of the kernel threads may cause something like this
+   to happen, they have to be freezable.
+
+2. Next, to create the hibernation image we need to free a sufficient amount of
+   memory (approximately 50% of available RAM) and we need to do that before
+   devices are deactivated, because we generally need them for swapping out.
+   Then, after the memory for the image has been freed, we don't want tasks
+   to allocate additional memory and we prevent them from doing that by
+   freezing them earlier. [Of course, this also means that device drivers
+   should not allocate substantial amounts of memory from their .suspend()
+   callbacks before hibernation, but this is a separate issue.]
+
+3. The third reason is to prevent user space processes and some kernel threads
+   from interfering with the suspending and resuming of devices. A user space
+   process running on a second CPU while we are suspending devices may, for
+   example, be troublesome and without the freezing of tasks we would need some
+   safeguards against race conditions that might occur in such a case.
+
+Although Linus Torvalds doesn't like the freezing of tasks, he said this in one
+of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608):
+
+"RJW:> Why we freeze tasks at all or why we freeze kernel threads?
+
+Linus: In many ways, 'at all'.
+
+I **do** realize the IO request queue issues, and that we cannot actually do
+s2ram with some devices in the middle of a DMA. So we want to be able to
+avoid *that*, there's no question about that. And I suspect that stopping
+user threads and then waiting for a sync is practically one of the easier
+ways to do so.
+
+So in practice, the 'at all' may become a 'why freeze kernel threads?' and
+freezing user threads I don't find really objectionable."
+
+Still, there are kernel threads that may want to be freezable. For example, if
+a kernel thread that belongs to a device driver accesses the device directly, it
+in principle needs to know when the device is suspended, so that it doesn't try
+to access it at that time. However, if the kernel thread is freezable, it will
+be frozen before the driver's .suspend() callback is executed and it will be
+thawed after the driver's .resume() callback has run, so it won't be accessing
+the device while it's suspended.
+
+4. Another reason for freezing tasks is to prevent user space processes from
+   realizing that a hibernation (or suspend) operation is taking place. Ideally,
+   user space processes should not notice that such a system-wide operation has
+   occurred and should continue running without any problems after the restore
+   (or resume from suspend). Unfortunately, in the most general case this
+   is quite difficult to achieve without the freezing of tasks. Consider,
+   for example, a process that depends on all CPUs being online while it's
+   running. Since we need to disable nonboot CPUs during the hibernation,
+   if this process is not frozen, it may notice that the number of CPUs has
+   changed and may start to work incorrectly because of that.
+
+V. Are there any problems related to the freezing of tasks?
+===========================================================
+
+Yes, there are.
+
+First of all, the freezing of kernel threads may be tricky if they depend on
+one another. For example, if kernel thread A waits for a completion (in the
+TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B
+and B is frozen in the meantime, then A will be blocked until B is thawed, which
+may be undesirable. That's why kernel threads are not freezable by default.
+
+Second, there are the following two problems related to the freezing of user
+space processes:
+
+1. Putting processes into an uninterruptible sleep distorts the load average.
+2. Now that we have FUSE, plus the framework for doing device drivers in
+   userspace, it gets even more complicated because some userspace processes are
+   now doing the sorts of things that kernel threads do
+   (https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html).
+
+Problem 1. seems to be fixable, although it hasn't been fixed so far. The
+other one is more serious, but it seems that we can work around it by using
+hibernation (and suspend) notifiers (in that case, though, we won't be able to
+prevent user space processes from realizing that the hibernation is taking
+place).
+
+There are also problems that the freezing of tasks tends to expose, although
+they are not directly related to it. For example, if request_firmware() is
+called from a device driver's .resume() routine, it will time out and eventually
+fail, because the userland process that should respond to the request is frozen
+at this point. So, seemingly, the failure is due to the freezing of tasks.
+Suppose, however, that the firmware file is located on a filesystem accessible
+only through another device that hasn't been resumed yet. In that case,
+request_firmware() will fail regardless of whether or not the freezing of tasks
+is used. Consequently, the problem is not really related to the freezing of
+tasks, since it generally exists anyway.
+
+A driver must have all the firmware images it may need in RAM before suspend()
+is called. If keeping them is not practical, for example due to their size, they
+must be requested early enough using the suspend notifier API described in
+Documentation/driver-api/pm/notifiers.rst.
+
+VI. Are there any precautions to be taken to prevent freezing failures?
+=======================================================================
+
+Yes, there are.
+
+First of all, grabbing the 'system_transition_mutex' lock to mutually exclude
+a piece of code from system-wide sleep such as suspend/hibernation is not
+encouraged. If possible, that piece of code must instead hook onto the
+suspend/hibernation notifiers to achieve mutual exclusion. Look at the
+CPU-Hotplug code (kernel/cpu.c) for an example.
+
+However, if that is not feasible, and grabbing 'system_transition_mutex' is
+deemed necessary, directly calling mutex_[un]lock(&system_transition_mutex) is
+strongly discouraged, since that could lead to freezing failures: if the
+suspend/hibernate code has successfully acquired the 'system_transition_mutex'
+lock, the other entity fails to acquire the lock and gets blocked in the
+TASK_UNINTERRUPTIBLE state. As a consequence, the freezer would not be able to
+freeze that task, leading to freezing failure.
+
+The [un]lock_system_sleep() APIs, however, are safe to use in this scenario,
+because they ask the freezer to skip freezing the task, which is anyway
+"frozen enough" as it is blocked on 'system_transition_mutex', which will be
+released only after the entire suspend/hibernation sequence is complete.
+So, to summarize, use [un]lock_system_sleep() instead of directly using
+mutex_[un]lock(&system_transition_mutex). That will prevent freezing failures.
+
+VII. Miscellaneous
+==================
+
+/sys/power/pm_freeze_timeout controls the maximum time it may take to freeze
+all user space processes or all freezable kernel threads, in milliseconds.
+The default value is 20000; the valid range is that of an unsigned integer.
diff --git a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
deleted file mode 100644
index cd283190855a..000000000000
--- a/Documentation/power/freezing-of-tasks.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Freezing of tasks
-	(C) 2007 Rafael J. Wysocki , GPL
-
-I. What is the freezing of tasks?
-
-The freezing of tasks is a mechanism by which user space processes and some
-kernel threads are controlled during hibernation or system-wide suspend (on some
-architectures).
-
-II. How does it work?
-
-There are three per-task flags used for that, PF_NOFREEZE, PF_FROZEN
-and PF_FREEZER_SKIP (the last one is auxiliary). The tasks that have
-PF_NOFREEZE unset (all user space processes and some kernel threads) are
-regarded as 'freezable' and treated in a special way before the system enters a
-suspend state as well as before a hibernation image is created (in what follows
-we only consider hibernation, but the description also applies to suspend).
-
-Namely, as the first step of the hibernation procedure the function
-freeze_processes() (defined in kernel/power/process.c) is called.
A system-wide -variable system_freezing_cnt (as opposed to a per-task flag) is used to indicate -whether the system is to undergo a freezing operation. And freeze_processes() -sets this variable. After this, it executes try_to_freeze_tasks() that sends a -fake signal to all user space processes, and wakes up all the kernel threads. -All freezable tasks must react to that by calling try_to_freeze(), which -results in a call to __refrigerator() (defined in kernel/freezer.c), which sets -the task's PF_FROZEN flag, changes its state to TASK_UNINTERRUPTIBLE and makes -it loop until PF_FROZEN is cleared for it. Then, we say that the task is -'frozen' and therefore the set of functions handling this mechanism is referred -to as 'the freezer' (these functions are defined in kernel/power/process.c, -kernel/freezer.c & include/linux/freezer.h). User space processes are generally -frozen before kernel threads. - -__refrigerator() must not be called directly. Instead, use the -try_to_freeze() function (defined in include/linux/freezer.h), that checks -if the task is to be frozen and makes the task enter __refrigerator(). - -For user space processes try_to_freeze() is called automatically from the -signal-handling code, but the freezable kernel threads need to call it -explicitly in suitable places or use the wait_event_freezable() or -wait_event_freezable_timeout() macros (defined in include/linux/freezer.h) -that combine interruptible sleep with checking if the task is to be frozen and -calling try_to_freeze(). The main loop of a freezable kernel thread may look -like the following one: - - set_freezable(); - do { - hub_events(); - wait_event_freezable(khubd_wait, - !list_empty(&hub_event_list) || - kthread_should_stop()); - } while (!kthread_should_stop() || !list_empty(&hub_event_list)); - -(from drivers/usb/core/hub.c::hub_thread()). - -If a freezable kernel thread fails to call try_to_freeze() after the freezer has -initiated a freezing operation, the freezing of tasks will fail and the entire -hibernation operation will be cancelled. For this reason, freezable kernel -threads must call try_to_freeze() somewhere or use one of the -wait_event_freezable() and wait_event_freezable_timeout() macros. - -After the system memory state has been restored from a hibernation image and -devices have been reinitialized, the function thaw_processes() is called in -order to clear the PF_FROZEN flag for each frozen task. Then, the tasks that -have been frozen leave __refrigerator() and continue running. - - -Rationale behind the functions dealing with freezing and thawing of tasks: -------------------------------------------------------------------------- - -freeze_processes(): - - freezes only userspace tasks - -freeze_kernel_threads(): - - freezes all tasks (including kernel threads) because we can't freeze - kernel threads without freezing userspace tasks - -thaw_kernel_threads(): - - thaws only kernel threads; this is particularly useful if we need to do - anything special in between thawing of kernel threads and thawing of - userspace tasks, or if we want to postpone the thawing of userspace tasks - -thaw_processes(): - - thaws all tasks (including kernel threads) because we can't thaw userspace - tasks without thawing kernel threads - - -III. Which kernel threads are freezable? - -Kernel threads are not freezable by default. However, a kernel thread may clear -PF_NOFREEZE for itself by calling set_freezable() (the resetting of PF_NOFREEZE -directly is not allowed). 
From this point it is regarded as freezable -and must call try_to_freeze() in a suitable place. - -IV. Why do we do that? - -Generally speaking, there is a couple of reasons to use the freezing of tasks: - -1. The principal reason is to prevent filesystems from being damaged after -hibernation. At the moment we have no simple means of checkpointing -filesystems, so if there are any modifications made to filesystem data and/or -metadata on disks, we cannot bring them back to the state from before the -modifications. At the same time each hibernation image contains some -filesystem-related information that must be consistent with the state of the -on-disk data and metadata after the system memory state has been restored from -the image (otherwise the filesystems will be damaged in a nasty way, usually -making them almost impossible to repair). We therefore freeze tasks that might -cause the on-disk filesystems' data and metadata to be modified after the -hibernation image has been created and before the system is finally powered off. -The majority of these are user space processes, but if any of the kernel threads -may cause something like this to happen, they have to be freezable. - -2. Next, to create the hibernation image we need to free a sufficient amount of -memory (approximately 50% of available RAM) and we need to do that before -devices are deactivated, because we generally need them for swapping out. Then, -after the memory for the image has been freed, we don't want tasks to allocate -additional memory and we prevent them from doing that by freezing them earlier. -[Of course, this also means that device drivers should not allocate substantial -amounts of memory from their .suspend() callbacks before hibernation, but this -is a separate issue.] - -3. The third reason is to prevent user space processes and some kernel threads -from interfering with the suspending and resuming of devices. A user space -process running on a second CPU while we are suspending devices may, for -example, be troublesome and without the freezing of tasks we would need some -safeguards against race conditions that might occur in such a case. - -Although Linus Torvalds doesn't like the freezing of tasks, he said this in one -of the discussions on LKML (http://lkml.org/lkml/2007/4/27/608): - -"RJW:> Why we freeze tasks at all or why we freeze kernel threads? - -Linus: In many ways, 'at all'. - -I _do_ realize the IO request queue issues, and that we cannot actually do -s2ram with some devices in the middle of a DMA. So we want to be able to -avoid *that*, there's no question about that. And I suspect that stopping -user threads and then waiting for a sync is practically one of the easier -ways to do so. - -So in practice, the 'at all' may become a 'why freeze kernel threads?' and -freezing user threads I don't find really objectionable." - -Still, there are kernel threads that may want to be freezable. For example, if -a kernel thread that belongs to a device driver accesses the device directly, it -in principle needs to know when the device is suspended, so that it doesn't try -to access it at that time. However, if the kernel thread is freezable, it will -be frozen before the driver's .suspend() callback is executed and it will be -thawed after the driver's .resume() callback has run, so it won't be accessing -the device while it's suspended. - -4. Another reason for freezing tasks is to prevent user space processes from -realizing that hibernation (or suspend) operation takes place. 
Ideally, user -space processes should not notice that such a system-wide operation has occurred -and should continue running without any problems after the restore (or resume -from suspend). Unfortunately, in the most general case this is quite difficult -to achieve without the freezing of tasks. Consider, for example, a process -that depends on all CPUs being online while it's running. Since we need to -disable nonboot CPUs during the hibernation, if this process is not frozen, it -may notice that the number of CPUs has changed and may start to work incorrectly -because of that. - -V. Are there any problems related to the freezing of tasks? - -Yes, there are. - -First of all, the freezing of kernel threads may be tricky if they depend one -on another. For example, if kernel thread A waits for a completion (in the -TASK_UNINTERRUPTIBLE state) that needs to be done by freezable kernel thread B -and B is frozen in the meantime, then A will be blocked until B is thawed, which -may be undesirable. That's why kernel threads are not freezable by default. - -Second, there are the following two problems related to the freezing of user -space processes: -1. Putting processes into an uninterruptible sleep distorts the load average. -2. Now that we have FUSE, plus the framework for doing device drivers in -userspace, it gets even more complicated because some userspace processes are -now doing the sorts of things that kernel threads do -(https://lists.linux-foundation.org/pipermail/linux-pm/2007-May/012309.html). - -The problem 1. seems to be fixable, although it hasn't been fixed so far. The -other one is more serious, but it seems that we can work around it by using -hibernation (and suspend) notifiers (in that case, though, we won't be able to -avoid the realization by the user space processes that the hibernation is taking -place). - -There are also problems that the freezing of tasks tends to expose, although -they are not directly related to it. For example, if request_firmware() is -called from a device driver's .resume() routine, it will timeout and eventually -fail, because the user land process that should respond to the request is frozen -at this point. So, seemingly, the failure is due to the freezing of tasks. -Suppose, however, that the firmware file is located on a filesystem accessible -only through another device that hasn't been resumed yet. In that case, -request_firmware() will fail regardless of whether or not the freezing of tasks -is used. Consequently, the problem is not really related to the freezing of -tasks, since it generally exists anyway. - -A driver must have all firmwares it may need in RAM before suspend() is called. -If keeping them is not practical, for example due to their size, they must be -requested early enough using the suspend notifier API described in -Documentation/driver-api/pm/notifiers.rst. - -VI. Are there any precautions to be taken to prevent freezing failures? - -Yes, there are. - -First of all, grabbing the 'system_transition_mutex' lock to mutually exclude a piece of code -from system-wide sleep such as suspend/hibernation is not encouraged. -If possible, that piece of code must instead hook onto the suspend/hibernation -notifiers to achieve mutual exclusion. Look at the CPU-Hotplug code -(kernel/cpu.c) for an example. 
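To make the notifier-based approach concrete, here is a minimal sketch; my_dev_pm_callback() and my_pm_nb are invented names, while register_pm_notifier() and the PM_* events are the actual interfaces declared in include/linux/suspend.h::

    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/notifier.h>
    #include <linux/suspend.h>

    /* Hypothetical handler: runs before tasks are frozen and after they
     * are thawed, so it can quiesce and reactivate the driver without
     * the driver's kernel thread having to be freezable. */
    static int my_dev_pm_callback(struct notifier_block *nb,
                                  unsigned long action, void *ptr)
    {
            switch (action) {
            case PM_HIBERNATION_PREPARE:
            case PM_SUSPEND_PREPARE:
                    /* stop activity; tasks are not frozen yet */
                    break;
            case PM_POST_HIBERNATION:
            case PM_POST_SUSPEND:
                    /* resume activity; tasks have been thawed */
                    break;
            }
            return NOTIFY_DONE;
    }

    static struct notifier_block my_pm_nb = {
            .notifier_call = my_dev_pm_callback,
    };

    static int __init my_dev_init(void)
    {
            return register_pm_notifier(&my_pm_nb);
    }
    module_init(my_dev_init);

Because the notifier is invoked by the suspend/hibernation core itself, before user space is frozen and after it is thawed, no additional locking against system-wide sleep is needed.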
-
-However, if that is not feasible and grabbing 'system_transition_mutex' is deemed
-necessary, do not call mutex_[un]lock(&system_transition_mutex) directly, since
-doing so can lead to freezing failures: if the suspend/hibernate code has already
-acquired 'system_transition_mutex', the other entity blocks on the lock in the
-TASK_UNINTERRUPTIBLE state. As a consequence, the freezer cannot freeze that
-task, and the freezing of tasks fails.
-
-The [un]lock_system_sleep() APIs, on the other hand, are safe to use in this
-scenario: they ask the freezer to skip freezing the calling task, since it is
-already "frozen enough" while blocked on 'system_transition_mutex', which is
-released only after the entire suspend/hibernation sequence is complete.
-To summarize, use [un]lock_system_sleep() instead of calling
-mutex_[un]lock(&system_transition_mutex) directly, and freezing failures of this
-kind will be avoided.
-
-VII. Miscellaneous
-/sys/power/pm_freeze_timeout controls how long, at most, the freezer may take to
-freeze all user space processes or all freezable kernel threads, in milliseconds.
-The default value is 20000; any unsigned integer value is accepted.
diff --git a/Documentation/power/index.rst b/Documentation/power/index.rst
new file mode 100644
index 000000000000..20415f21e48a
--- /dev/null
+++ b/Documentation/power/index.rst
@@ -0,0 +1,46 @@
+:orphan:
+
+================
+Power Management
+================
+
+.. toctree::
+   :maxdepth: 1
+
+   apm-acpi
+   basic-pm-debugging
+   charger-manager
+   drivers-testing
+   energy-model
+   freezing-of-tasks
+   interface
+   opp
+   pci
+   pm_qos_interface
+   power_supply_class
+   runtime_pm
+   s2ram
+   suspend-and-cpuhotplug
+   suspend-and-interrupts
+   swsusp-and-swap-files
+   swsusp-dmcrypt
+   swsusp
+   video
+   tricks
+
+   userland-swsusp
+
+   powercap/powercap
+
+   regulator/consumer
+   regulator/design
+   regulator/machine
+   regulator/overview
+   regulator/regulator
+
+.. only:: subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/power/interface.rst b/Documentation/power/interface.rst
new file mode 100644
index 000000000000..8d270ed27228
--- /dev/null
+++ b/Documentation/power/interface.rst
@@ -0,0 +1,79 @@
+===========================================
+Power Management Interface for System Sleep
+===========================================
+
+Copyright (c) 2016 Intel Corp., Rafael J. Wysocki
+
+The power management subsystem provides userspace with a unified sysfs interface
+for system sleep regardless of the underlying system architecture or platform.
+The interface is located in the /sys/power/ directory (assuming that sysfs is
+mounted at /sys).
+
+/sys/power/state is the system sleep state control file.
+
+Reading from it returns a list of supported sleep states, encoded as:
+
+- 'freeze' (Suspend-to-Idle)
+- 'standby' (Power-On Suspend)
+- 'mem' (Suspend-to-RAM)
+- 'disk' (Suspend-to-Disk)
+
+Suspend-to-Idle is always supported. Suspend-to-Disk is also always supported,
+as long as the kernel has been configured to support hibernation at all
+(i.e. CONFIG_HIBERNATION is set in the kernel configuration file). Support
+for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the
+platform.
+
+If one of the strings listed in /sys/power/state is written to it, the system
+will attempt to transition into the corresponding sleep state. Refer to
+Documentation/admin-guide/pm/sleep-states.rst for a description of each of
+those states.
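As a user space illustration of writing one of these strings (not part of the patch above; request_suspend() is a made-up helper, while /sys/power/state and the 'mem' string are exactly as documented)::

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Request Suspend-to-RAM by writing "mem" to /sys/power/state.
     * Needs sufficient privileges; the write does not return until the
     * system has resumed (or the transition failed with errno set). */
    static int request_suspend(const char *state)
    {
            int fd = open("/sys/power/state", O_WRONLY);
            int ret = 0;

            if (fd < 0) {
                    perror("open /sys/power/state");
                    return -1;
            }
            if (write(fd, state, strlen(state)) < 0) {
                    perror("sleep state transition failed");
                    ret = -1;
            }
            close(fd);
            return ret;
    }

    int main(void)
    {
            return request_suspend("mem") ? 1 : 0;
    }

Reading the same file first (with a plain read()) shows which of the strings above the running kernel actually supports.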
+ +/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk). +Specifically, it tells the kernel what to do after creating a hibernation image. + +Reading from it returns a list of supported options encoded as: + +- 'platform' (put the system into sleep using a platform-provided method) +- 'shutdown' (shut the system down) +- 'reboot' (reboot the system) +- 'suspend' (trigger a Suspend-to-RAM transition) +- 'test_resume' (resume-after-hibernation test mode) + +The currently selected option is printed in square brackets. + +The 'platform' option is only available if the platform provides a special +mechanism to put the system to sleep after creating a hibernation image (ACPI +does that, for example). The 'suspend' option is available if Suspend-to-RAM +is supported. Refer to Documentation/power/basic-pm-debugging.rst for the +description of the 'test_resume' option. + +To select an option, write the string representing it to /sys/power/disk. + +/sys/power/image_size controls the size of hibernation images. + +It can be written a string representing a non-negative integer that will be +used as a best-effort upper limit of the image size, in bytes. The hibernation +core will do its best to ensure that the image size will not exceed that number. +However, if that turns out to be impossible to achieve, a hibernation image will +still be created and its size will be as small as possible. In particular, +writing '0' to this file will enforce hibernation images to be as small as +possible. + +Reading from this file returns the current image size limit, which is set to +around 2/5 of available RAM by default. + +/sys/power/pm_trace controls the PM trace mechanism saving the last suspend +or resume event point in the RTC across reboots. + +It helps to debug hard lockups or reboots due to device driver failures that +occur during system suspend or resume (which is more common) more effectively. + +If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume +event point in turn will be stored in the RTC memory (overwriting the actual +RTC information), so it will survive a system crash if one occurs right after +storing it and it can be used later to identify the driver that caused the crash +to happen (see Documentation/power/s2ram.rst for more information). + +Initially it contains '0' which may be changed to '1' by writing a string +representing a nonzero integer into it. diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt deleted file mode 100644 index 27df7f98668a..000000000000 --- a/Documentation/power/interface.txt +++ /dev/null @@ -1,77 +0,0 @@ -Power Management Interface for System Sleep - -Copyright (c) 2016 Intel Corp., Rafael J. Wysocki - -The power management subsystem provides userspace with a unified sysfs interface -for system sleep regardless of the underlying system architecture or platform. -The interface is located in the /sys/power/ directory (assuming that sysfs is -mounted at /sys). - -/sys/power/state is the system sleep state control file. - -Reading from it returns a list of supported sleep states, encoded as: - -'freeze' (Suspend-to-Idle) -'standby' (Power-On Suspend) -'mem' (Suspend-to-RAM) -'disk' (Suspend-to-Disk) - -Suspend-to-Idle is always supported. Suspend-to-Disk is always supported -too as long the kernel has been configured to support hibernation at all -(ie. CONFIG_HIBERNATION is set in the kernel configuration file). 
Support -for Suspend-to-RAM and Power-On Suspend depends on the capabilities of the -platform. - -If one of the strings listed in /sys/power/state is written to it, the system -will attempt to transition into the corresponding sleep state. Refer to -Documentation/admin-guide/pm/sleep-states.rst for a description of each of -those states. - -/sys/power/disk controls the operating mode of hibernation (Suspend-to-Disk). -Specifically, it tells the kernel what to do after creating a hibernation image. - -Reading from it returns a list of supported options encoded as: - -'platform' (put the system into sleep using a platform-provided method) -'shutdown' (shut the system down) -'reboot' (reboot the system) -'suspend' (trigger a Suspend-to-RAM transition) -'test_resume' (resume-after-hibernation test mode) - -The currently selected option is printed in square brackets. - -The 'platform' option is only available if the platform provides a special -mechanism to put the system to sleep after creating a hibernation image (ACPI -does that, for example). The 'suspend' option is available if Suspend-to-RAM -is supported. Refer to Documentation/power/basic-pm-debugging.txt for the -description of the 'test_resume' option. - -To select an option, write the string representing it to /sys/power/disk. - -/sys/power/image_size controls the size of hibernation images. - -It can be written a string representing a non-negative integer that will be -used as a best-effort upper limit of the image size, in bytes. The hibernation -core will do its best to ensure that the image size will not exceed that number. -However, if that turns out to be impossible to achieve, a hibernation image will -still be created and its size will be as small as possible. In particular, -writing '0' to this file will enforce hibernation images to be as small as -possible. - -Reading from this file returns the current image size limit, which is set to -around 2/5 of available RAM by default. - -/sys/power/pm_trace controls the PM trace mechanism saving the last suspend -or resume event point in the RTC across reboots. - -It helps to debug hard lockups or reboots due to device driver failures that -occur during system suspend or resume (which is more common) more effectively. - -If /sys/power/pm_trace contains '1', the fingerprint of each suspend/resume -event point in turn will be stored in the RTC memory (overwriting the actual -RTC information), so it will survive a system crash if one occurs right after -storing it and it can be used later to identify the driver that caused the crash -to happen (see Documentation/power/s2ram.txt for more information). - -Initially it contains '0' which may be changed to '1' by writing a string -representing a nonzero integer into it. diff --git a/Documentation/power/opp.rst b/Documentation/power/opp.rst new file mode 100644 index 000000000000..b3cf1def9dee --- /dev/null +++ b/Documentation/power/opp.rst @@ -0,0 +1,379 @@ +========================================== +Operating Performance Points (OPP) Library +========================================== + +(C) 2009-2010 Nishanth Menon , Texas Instruments Incorporated + +.. Contents + + 1. Introduction + 2. Initial OPP List Registration + 3. OPP Search Functions + 4. OPP Availability Control Functions + 5. OPP Data Retrieval Functions + 6. Data Structures + +1. Introduction +=============== + +1.1 What is an Operating Performance Point (OPP)? 
+-------------------------------------------------
+
+Today's complex SoCs consist of multiple sub-modules working in conjunction.
+In an operational system executing varied use cases, not all modules in the SoC
+need to function at their highest performing frequency all the time. To
+facilitate this, sub-modules in a SoC are grouped into domains, allowing some
+domains to run at lower voltage and frequency while other domains run at
+higher voltage/frequency pairs.
+
+The set of discrete tuples consisting of frequency and voltage pairs that
+the device will support per domain is called the set of Operating Performance
+Points, or OPPs.
+
+As an example:
+
+Let us consider an MPU device which supports the following:
+{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V},
+{1GHz at minimum voltage of 1.3V}
+
+We can represent these as three OPPs as the following {Hz, uV} tuples:
+
+- {300000000, 1000000}
+- {800000000, 1200000}
+- {1000000000, 1300000}
+
+1.2 Operating Performance Points Library
+----------------------------------------
+
+The OPP library provides a set of helper functions to organize and query the
+OPP information. The library is located in drivers/base/power/opp.c and the
+header is located in include/linux/pm_opp.h. The OPP library can be enabled by
+setting CONFIG_PM_OPP in the power management menuconfig menu. The OPP library
+depends on CONFIG_PM, as certain SoC frameworks, such as Texas Instruments'
+OMAP framework, allow the system to optionally boot at a certain OPP without
+needing cpufreq.
+
+Typical usage of the OPP library is as follows::
+
+ (users)        -> registers a set of default OPPs              -> (library)
+ SoC framework  -> modifies on required cases certain OPPs      -> OPP layer
+                -> queries to search/retrieve information       ->
+
+The OPP layer expects each domain to be represented by a unique device pointer.
+The SoC framework registers a set of initial OPPs per device with the OPP
+layer. This list is expected to be optimally small, typically around 5 OPPs per
+device. This initial list contains a set of OPPs that the framework expects to
+be safely enabled by default in the system.
+
+Note on OPP Availability
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+As the system proceeds to operate, the SoC framework may choose to make certain
+OPPs available or not available on each device based on various external
+factors. Example usage: Thermal management or other exceptional situations
+where the SoC framework might choose to disable a higher frequency OPP to
+safely continue operations until that OPP can be re-enabled, if possible.
+
+The OPP library supports this concept in its implementation. The following
+operational functions operate only on available opps:
+opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count
+
+dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which
+can then be used for the dev_pm_opp_enable/disable functions to make an opp
+available as required.
+
+WARNING: Users of the OPP library should refresh their availability count using
+get_opp_count if the dev_pm_opp_enable/disable functions are invoked for a
+device. The exact mechanism to trigger these, or the notification mechanism to
+other dependent subsystems such as cpufreq, is left to the discretion of the
+SoC-specific framework that uses the OPP library. Similar care needs to be
+taken to refresh the cpufreq table after these operations.
+
+2. Initial OPP List Registration
+================================
+The SoC implementation calls the dev_pm_opp_add function iteratively to add
+OPPs per device. It is expected that the SoC framework will register the OPP
+entries optimally; typical numbers are less than 5 per device. The list
+generated by registering the OPPs is maintained by the OPP library throughout
+the device operation. The SoC framework can subsequently control the
+availability of the OPPs dynamically using the dev_pm_opp_enable / disable
+functions.
+
+dev_pm_opp_add
+    Add a new OPP for a specific domain represented by the device pointer.
+    The OPP is defined using the frequency and voltage. Once added, the OPP
+    is assumed to be available and control of its availability can be done
+    with the dev_pm_opp_enable/disable functions. The OPP library internally
+    stores and manages this information in the opp struct. This function may
+    be used by the SoC framework to define an optimal list as per the
+    demands of the SoC usage environment.
+
+    WARNING:
+        Do not use this function in interrupt context.
+
+    Example::
+
+      soc_pm_init()
+      {
+          /* Do things */
+          r = dev_pm_opp_add(mpu_dev, 1000000, 900000);
+          if (r) {
+              pr_err("%s: unable to register mpu opp (%d)\n",
+                     __func__, r);
+              goto no_cpufreq;
+          }
+          /* Do cpufreq things */
+      no_cpufreq:
+          /* Do remaining things */
+      }
+
+3. OPP Search Functions
+=======================
+High-level frameworks such as cpufreq operate on frequencies. To map the
+frequency back to the corresponding OPP, the OPP library provides handy
+functions to search the OPP list that it internally manages. These search
+functions return the matching pointer representing the opp if a match is
+found, else return an error. These errors are expected to be handled by
+standard error checks such as IS_ERR() and appropriate actions taken by the
+caller.
+
+Callers of these functions shall call dev_pm_opp_put() after they have used the
+OPP. Otherwise the memory for the OPP will never get freed, resulting in a
+memory leak.
+
+dev_pm_opp_find_freq_exact
+    Search for an OPP based on an *exact* frequency and
+    availability. This function is especially useful to enable an OPP which
+    is not available by default.
+    Example: In a case when the SoC framework detects a situation where a
+    higher frequency could be made available, it can use this function to
+    find the OPP prior to calling dev_pm_opp_enable to actually make
+    it available::
+
+      opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+      /* don't operate on the pointer.. just do a sanity check.. */
+      if (IS_ERR(opp)) {
+          pr_err("frequency not disabled!\n");
+          /* trigger appropriate actions.. */
+      } else {
+          dev_pm_opp_enable(dev, 1000000000);
+          dev_pm_opp_put(opp);
+      }
+
+    NOTE:
+        This is the only search function that operates on OPPs which are
+        not available.
+
+dev_pm_opp_find_freq_floor
+    Search for an available OPP which is *at most* the
+    provided frequency. This function is useful while searching for a
+    lesser match OR operating on OPP information in the order of decreasing
+    frequency.
+    Example: To find the highest opp for a device::
+
+      freq = ULONG_MAX;
+      opp = dev_pm_opp_find_freq_floor(dev, &freq);
+      if (!IS_ERR(opp))
+          dev_pm_opp_put(opp);
+
+dev_pm_opp_find_freq_ceil
+    Search for an available OPP which is *at least* the
+    provided frequency. This function is useful while searching for a
+    higher match OR operating on OPP information in the order of increasing
+    frequency.
+    Example 1: To find the lowest opp for a device::
+
+      freq = 0;
+      opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+      if (!IS_ERR(opp))
+          dev_pm_opp_put(opp);
+
+    Example 2: A simplified implementation of a SoC cpufreq_driver->target::
+
+      soc_cpufreq_target(..)
+      {
+          /* Do stuff like policy checks etc. */
+          /* Find the best frequency match for the req */
+          opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+          if (!IS_ERR(opp)) {
+              dev_pm_opp_put(opp);
+              soc_switch_to_freq_voltage(freq);
+          } else {
+              /* do something when we can't satisfy the req */
+          }
+          /* do other stuff */
+      }
+
+4. OPP Availability Control Functions
+=====================================
+A default OPP list registered with the OPP library may not cater to all
+possible situations. The OPP library provides a set of functions to modify the
+availability of an OPP within the OPP list. This allows SoC frameworks to have
+fine-grained dynamic control of which sets of OPPs are operationally available.
+These functions are intended to *temporarily* remove an OPP in conditions such
+as thermal considerations (e.g. don't use OPPx until the temperature drops).
+
+WARNING:
+    Do not use these functions in interrupt context.
+
+dev_pm_opp_enable
+    Make an OPP available for operation.
+    Example: Let's say that the 1GHz OPP is to be made available only if the
+    SoC temperature is lower than a certain threshold. The SoC framework
+    implementation might choose to do something as follows::
+
+      if (cur_temp < temp_low_thresh) {
+          /* Enable 1GHz if it was disabled */
+          opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false);
+          /* just error check */
+          if (!IS_ERR(opp)) {
+              ret = dev_pm_opp_enable(dev, 1000000000);
+              dev_pm_opp_put(opp);
+          } else {
+              goto try_something_else;
+          }
+      }
+
+dev_pm_opp_disable
+    Make an OPP unavailable for operation.
+    Example: Let's say that the 1GHz OPP is to be disabled if the
+    temperature exceeds a threshold value. The SoC framework implementation
+    might choose to do something as follows::
+
+      if (cur_temp > temp_high_thresh) {
+          /* Disable 1GHz if it was enabled */
+          opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true);
+          /* just error check */
+          if (!IS_ERR(opp)) {
+              ret = dev_pm_opp_disable(dev, 1000000000);
+              dev_pm_opp_put(opp);
+          } else {
+              goto try_something_else;
+          }
+      }
+
+5. OPP Data Retrieval Functions
+===============================
+Since the OPP library abstracts away the OPP information, a set of functions to
+pull information from the OPP structure is necessary. Once an OPP pointer is
+retrieved using the search functions, the following functions can be used by
+the SoC framework to retrieve the information represented inside the OPP layer.
+
+dev_pm_opp_get_voltage
+    Retrieve the voltage represented by the opp pointer.
+    Example: At a cpufreq transition to a different frequency, the SoC
+    framework needs to set the voltage represented by the OPP, using
+    the regulator framework, on the Power Management chip providing the
+    voltage::
+
+      soc_switch_to_freq_voltage(freq)
+      {
+          /* do things */
+          opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+          if (IS_ERR(opp))
+              return;
+          v = dev_pm_opp_get_voltage(opp);
+          dev_pm_opp_put(opp);
+          if (v)
+              regulator_set_voltage(.., v);
+          /* do other things */
+      }
+
+dev_pm_opp_get_freq
+    Retrieve the freq represented by the opp pointer.
+    Example: Let's say the SoC framework uses a couple of helper functions
+    through which we could pass opp pointers instead of additional
+    parameters to handle quite a bit of data::
+
+      soc_cpufreq_target(..)
+      {
+          /* do things.. */
+          max_freq = ULONG_MAX;
+          max_opp = dev_pm_opp_find_freq_floor(dev, &max_freq);
+          requested_opp = dev_pm_opp_find_freq_ceil(dev, &freq);
+          if (!IS_ERR(max_opp) && !IS_ERR(requested_opp))
+              r = soc_test_validity(max_opp, requested_opp);
+          if (!IS_ERR(max_opp))
+              dev_pm_opp_put(max_opp);
+          if (!IS_ERR(requested_opp))
+              dev_pm_opp_put(requested_opp);
+          /* do other things */
+      }
+      soc_test_validity(..)
+      {
+          if (dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp))
+              return -EINVAL;
+          if (dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp))
+              return -EINVAL;
+          /* do things.. */
+      }
+
+dev_pm_opp_get_opp_count
+    Retrieve the number of available opps for a device.
+    Example: Let's say a co-processor in the SoC needs to know the available
+    frequencies in a table; the main processor can notify it as follows::
+
+      soc_notify_coproc_available_frequencies()
+      {
+          /* Do things */
+          num_available = dev_pm_opp_get_opp_count(dev);
+          speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL);
+          /* populate the table in increasing order */
+          freq = 0;
+          i = 0;
+          while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) {
+              speeds[i] = freq;
+              freq++;
+              i++;
+              dev_pm_opp_put(opp);
+          }
+
+          soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available);
+          /* Do other things */
+      }
+
+6. Data Structures
+==================
+Typically an SoC contains multiple voltage domains which are variable. Each
+domain is represented by a device pointer. The relationship to OPP can be
+represented as follows::
+
+ SoC
+  |- device 1
+  |     |- opp 1 (availability, freq, voltage)
+  |     |- opp 2 ..
+  ...   ...
+  |     `- opp n ..
+  |- device 2
+  ...
+  `- device m
+
+The OPP library maintains an internal list that the SoC framework populates and
+that is accessed by the various functions described above. However, the
+structures representing the actual OPPs and domains are internal to the OPP
+library itself, to allow for suitable abstraction reusable across systems.
+
+struct dev_pm_opp
+    The internal data structure of the OPP library which is used to
+    represent an OPP. In addition to the freq, voltage, and availability
+    information, it also contains internal bookkeeping information required
+    for the OPP library to operate on. A pointer to this structure is
+    provided back to users such as the SoC framework to be used as an
+    identifier for the OPP in interactions with the OPP layer.
+
+    WARNING:
+        The struct dev_pm_opp pointer should not be parsed or modified
+        by the users. The defaults for an instance are populated by
+        dev_pm_opp_add, but the availability of the OPP can be modified
+        by the dev_pm_opp_enable/disable functions.
+
+struct device
+    This is used to identify a domain to the OPP layer. The
+    nature of the device and its implementation is left to the user of
+    the OPP library, such as the SoC framework.
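Tying sections 2 through 5 together, a hedged end-to-end sketch follows; mpu_opp_demo(), mpu_dev and the example frequencies are made up, while the dev_pm_opp_* calls are the functions documented above::

    #include <linux/device.h>
    #include <linux/err.h>
    #include <linux/pm_opp.h>

    /* Hypothetical setup: register two OPPs for mpu_dev, then look up
     * the best OPP for a requested frequency and read its voltage. */
    static int mpu_opp_demo(struct device *mpu_dev)
    {
            struct dev_pm_opp *opp;
            unsigned long freq = 900000000; /* requested: 900 MHz */
            unsigned long volt;
            int ret;

            ret = dev_pm_opp_add(mpu_dev, 800000000, 1200000);
            if (ret)
                    return ret;
            ret = dev_pm_opp_add(mpu_dev, 1000000000, 1300000);
            if (ret)
                    return ret;

            /* Round the request up to the nearest available OPP (1 GHz). */
            opp = dev_pm_opp_find_freq_ceil(mpu_dev, &freq);
            if (IS_ERR(opp))
                    return PTR_ERR(opp);

            volt = dev_pm_opp_get_voltage(opp); /* 1300000 uV */
            dev_pm_opp_put(opp);                /* drop the reference */

            dev_info(mpu_dev, "chose %lu Hz at %lu uV\n", freq, volt);
            return 0;
    }

The resulting freq/volt pair is what platform code would program into the clock and regulator frameworks, as in the soc_switch_to_freq_voltage() example above.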
+ +Overall, in a simplistic view, the data structure operations is represented as +following:: + + Initialization / modification: + +-----+ /- dev_pm_opp_enable + dev_pm_opp_add --> | opp | <------- + | +-----+ \- dev_pm_opp_disable + \-------> domain_info(device) + + Search functions: + /-- dev_pm_opp_find_freq_ceil ---\ +-----+ + domain_info<---- dev_pm_opp_find_freq_exact -----> | opp | + \-- dev_pm_opp_find_freq_floor ---/ +-----+ + + Retrieval functions: + +-----+ /- dev_pm_opp_get_voltage + | opp | <--- + +-----+ \- dev_pm_opp_get_freq + + domain_info <- dev_pm_opp_get_opp_count diff --git a/Documentation/power/opp.txt b/Documentation/power/opp.txt deleted file mode 100644 index 0c007e250cd1..000000000000 --- a/Documentation/power/opp.txt +++ /dev/null @@ -1,342 +0,0 @@ -Operating Performance Points (OPP) Library -========================================== - -(C) 2009-2010 Nishanth Menon , Texas Instruments Incorporated - -Contents --------- -1. Introduction -2. Initial OPP List Registration -3. OPP Search Functions -4. OPP Availability Control Functions -5. OPP Data Retrieval Functions -6. Data Structures - -1. Introduction -=============== -1.1 What is an Operating Performance Point (OPP)? - -Complex SoCs of today consists of a multiple sub-modules working in conjunction. -In an operational system executing varied use cases, not all modules in the SoC -need to function at their highest performing frequency all the time. To -facilitate this, sub-modules in a SoC are grouped into domains, allowing some -domains to run at lower voltage and frequency while other domains run at -voltage/frequency pairs that are higher. - -The set of discrete tuples consisting of frequency and voltage pairs that -the device will support per domain are called Operating Performance Points or -OPPs. - -As an example: -Let us consider an MPU device which supports the following: -{300MHz at minimum voltage of 1V}, {800MHz at minimum voltage of 1.2V}, -{1GHz at minimum voltage of 1.3V} - -We can represent these as three OPPs as the following {Hz, uV} tuples: -{300000000, 1000000} -{800000000, 1200000} -{1000000000, 1300000} - -1.2 Operating Performance Points Library - -OPP library provides a set of helper functions to organize and query the OPP -information. The library is located in drivers/base/power/opp.c and the header -is located in include/linux/pm_opp.h. OPP library can be enabled by enabling -CONFIG_PM_OPP from power management menuconfig menu. OPP library depends on -CONFIG_PM as certain SoCs such as Texas Instrument's OMAP framework allows to -optionally boot at a certain OPP without needing cpufreq. - -Typical usage of the OPP library is as follows: -(users) -> registers a set of default OPPs -> (library) -SoC framework -> modifies on required cases certain OPPs -> OPP layer - -> queries to search/retrieve information -> - -OPP layer expects each domain to be represented by a unique device pointer. SoC -framework registers a set of initial OPPs per device with the OPP layer. This -list is expected to be an optimally small number typically around 5 per device. -This initial list contains a set of OPPs that the framework expects to be safely -enabled by default in the system. - -Note on OPP Availability: ------------------------- -As the system proceeds to operate, SoC framework may choose to make certain -OPPs available or not available on each device based on various external -factors. 
Example usage: Thermal management or other exceptional situations where -SoC framework might choose to disable a higher frequency OPP to safely continue -operations until that OPP could be re-enabled if possible. - -OPP library facilitates this concept in it's implementation. The following -operational functions operate only on available opps: -opp_find_freq_{ceil, floor}, dev_pm_opp_get_voltage, dev_pm_opp_get_freq, dev_pm_opp_get_opp_count - -dev_pm_opp_find_freq_exact is meant to be used to find the opp pointer which can then -be used for dev_pm_opp_enable/disable functions to make an opp available as required. - -WARNING: Users of OPP library should refresh their availability count using -get_opp_count if dev_pm_opp_enable/disable functions are invoked for a device, the -exact mechanism to trigger these or the notification mechanism to other -dependent subsystems such as cpufreq are left to the discretion of the SoC -specific framework which uses the OPP library. Similar care needs to be taken -care to refresh the cpufreq table in cases of these operations. - -2. Initial OPP List Registration -================================ -The SoC implementation calls dev_pm_opp_add function iteratively to add OPPs per -device. It is expected that the SoC framework will register the OPP entries -optimally- typical numbers range to be less than 5. The list generated by -registering the OPPs is maintained by OPP library throughout the device -operation. The SoC framework can subsequently control the availability of the -OPPs dynamically using the dev_pm_opp_enable / disable functions. - -dev_pm_opp_add - Add a new OPP for a specific domain represented by the device pointer. - The OPP is defined using the frequency and voltage. Once added, the OPP - is assumed to be available and control of it's availability can be done - with the dev_pm_opp_enable/disable functions. OPP library internally stores - and manages this information in the opp struct. This function may be - used by SoC framework to define a optimal list as per the demands of - SoC usage environment. - - WARNING: Do not use this function in interrupt context. - - Example: - soc_pm_init() - { - /* Do things */ - r = dev_pm_opp_add(mpu_dev, 1000000, 900000); - if (!r) { - pr_err("%s: unable to register mpu opp(%d)\n", r); - goto no_cpufreq; - } - /* Do cpufreq things */ - no_cpufreq: - /* Do remaining things */ - } - -3. OPP Search Functions -======================= -High level framework such as cpufreq operates on frequencies. To map the -frequency back to the corresponding OPP, OPP library provides handy functions -to search the OPP list that OPP library internally manages. These search -functions return the matching pointer representing the opp if a match is -found, else returns error. These errors are expected to be handled by standard -error checks such as IS_ERR() and appropriate actions taken by the caller. - -Callers of these functions shall call dev_pm_opp_put() after they have used the -OPP. Otherwise the memory for the OPP will never get freed and result in -memleak. - -dev_pm_opp_find_freq_exact - Search for an OPP based on an *exact* frequency and - availability. This function is especially useful to enable an OPP which - is not available by default. - Example: In a case when SoC framework detects a situation where a - higher frequency could be made available, it can use this function to - find the OPP prior to call the dev_pm_opp_enable to actually make it available. 
- opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false); - dev_pm_opp_put(opp); - /* dont operate on the pointer.. just do a sanity check.. */ - if (IS_ERR(opp)) { - pr_err("frequency not disabled!\n"); - /* trigger appropriate actions.. */ - } else { - dev_pm_opp_enable(dev,1000000000); - } - - NOTE: This is the only search function that operates on OPPs which are - not available. - -dev_pm_opp_find_freq_floor - Search for an available OPP which is *at most* the - provided frequency. This function is useful while searching for a lesser - match OR operating on OPP information in the order of decreasing - frequency. - Example: To find the highest opp for a device: - freq = ULONG_MAX; - opp = dev_pm_opp_find_freq_floor(dev, &freq); - dev_pm_opp_put(opp); - -dev_pm_opp_find_freq_ceil - Search for an available OPP which is *at least* the - provided frequency. This function is useful while searching for a - higher match OR operating on OPP information in the order of increasing - frequency. - Example 1: To find the lowest opp for a device: - freq = 0; - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - dev_pm_opp_put(opp); - Example 2: A simplified implementation of a SoC cpufreq_driver->target: - soc_cpufreq_target(..) - { - /* Do stuff like policy checks etc. */ - /* Find the best frequency match for the req */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - dev_pm_opp_put(opp); - if (!IS_ERR(opp)) - soc_switch_to_freq_voltage(freq); - else - /* do something when we can't satisfy the req */ - /* do other stuff */ - } - -4. OPP Availability Control Functions -===================================== -A default OPP list registered with the OPP library may not cater to all possible -situation. The OPP library provides a set of functions to modify the -availability of a OPP within the OPP list. This allows SoC frameworks to have -fine grained dynamic control of which sets of OPPs are operationally available. -These functions are intended to *temporarily* remove an OPP in conditions such -as thermal considerations (e.g. don't use OPPx until the temperature drops). - -WARNING: Do not use these functions in interrupt context. - -dev_pm_opp_enable - Make a OPP available for operation. - Example: Lets say that 1GHz OPP is to be made available only if the - SoC temperature is lower than a certain threshold. The SoC framework - implementation might choose to do something as follows: - if (cur_temp < temp_low_thresh) { - /* Enable 1GHz if it was disabled */ - opp = dev_pm_opp_find_freq_exact(dev, 1000000000, false); - dev_pm_opp_put(opp); - /* just error check */ - if (!IS_ERR(opp)) - ret = dev_pm_opp_enable(dev, 1000000000); - else - goto try_something_else; - } - -dev_pm_opp_disable - Make an OPP to be not available for operation - Example: Lets say that 1GHz OPP is to be disabled if the temperature - exceeds a threshold value. The SoC framework implementation might - choose to do something as follows: - if (cur_temp > temp_high_thresh) { - /* Disable 1GHz if it was enabled */ - opp = dev_pm_opp_find_freq_exact(dev, 1000000000, true); - dev_pm_opp_put(opp); - /* just error check */ - if (!IS_ERR(opp)) - ret = dev_pm_opp_disable(dev, 1000000000); - else - goto try_something_else; - } - -5. OPP Data Retrieval Functions -=============================== -Since OPP library abstracts away the OPP information, a set of functions to pull -information from the OPP structure is necessary. 
Once an OPP pointer is -retrieved using the search functions, the following functions can be used by SoC -framework to retrieve the information represented inside the OPP layer. - -dev_pm_opp_get_voltage - Retrieve the voltage represented by the opp pointer. - Example: At a cpufreq transition to a different frequency, SoC - framework requires to set the voltage represented by the OPP using - the regulator framework to the Power Management chip providing the - voltage. - soc_switch_to_freq_voltage(freq) - { - /* do things */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - v = dev_pm_opp_get_voltage(opp); - dev_pm_opp_put(opp); - if (v) - regulator_set_voltage(.., v); - /* do other things */ - } - -dev_pm_opp_get_freq - Retrieve the freq represented by the opp pointer. - Example: Lets say the SoC framework uses a couple of helper functions - we could pass opp pointers instead of doing additional parameters to - handle quiet a bit of data parameters. - soc_cpufreq_target(..) - { - /* do things.. */ - max_freq = ULONG_MAX; - max_opp = dev_pm_opp_find_freq_floor(dev,&max_freq); - requested_opp = dev_pm_opp_find_freq_ceil(dev,&freq); - if (!IS_ERR(max_opp) && !IS_ERR(requested_opp)) - r = soc_test_validity(max_opp, requested_opp); - dev_pm_opp_put(max_opp); - dev_pm_opp_put(requested_opp); - /* do other things */ - } - soc_test_validity(..) - { - if(dev_pm_opp_get_voltage(max_opp) < dev_pm_opp_get_voltage(requested_opp)) - return -EINVAL; - if(dev_pm_opp_get_freq(max_opp) < dev_pm_opp_get_freq(requested_opp)) - return -EINVAL; - /* do things.. */ - } - -dev_pm_opp_get_opp_count - Retrieve the number of available opps for a device - Example: Lets say a co-processor in the SoC needs to know the available - frequencies in a table, the main processor can notify as following: - soc_notify_coproc_available_frequencies() - { - /* Do things */ - num_available = dev_pm_opp_get_opp_count(dev); - speeds = kzalloc(sizeof(u32) * num_available, GFP_KERNEL); - /* populate the table in increasing order */ - freq = 0; - while (!IS_ERR(opp = dev_pm_opp_find_freq_ceil(dev, &freq))) { - speeds[i] = freq; - freq++; - i++; - dev_pm_opp_put(opp); - } - - soc_notify_coproc(AVAILABLE_FREQs, speeds, num_available); - /* Do other things */ - } - -6. Data Structures -================== -Typically an SoC contains multiple voltage domains which are variable. Each -domain is represented by a device pointer. The relationship to OPP can be -represented as follows: -SoC - |- device 1 - | |- opp 1 (availability, freq, voltage) - | |- opp 2 .. - ... ... - | `- opp n .. - |- device 2 - ... - `- device m - -OPP library maintains a internal list that the SoC framework populates and -accessed by various functions as described above. However, the structures -representing the actual OPPs and domains are internal to the OPP library itself -to allow for suitable abstraction reusable across systems. - -struct dev_pm_opp - The internal data structure of OPP library which is used to - represent an OPP. In addition to the freq, voltage, availability - information, it also contains internal book keeping information required - for the OPP library to operate on. Pointer to this structure is - provided back to the users such as SoC framework to be used as a - identifier for OPP in the interactions with OPP layer. - - WARNING: The struct dev_pm_opp pointer should not be parsed or modified by the - users. 
The defaults of for an instance is populated by dev_pm_opp_add, but the - availability of the OPP can be modified by dev_pm_opp_enable/disable functions. - -struct device - This is used to identify a domain to the OPP layer. The - nature of the device and it's implementation is left to the user of - OPP library such as the SoC framework. - -Overall, in a simplistic view, the data structure operations is represented as -following: - -Initialization / modification: - +-----+ /- dev_pm_opp_enable -dev_pm_opp_add --> | opp | <------- - | +-----+ \- dev_pm_opp_disable - \-------> domain_info(device) - -Search functions: - /-- dev_pm_opp_find_freq_ceil ---\ +-----+ -domain_info<---- dev_pm_opp_find_freq_exact -----> | opp | - \-- dev_pm_opp_find_freq_floor ---/ +-----+ - -Retrieval functions: -+-----+ /- dev_pm_opp_get_voltage -| opp | <--- -+-----+ \- dev_pm_opp_get_freq - -domain_info <- dev_pm_opp_get_opp_count diff --git a/Documentation/power/pci.rst b/Documentation/power/pci.rst new file mode 100644 index 000000000000..0e2ef7429304 --- /dev/null +++ b/Documentation/power/pci.rst @@ -0,0 +1,1135 @@ +==================== +PCI Power Management +==================== + +Copyright (c) 2010 Rafael J. Wysocki , Novell Inc. + +An overview of concepts and the Linux kernel's interfaces related to PCI power +management. Based on previous work by Patrick Mochel +(and others). + +This document only covers the aspects of power management specific to PCI +devices. For general description of the kernel's interfaces related to device +power management refer to Documentation/driver-api/pm/devices.rst and +Documentation/power/runtime_pm.rst. + +.. contents: + + 1. Hardware and Platform Support for PCI Power Management + 2. PCI Subsystem and Device Power Management + 3. PCI Device Drivers and Power Management + 4. Resources + + +1. Hardware and Platform Support for PCI Power Management +========================================================= + +1.1. Native and Platform-Based Power Management +----------------------------------------------- + +In general, power management is a feature allowing one to save energy by putting +devices into states in which they draw less power (low-power states) at the +price of reduced functionality or performance. + +Usually, a device is put into a low-power state when it is underutilized or +completely inactive. However, when it is necessary to use the device once +again, it has to be put back into the "fully functional" state (full-power +state). This may happen when there are some data for the device to handle or +as a result of an external event requiring the device to be active, which may +be signaled by the device itself. + +PCI devices may be put into low-power states in two ways, by using the device +capabilities introduced by the PCI Bus Power Management Interface Specification, +or with the help of platform firmware, such as an ACPI BIOS. In the first +approach, that is referred to as the native PCI power management (native PCI PM) +in what follows, the device power state is changed as a result of writing a +specific value into one of its standard configuration registers. The second +approach requires the platform firmware to provide special methods that may be +used by the kernel to change the device's power state. + +Devices supporting the native PCI PM usually can generate wakeup signals called +Power Management Events (PMEs) to let the kernel know about external events +requiring the device to be active. 
After receiving a PME the kernel is supposed +to put the device that sent it into the full-power state. However, the PCI Bus +Power Management Interface Specification doesn't define any standard method of +delivering the PME from the device to the CPU and the operating system kernel. +It is assumed that the platform firmware will perform this task and therefore, +even though a PCI device is set up to generate PMEs, it also may be necessary to +prepare the platform firmware for notifying the CPU of the PMEs coming from the +device (e.g. by generating interrupts). + +In turn, if the methods provided by the platform firmware are used for changing +the power state of a device, usually the platform also provides a method for +preparing the device to generate wakeup signals. In that case, however, it +often also is necessary to prepare the device for generating PMEs using the +native PCI PM mechanism, because the method provided by the platform depends on +that. + +Thus in many situations both the native and the platform-based power management +mechanisms have to be used simultaneously to obtain the desired result. + +1.2. Native PCI Power Management +-------------------------------- + +The PCI Bus Power Management Interface Specification (PCI PM Spec) was +introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a +standard interface for performing various operations related to power +management. + +The implementation of the PCI PM Spec is optional for conventional PCI devices, +but it is mandatory for PCI Express devices. If a device supports the PCI PM +Spec, it has an 8 byte power management capability field in its PCI +configuration space. This field is used to describe and control the standard +features related to the native PCI power management. + +The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses +(B0-B3). The higher the number, the less power is drawn by the device or bus +in that state. However, the higher the number, the longer the latency for +the device or bus to return to the full-power state (D0 or B0, respectively). + +There are two variants of the D3 state defined by the specification. The first +one is D3hot, referred to as the software accessible D3, because devices can be +programmed to go into it. The second one, D3cold, is the state that PCI devices +are in when the supply voltage (Vcc) is removed from them. It is not possible +to program a PCI device to go into D3cold, although there may be a programmable +interface for putting the bus the device is on into a state in which Vcc is +removed from all devices on the bus. + +PCI bus power management, however, is not supported by the Linux kernel at the +time of this writing and therefore it is not covered by this document. + +Note that every PCI device can be in the full-power state (D0) or in D3cold, +regardless of whether or not it implements the PCI PM Spec. In addition to +that, if the PCI PM Spec is implemented by the device, it must support D3hot +as well as D0. The support for the D1 and D2 power states is optional. + +PCI devices supporting the PCI PM Spec can be programmed to go to any of the +supported low-power states (except for D3cold). While in D1-D3hot the +standard configuration registers of the device must be accessible to software +(i.e. the device is required to respond to PCI configuration accesses), although +its I/O and memory spaces are then disabled. This allows the device to be +programmatically put into D0. 
Thus the kernel can switch the device back and +forth between D0 and the supported low-power states (except for D3cold) and the +possible power state transitions the device can undergo are the following: + ++----------------------------+ +| Current State | New State | ++----------------------------+ +| D0 | D1, D2, D3 | ++----------------------------+ +| D1 | D2, D3 | ++----------------------------+ +| D2 | D3 | ++----------------------------+ +| D1, D2, D3 | D0 | ++----------------------------+ + +The transition from D3cold to D0 occurs when the supply voltage is provided to +the device (i.e. power is restored). In that case the device returns to D0 with +a full power-on reset sequence and the power-on defaults are restored to the +device by hardware just as at initial power up. + +PCI devices supporting the PCI PM Spec can be programmed to generate PMEs +while in a low-power state (D1-D3), but they are not required to be capable +of generating PMEs from all supported low-power states. In particular, the +capability of generating PMEs from D3cold is optional and depends on the +presence of additional voltage (3.3Vaux) allowing the device to remain +sufficiently active to generate a wakeup signal. + +1.3. ACPI Device Power Management +--------------------------------- + +The platform firmware support for the power management of PCI devices is +system-specific. However, if the system in question is compliant with the +Advanced Configuration and Power Interface (ACPI) Specification, like the +majority of x86-based systems, it is supposed to implement device power +management interfaces defined by the ACPI standard. + +For this purpose the ACPI BIOS provides special functions called "control +methods" that may be executed by the kernel to perform specific tasks, such as +putting a device into a low-power state. These control methods are encoded +using special byte-code language called the ACPI Machine Language (AML) and +stored in the machine's BIOS. The kernel loads them from the BIOS and executes +them as needed using an AML interpreter that translates the AML byte code into +computations and memory or I/O space accesses. This way, in theory, a BIOS +writer can provide the kernel with a means to perform actions depending +on the system design in a system-specific fashion. + +ACPI control methods may be divided into global control methods, that are not +associated with any particular devices, and device control methods, that have +to be defined separately for each device supposed to be handled with the help of +the platform. This means, in particular, that ACPI device control methods can +only be used to handle devices that the BIOS writer knew about in advance. The +ACPI methods used for device power management fall into that category. + +The ACPI specification assumes that devices can be in one of four power states +labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM +D0-D3 states (although the difference between D3hot and D3cold is not taken +into account by ACPI). Moreover, for each power state of a device there is a +set of power resources that have to be enabled for the device to be put into +that state. These power resources are controlled (i.e. enabled or disabled) +with the help of their own control methods, _ON and _OFF, that have to be +defined individually for each of them. 
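Drivers normally do not drive this machinery (the native PCI PM registers or the ACPI methods) themselves; they ask the PCI core for a target power state and let it choose the mechanism. A minimal hedged sketch, with demo_d_state_cycle() being an invented helper while pci_set_power_state(), pci_save_state() and pci_restore_state() are the standard PCI core helpers::

    #include <linux/pci.h>

    /* Sketch: request a low-power state for a device we own, then bring
     * it back to full power. The PCI core picks the native PCI PM and/or
     * platform firmware mechanics behind these calls. */
    static void demo_d_state_cycle(struct pci_dev *pdev)
    {
            pci_save_state(pdev);                   /* save standard config registers */
            pci_set_power_state(pdev, PCI_D3hot);   /* D0 -> D3hot */

            /* ... device is idle ... */

            pci_set_power_state(pdev, PCI_D0);      /* back to full power */
            pci_restore_state(pdev);                /* restore config registers */
    }

Error handling is omitted for brevity.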
+ +To put a device into the ACPI power state Dx (where x is a number between 0 and +3 inclusive) the kernel is supposed to (1) enable the power resources required +by the device in this state using their _ON control methods and (2) execute the +_PSx control method defined for the device. In addition to that, if the device +is going to be put into a low-power state (D1-D3) and is supposed to generate +wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI +3.0) control method defined for it has to be executed before _PSx. Power +resources that are not required by the device in the target power state and are +not required any more by any other device should be disabled (by executing their +_OFF control methods). If the current power state of the device is D3, it can +only be put into D0 this way. + +However, quite often the power states of devices are changed during a +system-wide transition into a sleep state or back into the working state. ACPI +defines four system sleep states, S1, S2, S3, and S4, and denotes the system +working state as S0. In general, the target system sleep (or working) state +determines the highest power (lowest number) state the device can be put +into and the kernel is supposed to obtain this information by executing the +device's _SxD control method (where x is a number between 0 and 4 inclusive). +If the device is required to wake up the system from the target sleep state, the +lowest power (highest number) state it can be put into is also determined by the +target state of the system. The kernel is then supposed to use the device's +_SxW control method to obtain the number of that state. It also is supposed to +use the device's _PRW control method to learn which power resources need to be +enabled for the device to be able to generate wakeup signals. + +1.4. Wakeup Signaling +--------------------- + +Wakeup signals generated by PCI devices, either as native PCI PMEs, or as +a result of the execution of the _DSW (or _PSW) ACPI control method before +putting the device into a low-power state, have to be caught and handled as +appropriate. If they are sent while the system is in the working state +(ACPI S0), they should be translated into interrupts so that the kernel can +put the devices generating them into the full-power state and take care of the +events that triggered them. In turn, if they are sent while the system is +sleeping, they should cause the system's core logic to trigger wakeup. + +On ACPI-based systems wakeup signals sent by conventional PCI devices are +converted into ACPI General-Purpose Events (GPEs) which are hardware signals +from the system core logic generated in response to various events that need to +be acted upon. Every GPE is associated with one or more sources of potentially +interesting events. In particular, a GPE may be associated with a PCI device +capable of signaling wakeup. The information on the connections between GPEs +and event sources is recorded in the system's ACPI BIOS from where it can be +read by the kernel. + +If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE +associated with it (if there is one) is triggered. The GPEs associated with PCI +bridges may also be triggered in response to a wakeup signal from one of the +devices below the bridge (this also is the case for root bridges) and, for +example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be +handled this way. + +A GPE may be triggered when the system is sleeping (i.e. 
when it is in one of +the ACPI S1-S4 states), in which case system wakeup is started by its core logic +(the device that was the source of the signal causing the system wakeup to occur +may be identified later). The GPEs used in such situations are referred to as +wakeup GPEs. + +Usually, however, GPEs are also triggered when the system is in the working +state (ACPI S0) and in that case the system's core logic generates a System +Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI +handler identifies the GPE that caused the interrupt to be generated which, +in turn, allows the kernel to identify the source of the event (that may be +a PCI device signaling wakeup). The GPEs used for notifying the kernel of +events occurring while the system is in the working state are referred to as +runtime GPEs. + +Unfortunately, there is no standard way of handling wakeup signals sent by +conventional PCI devices on systems that are not ACPI-based, but there is one +for PCI Express devices. Namely, the PCI Express Base Specification introduced +a native mechanism for converting native PCI PMEs into interrupts generated by +root ports. For conventional PCI devices native PMEs are out-of-band, so they +are routed separately and they need not pass through bridges (in principle they +may be routed directly to the system's core logic), but for PCI Express devices +they are in-band messages that have to pass through the PCI Express hierarchy, +including the root port on the path from the device to the Root Complex. Thus +it was possible to introduce a mechanism by which a root port generates an +interrupt whenever it receives a PME message from one of the devices below it. +The PCI Express Requester ID of the device that sent the PME message is then +recorded in one of the root port's configuration registers from where it may be +read by the interrupt handler allowing the device to be identified. [PME +messages sent by PCI Express endpoints integrated with the Root Complex don't +pass through root ports, but instead they cause a Root Complex Event Collector +(if there is one) to generate interrupts.] + +In principle the native PCI Express PME signaling may also be used on ACPI-based +systems along with the GPEs, but to use it the kernel has to ask the system's +ACPI BIOS to release control of root port configuration registers. The ACPI +BIOS, however, is not required to allow the kernel to control these registers +and if it doesn't do that, the kernel must not modify their contents. Of course +the native PCI Express PME signaling cannot be used by the kernel in that case. + + +2. PCI Subsystem and Device Power Management +============================================ + +2.1. Device Power Management Callbacks +-------------------------------------- + +The PCI Subsystem participates in the power management of PCI devices in a +number of ways. First of all, it provides an intermediate code layer between +the device power management core (PM core) and PCI device drivers. 
+Specifically, the pm field of the PCI subsystem's struct bus_type object, +pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing +pointers to several device power management callbacks:: + + const struct dev_pm_ops pci_dev_pm_ops = { + .prepare = pci_pm_prepare, + .complete = pci_pm_complete, + .suspend = pci_pm_suspend, + .resume = pci_pm_resume, + .freeze = pci_pm_freeze, + .thaw = pci_pm_thaw, + .poweroff = pci_pm_poweroff, + .restore = pci_pm_restore, + .suspend_noirq = pci_pm_suspend_noirq, + .resume_noirq = pci_pm_resume_noirq, + .freeze_noirq = pci_pm_freeze_noirq, + .thaw_noirq = pci_pm_thaw_noirq, + .poweroff_noirq = pci_pm_poweroff_noirq, + .restore_noirq = pci_pm_restore_noirq, + .runtime_suspend = pci_pm_runtime_suspend, + .runtime_resume = pci_pm_runtime_resume, + .runtime_idle = pci_pm_runtime_idle, + }; + +These callbacks are executed by the PM core in various situations related to +device power management and they, in turn, execute power management callbacks +provided by PCI device drivers. They also perform power management operations +involving some standard configuration registers of PCI devices that device +drivers need not know or care about. + +The structure representing a PCI device, struct pci_dev, contains several fields +that these callbacks operate on:: + + struct pci_dev { + ... + pci_power_t current_state; /* Current operating state. */ + int pm_cap; /* PM capability offset in the + configuration space */ + unsigned int pme_support:5; /* Bitmask of states from which PME# + can be generated */ + unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */ + unsigned int d1_support:1; /* Low power state D1 is supported */ + unsigned int d2_support:1; /* Low power state D2 is supported */ + unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ + unsigned int wakeup_prepared:1; /* Device prepared for wake up */ + unsigned int d3_delay; /* D3->D0 transition time in ms */ + ... + }; + +They also indirectly use some fields of the struct device that is embedded in +struct pci_dev. + +2.2. Device Initialization +-------------------------- + +The PCI subsystem's first task related to device power management is to +prepare the device for power management and initialize the fields of struct +pci_dev used for this purpose. This happens in two functions defined in +drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). + +The first of these functions checks if the device supports native PCI PM +and if that's the case the offset of its power management capability structure +in the configuration space is stored in the pm_cap field of the device's struct +pci_dev object. Next, the function checks which PCI low-power states are +supported by the device and from which low-power states the device can generate +native PCI PMEs. The power management fields of the device's struct pci_dev and +the struct device embedded in it are updated accordingly and the generation of +PMEs by the device is disabled. + +The second function checks if the device can be prepared to signal wakeup with +the help of the platform firmware, such as the ACPI BIOS. If that is the case, +the function updates the wakeup fields in struct device embedded in the +device's struct pci_dev and uses the firmware-provided method to prevent the +device from signaling wakeup. + +At this point the device is ready for power management. 
For driverless devices, +however, this functionality is limited to a few basic operations carried out +during system-wide transitions to a sleep state and back to the working state. + +2.3. Runtime Device Power Management +------------------------------------ + +The PCI subsystem plays a vital role in the runtime power management of PCI +devices. For this purpose it uses the general runtime power management +(runtime PM) framework described in Documentation/power/runtime_pm.rst. +Namely, it provides subsystem-level callbacks:: + + pci_pm_runtime_suspend() + pci_pm_runtime_resume() + pci_pm_runtime_idle() + +that are executed by the core runtime PM routines. It also implements the +entire mechanics necessary for handling runtime wakeup signals from PCI devices +in low-power states, which at the time of this writing works for both the native +PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in +Section 1. + +First, a PCI device is put into a low-power state, or suspended, with the help +of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call +pci_pm_runtime_suspend() to do the actual job. For this to work, the device's +driver has to provide a pm->runtime_suspend() callback (see below), which is +run by pci_pm_runtime_suspend() as the first action. If the driver's callback +returns successfully, the device's standard configuration registers are saved, +the device is prepared to generate wakeup signals and, finally, it is put into +the target low-power state. + +The low-power state to put the device into is the lowest-power (highest number) +state from which it can signal wakeup. The exact method of signaling wakeup is +system-dependent and is determined by the PCI subsystem on the basis of the +reported capabilities of the device and the platform firmware. To prepare the +device for signaling wakeup and put it into the selected low-power state, the +PCI subsystem can use the platform firmware as well as the device's native PCI +PM capabilities, if supported. + +It is expected that the device driver's pm->runtime_suspend() callback will +not attempt to prepare the device for signaling wakeup or to put it into a +low-power state. The driver ought to leave these tasks to the PCI subsystem +that has all of the information necessary to perform them. + +A suspended device is brought back into the "active" state, or resumed, +with the help of pm_request_resume() or pm_runtime_resume() which both call +pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's +driver provides a pm->runtime_resume() callback (see below). However, before +the driver's callback is executed, pci_pm_runtime_resume() brings the device +back into the full-power state, prevents it from signaling wakeup while in that +state and restores its standard configuration registers. Thus the driver's +callback need not worry about the PCI-specific aspects of the device resume. + +Note that generally pci_pm_runtime_resume() may be called in two different +situations. First, it may be called at the request of the device's driver, for +example if there are some data for it to process. Second, it may be called +as a result of a wakeup signal from the device itself (this sometimes is +referred to as "remote wakeup"). Of course, for this purpose the wakeup signal +is handled in one of the ways described in Section 1 and finally converted into +a notification for the PCI subsystem after the source device has been +identified. 
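+
+As an illustration only, the runtime PM callbacks of a hypothetical driver
+(all foo_* names and the struct foo_priv type are invented for this sketch)
+might therefore be as simple as::
+
+	/* Quiesce the device only; the PCI subsystem arms wakeup and
+	 * selects the low-power state afterwards. */
+	static int foo_runtime_suspend(struct device *dev)
+	{
+		struct foo_priv *priv = pci_get_drvdata(to_pci_dev(dev));
+
+		foo_stop_dma(priv);	/* driver-specific quiescing (made up) */
+		return 0;
+	}
+
+	/* Reprogram the device only; it is already back in D0 with its
+	 * standard configuration registers restored when this runs. */
+	static int foo_runtime_resume(struct device *dev)
+	{
+		struct foo_priv *priv = pci_get_drvdata(to_pci_dev(dev));
+
+		foo_start_dma(priv);	/* driver-specific restart (made up) */
+		return 0;
+	}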
+
+The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
+and pm_request_idle(), executes the device driver's pm->runtime_idle()
+callback, if defined, and if that callback doesn't return an error code (or is
+not present at all), suspends the device with the help of pm_runtime_suspend().
+Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
+example, it is called right after the device has just been resumed), in which
+case it is expected to suspend the device if that makes sense.  Usually,
+however, the PCI subsystem doesn't know whether the device really can be
+suspended, so it lets the device's driver decide by running its
+pm->runtime_idle() callback.
+
+2.4. System-Wide Power Transitions
+----------------------------------
+
+There are a few different types of system-wide power transitions, described in
+Documentation/driver-api/pm/devices.rst.  Each of them requires devices to be
+handled in a specific way and the PM core executes subsystem-level power
+management callbacks for this purpose.  They are executed in phases such that
+each phase involves executing the same subsystem-level callback for every
+device belonging to the given subsystem before the next phase begins.  These
+phases always run after tasks have been frozen.
+
+2.4.1. System Suspend
+^^^^^^^^^^^^^^^^^^^^^
+
+When the system is going into a sleep state in which the contents of memory
+will be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
+
+	prepare, suspend, suspend_noirq.
+
+The following PCI bus type's callbacks, respectively, are used in these
+phases::
+
+	pci_pm_prepare()
+	pci_pm_suspend()
+	pci_pm_suspend_noirq()
+
+The pci_pm_prepare() routine first puts the device into the "fully functional"
+state with the help of pm_runtime_resume().  Then, it executes the device
+driver's pm->prepare() callback if defined (i.e. if the driver's struct
+dev_pm_ops object is present and the prepare pointer in that object is valid).
+
+The pci_pm_suspend() routine first checks if the device's driver implements
+legacy PCI suspend routines (see Section 3), in which case the driver's legacy
+suspend callback is executed, if present, and its result is returned.  Next,
+if the device's driver doesn't provide a struct dev_pm_ops object (containing
+pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
+simply turns off the device's bus master capability and runs
+pcibios_disable_device() to disable it, unless the device is a bridge (PCI
+bridges are ignored by this routine).  Next, the device driver's pm->suspend()
+callback is executed, if defined, and its result is returned if it fails.
+Finally, pci_fixup_device() is called to apply hardware suspend quirks related
+to the device if necessary.
+
+Note that the suspend phase is carried out asynchronously for PCI devices, so
+the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
+devices that don't depend on each other in a known way (i.e. none of the paths
+in the device tree from the root bridge to a leaf device contains both of
+them).
+
+The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
+been called, which means that the device driver's interrupt handler won't be
+invoked while this routine is running.  It first checks if the device's driver
+implements legacy PCI suspend routines (see Section 3), in which case the
+legacy late suspend routine is called and its result is returned (the standard
+configuration registers of the device are saved if the driver's callback
+hasn't done that).  Second, if the device driver's struct dev_pm_ops object is
+not present, the device's standard configuration registers are saved and the
+routine returns success.  Otherwise the device driver's pm->suspend_noirq()
+callback is executed, if present, and its result is returned if it fails.
+Next, if the device's standard configuration registers haven't been saved yet
+(one of the device driver's callbacks executed before might have done that),
+pci_pm_suspend_noirq() saves them, prepares the device to signal wakeup (if
+necessary) and puts it into a low-power state.
+
+The low-power state to put the device into is the lowest-power (highest number)
+state from which it can signal wakeup while the system is in the target sleep
+state.  Just like in the runtime PM case described above, the mechanism of
+signaling wakeup is system-dependent and determined by the PCI subsystem, which
+is also responsible for preparing the device to signal wakeup from the system's
+target sleep state as appropriate.
+
+PCI device drivers (that don't implement legacy power management callbacks) are
+generally not expected to prepare devices for signaling wakeup or to put them
+into low-power states.  However, if one of the driver's suspend callbacks
+(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
+registers, pci_pm_suspend_noirq() will assume that the device has been prepared
+to signal wakeup and put into a low-power state by the driver (the driver is
+then assumed to have used the helper functions provided by the PCI subsystem for
+this purpose).  PCI device drivers are not encouraged to do that, but in some
+rare cases doing that in the driver may be the optimum approach.
+
+2.4.2. System Resume
+^^^^^^^^^^^^^^^^^^^^
+
+When the system is undergoing a transition from a sleep state in which the
+contents of memory have been preserved, such as one of the ACPI sleep states
+S1-S3, into the working state (ACPI S0), the phases are:
+
+	resume_noirq, resume, complete.
+
+The following PCI bus type's callbacks, respectively, are executed in these
+phases::
+
+	pci_pm_resume_noirq()
+	pci_pm_resume()
+	pci_pm_complete()
+
+The pci_pm_resume_noirq() routine first puts the device into the full-power
+state, restores its standard configuration registers and applies early resume
+hardware quirks related to the device, if necessary.  This is done
+unconditionally, regardless of whether or not the device's driver implements
+legacy PCI power management callbacks (this way all PCI devices are in the
+full-power state and their standard configuration registers have been restored
+when their interrupt handlers are invoked for the first time during resume,
+which allows the kernel to avoid problems with the handling of shared
+interrupts by drivers whose devices are still suspended).  If legacy PCI power
+management callbacks (see Section 3) are implemented by the device's driver,
+the legacy early resume callback is executed and its result is returned.
+Otherwise, the device driver's pm->resume_noirq() callback is executed, if
+defined, and its result is returned.
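+
+For example, because of the ordering just described, a driver's
+pm->resume_noirq() callback (rarely needed in practice, see Section 3.1.8) may
+safely touch the device's registers; a hypothetical sketch (the foo_* names
+and registers are made up)::
+
+	static int foo_resume_noirq(struct device *dev)
+	{
+		struct foo_priv *priv = pci_get_drvdata(to_pci_dev(dev));
+
+		/* The device is in D0 and its configuration space has been
+		 * restored by the PCI core at this point, so MMIO accesses
+		 * are safe even though interrupts are still disabled. */
+		writel(FOO_INT_DISABLE, priv->mmio + FOO_REG_INT_CTL);
+		return 0;
+	}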
+
+The pci_pm_resume() routine first checks if the device's standard configuration
+registers have been restored and restores them if that's not the case (this
+only is necessary in the error path during a failing suspend).  Next, resume
+hardware quirks related to the device are applied, if necessary, and if the
+device's driver implements legacy PCI power management callbacks (see
+Section 3), the driver's legacy resume callback is executed and its result is
+returned.  Otherwise, the device's wakeup signaling mechanisms are blocked and
+its driver's pm->resume() callback is executed, if defined (the callback's
+result is then returned).
+
+The resume phase is carried out asynchronously for PCI devices, like the
+suspend phase described above, which means that if two PCI devices don't depend
+on each other in a known way, the pci_pm_resume() routine may be executed for
+both of them in parallel.
+
+The pci_pm_complete() routine only executes the device driver's pm->complete()
+callback, if defined.
+
+2.4.3. System Hibernation
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+System hibernation is more complicated than system suspend, because it requires
+a system image to be created and written into a persistent storage medium.  The
+image is created atomically and all devices are quiesced, or frozen, before
+that happens.
+
+The freezing of devices is carried out after enough memory has been freed (at
+the time of this writing the image creation requires at least 50% of system RAM
+to be free) in the following three phases:
+
+	prepare, freeze, freeze_noirq
+
+that correspond to the PCI bus type's callbacks::
+
+	pci_pm_prepare()
+	pci_pm_freeze()
+	pci_pm_freeze_noirq()
+
+This means that the prepare phase is exactly the same as for system suspend.
+The other two phases, however, are different.
+
+The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
+the device driver's pm->freeze() callback, if defined, instead of
+pm->suspend(), and it doesn't apply the suspend-related hardware quirks.  It
+is executed asynchronously for different PCI devices that don't depend on each
+other in a known way.
+
+The pci_pm_freeze_noirq() routine, in turn, is similar to
+pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
+routine instead of pm->suspend_noirq().  It also doesn't attempt to prepare
+the device for signaling wakeup or to put it into a low-power state.  Still,
+it saves the device's standard configuration registers if they haven't been
+saved by one of the driver's callbacks.
+
+Once the image has been created, it has to be saved.  However, at this point
+all devices are frozen and they cannot handle I/O, while their ability to
+handle I/O is obviously necessary for the image saving.  Thus they have to be
+brought back to the fully functional state and this is done in the following
+phases:
+
+	thaw_noirq, thaw, complete
+
+using the following PCI bus type's callbacks::
+
+	pci_pm_thaw_noirq()
+	pci_pm_thaw()
+	pci_pm_complete()
+
+respectively.
+
+The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
+but it doesn't put the device into the full power state and doesn't attempt to
+restore its standard configuration registers.  It also executes the device
+driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
+
+The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
+driver's pm->thaw() callback instead of pm->resume().  It is executed
+asynchronously for different PCI devices that don't depend on each other in a
+known way.
+
+The complete phase is the same as for system resume.
+
+After saving the image, devices need to be powered down before the system can
+enter the target sleep state (ACPI S4 for ACPI-based systems).  This is done in
+three phases:
+
+	prepare, poweroff, poweroff_noirq
+
+where the prepare phase is exactly the same as for system suspend.  The other
+two phases are analogous to the suspend and suspend_noirq phases, respectively.
+The PCI subsystem-level callbacks they correspond to::
+
+	pci_pm_poweroff()
+	pci_pm_poweroff_noirq()
+
+work in analogy with pci_pm_suspend() and pci_pm_suspend_noirq(),
+respectively, although they don't attempt to save the device's standard
+configuration registers.
+
+2.4.4. System Restore
+^^^^^^^^^^^^^^^^^^^^^
+
+System restore requires a hibernation image to be loaded into memory and the
+pre-hibernation memory contents to be restored before the pre-hibernation
+system activity can be resumed.
+
+As described in Documentation/driver-api/pm/devices.rst, the hibernation image
+is loaded into memory by a fresh instance of the kernel, called the boot
+kernel, which in turn is loaded and run by a boot loader in the usual way.
+After the boot kernel has loaded the image, it needs to replace its own code
+and data with the code and data of the "hibernated" kernel stored within the
+image, called the image kernel.  For this purpose all devices are frozen just
+like before creating the image during hibernation, in the
+
+	prepare, freeze, freeze_noirq
+
+phases described above.  However, the devices affected by these phases are only
+those having drivers in the boot kernel; other devices will still be in
+whatever state the boot loader left them.
+
+Should the restoration of the pre-hibernation memory contents fail, the boot
+kernel would go through the "thawing" procedure described above, using the
+thaw_noirq, thaw, and complete phases (that will only affect the devices having
+drivers in the boot kernel), and then continue running normally.
+
+If the pre-hibernation memory contents are restored successfully, which is the
+usual situation, control is passed to the image kernel, which then becomes
+responsible for bringing the system back to the working state.  To achieve
+this, it must restore the devices' pre-hibernation functionality, which is
+done much like waking up from the memory sleep state, although it involves
+different phases:
+
+	restore_noirq, restore, complete
+
+The first two of these are analogous to the resume_noirq and resume phases
+described above, respectively, and correspond to the following PCI subsystem
+callbacks::
+
+	pci_pm_restore_noirq()
+	pci_pm_restore()
+
+These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(),
+respectively, but they execute the device driver's pm->restore_noirq() and
+pm->restore() callbacks, if available.
+
+The complete phase is carried out in exactly the same way as during system
+resume.
+
+
+3. PCI Device Drivers and Power Management
+==========================================
+
+3.1. Power Management Callbacks
+-------------------------------
+
+PCI device drivers participate in power management by providing callbacks to
+be executed by the PCI subsystem's power management routines described above
+and by controlling the runtime power management of their devices.
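+
+For orientation, a sketch of how such callbacks are typically wired up,
+using the recommended dev_pm_ops-based approach described below (all foo_*
+names are invented for this example)::
+
+	static const struct dev_pm_ops foo_pm_ops = {
+		SET_SYSTEM_SLEEP_PM_OPS(foo_suspend, foo_resume)
+		SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
+	};
+
+	static struct pci_driver foo_pci_driver = {
+		.name		= "foo",
+		.id_table	= foo_pci_ids,
+		.probe		= foo_probe,
+		.remove		= foo_remove,
+		.driver.pm	= &foo_pm_ops,
+	};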
+
+At the time of this writing there are two ways to define power management
+callbacks for a PCI device driver: the recommended one, based on using a
+dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and
+the "legacy" one, in which the .suspend(), .suspend_late(), .resume_early(),
+and .resume() callbacks from struct pci_driver are used.  The legacy approach,
+however, doesn't allow one to define runtime power management callbacks and is
+not really suitable for any new drivers.  Therefore it is not covered by this
+document (refer to the source code to learn more about it).
+
+It is recommended that all PCI device drivers define a struct dev_pm_ops object
+containing pointers to power management (PM) callbacks that will be executed by
+the PCI subsystem's PM routines in various circumstances.  A pointer to the
+driver's struct dev_pm_ops object has to be assigned to the driver.pm field in
+its struct pci_driver object.  Once that has happened, the "legacy" PM callbacks
+in struct pci_driver are ignored (even if they are not NULL).
+
+The PM callbacks in struct dev_pm_ops are not mandatory and if they are not
+defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI
+subsystem will handle the device in a simplified default manner.  If they are
+defined, though, they are expected to behave as described in the following
+subsections.
+
+3.1.1. prepare()
+^^^^^^^^^^^^^^^^
+
+The prepare() callback is executed during system suspend, during hibernation
+(when a hibernation image is about to be created), during power-off after
+saving a hibernation image and during system restore, when a hibernation image
+has just been loaded into memory.
+
+This callback is only necessary if the driver's device has children that in
+general may be registered at any time.  In that case the role of the prepare()
+callback is to prevent new children of the device from being registered until
+one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run.
+
+In addition to that the prepare() callback may carry out some operations
+preparing the device to be suspended, although it should not allocate memory
+(if additional memory is required to suspend the device, it has to be
+preallocated earlier, for example in a suspend/hibernate notifier as described
+in Documentation/driver-api/pm/notifiers.rst).
+
+3.1.2. suspend()
+^^^^^^^^^^^^^^^^
+
+The suspend() callback is only executed during system suspend, after prepare()
+callbacks have been executed for all devices in the system.
+
+This callback is expected to quiesce the device and prepare it to be put into
+a low-power state by the PCI subsystem.  It is not required (in fact, it is
+not even recommended) that a PCI driver's suspend() callback save the standard
+configuration registers of the device, prepare it for waking up the system, or
+put it into a low-power state.  All of these operations can very well be taken
+care of by the PCI subsystem, without the driver's participation.
+
+However, in some rare cases it is convenient to carry out these operations in
+a PCI driver.  Then, pci_save_state(), pci_prepare_to_sleep(), and
+pci_set_power_state() should be used to save the device's standard
+configuration registers, to prepare it for system wakeup (if necessary), and
+to put it into a low-power state, respectively.  Moreover, if the driver calls
+pci_save_state(), the PCI subsystem will execute neither pci_prepare_to_sleep()
+nor pci_set_power_state() for its device, so the driver is then responsible
+for handling the device as appropriate.
+
+While the suspend() callback is being executed, the driver's interrupt handler
+can be invoked to handle an interrupt from the device, so all suspend-related
+operations relying on the driver's ability to handle interrupts should be
+carried out in this callback.
+
+3.1.3. suspend_noirq()
+^^^^^^^^^^^^^^^^^^^^^^
+
+The suspend_noirq() callback is only executed during system suspend, after
+suspend() callbacks have been executed for all devices in the system and
+after device interrupts have been disabled by the PM core.
+
+The difference between suspend_noirq() and suspend() is that the driver's
+interrupt handler will not be invoked while suspend_noirq() is running.  Thus
+suspend_noirq() can carry out operations that would cause race conditions to
+arise if they were performed in suspend().
+
+3.1.4. freeze()
+^^^^^^^^^^^^^^^
+
+The freeze() callback is hibernation-specific and is executed in two
+situations: during hibernation, after prepare() callbacks have been executed
+for all devices in preparation for the creation of a system image, and during
+restore, after a system image has been loaded into memory from persistent
+storage and the prepare() callbacks have been executed for all devices.
+
+The role of this callback is analogous to the role of the suspend() callback
+described above.  In fact, they only need to be different in the rare cases
+when the driver takes responsibility for putting the device into a low-power
+state.
+
+In those cases the freeze() callback should not prepare the device for system
+wakeup or put it into a low-power state.  Still, either it or freeze_noirq()
+should save the device's standard configuration registers using
+pci_save_state().
+
+3.1.5. freeze_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The freeze_noirq() callback is hibernation-specific.  It is executed during
+hibernation, after prepare() and freeze() callbacks have been executed for all
+devices in preparation for the creation of a system image, and during restore,
+after a system image has been loaded into memory and after prepare() and
+freeze() callbacks have been executed for all devices.  It is always executed
+after device interrupts have been disabled by the PM core.
+
+The role of this callback is analogous to the role of the suspend_noirq()
+callback described above and it very rarely is necessary to define
+freeze_noirq().
+
+The difference between freeze_noirq() and freeze() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.6. poweroff()
+^^^^^^^^^^^^^^^^^
+
+The poweroff() callback is hibernation-specific.  It is executed when the
+system is about to be powered off after saving a hibernation image to
+persistent storage.  prepare() callbacks are executed for all devices before
+poweroff() is called.
+
+The role of this callback is analogous to the role of the suspend() and
+freeze() callbacks described above, although it does not need to save the
+contents of the device's registers.  In particular, if the driver wants to put
+the device into a low-power state itself instead of allowing the PCI subsystem
+to do that, the poweroff() callback should use pci_prepare_to_sleep() and
+pci_set_power_state() to prepare the device for system wakeup and to put it
+into a low-power state, respectively, but it need not save the device's
+standard configuration registers.
+
+3.1.7. poweroff_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The poweroff_noirq() callback is hibernation-specific.  It is executed after
+poweroff() callbacks have been executed for all devices in the system.
+
+The role of this callback is analogous to the role of the suspend_noirq() and
+freeze_noirq() callbacks described above, but it does not need to save the
+contents of the device's registers.
+
+The difference between poweroff_noirq() and poweroff() is analogous to the
+difference between suspend_noirq() and suspend().
+
+3.1.8. resume_noirq()
+^^^^^^^^^^^^^^^^^^^^^
+
+The resume_noirq() callback is only executed during system resume, after the
+PM core has enabled the non-boot CPUs.  The driver's interrupt handler will
+not be invoked while resume_noirq() is running, so this callback can carry out
+operations that might race with the interrupt handler.
+
+Since the PCI subsystem unconditionally puts all devices into the full power
+state in the resume_noirq phase of system resume and restores their standard
+configuration registers, resume_noirq() is usually not necessary.  In general
+it should only be used for performing operations that would lead to race
+conditions if carried out by resume().
+
+3.1.9. resume()
+^^^^^^^^^^^^^^^
+
+The resume() callback is only executed during system resume, after
+resume_noirq() callbacks have been executed for all devices in the system and
+device interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-suspend configuration of
+the device and bringing it back to the fully functional state.  The device
+should be able to process I/O in the usual way after resume() has returned.
+
+3.1.10. thaw_noirq()
+^^^^^^^^^^^^^^^^^^^^
+
+The thaw_noirq() callback is hibernation-specific.  It is executed after a
+system image has been created and the non-boot CPUs have been enabled by the
+PM core, in the thaw_noirq phase of hibernation.  It also may be executed if
+the loading of a hibernation image fails during system restore (it is then
+executed after enabling the non-boot CPUs).  The driver's interrupt handler
+will not be invoked while thaw_noirq() is running.
+
+The role of this callback is analogous to the role of resume_noirq().  The
+difference between these two callbacks is that thaw_noirq() is executed after
+freeze() and freeze_noirq(), so in general it does not need to modify the
+contents of the device's registers.
+
+3.1.11. thaw()
+^^^^^^^^^^^^^^
+
+The thaw() callback is hibernation-specific.  It is executed after thaw_noirq()
+callbacks have been executed for all devices in the system and after device
+interrupts have been enabled by the PM core.
+
+This callback is responsible for restoring the pre-freeze configuration of
+the device, so that it will work in the usual way after thaw() has returned.
+
+3.1.12. restore_noirq()
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The restore_noirq() callback is hibernation-specific.  It is executed in the
+restore_noirq phase of hibernation, when the boot kernel has passed control to
+the image kernel and the non-boot CPUs have been enabled by the image kernel's
+PM core.
+ +This callback is analogous to resume_noirq() with the exception that it cannot +make any assumption on the previous state of the device, even if the BIOS (or +generally the platform firmware) is known to preserve that state over a +suspend-resume cycle. + +For the vast majority of PCI device drivers there is no difference between +resume_noirq() and restore_noirq(). + +3.1.13. restore() +^^^^^^^^^^^^^^^^^ + +The restore() callback is hibernation-specific. It is executed after +restore_noirq() callbacks have been executed for all devices in the system and +after the PM core has enabled device drivers' interrupt handlers to be invoked. + +This callback is analogous to resume(), just like restore_noirq() is analogous +to resume_noirq(). Consequently, the difference between restore_noirq() and +restore() is analogous to the difference between resume_noirq() and resume(). + +For the vast majority of PCI device drivers there is no difference between +resume() and restore(). + +3.1.14. complete() +^^^^^^^^^^^^^^^^^^ + +The complete() callback is executed in the following situations: + + - during system resume, after resume() callbacks have been executed for all + devices, + - during hibernation, before saving the system image, after thaw() callbacks + have been executed for all devices, + - during system restore, when the system is going back to its pre-hibernation + state, after restore() callbacks have been executed for all devices. + +It also may be executed if the loading of a hibernation image into memory fails +(in that case it is run after thaw() callbacks have been executed for all +devices that have drivers in the boot kernel). + +This callback is entirely optional, although it may be necessary if the +prepare() callback performs operations that need to be reversed. + +3.1.15. runtime_suspend() +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_suspend() callback is specific to device runtime power management +(runtime PM). It is executed by the PM core's runtime PM framework when the +device is about to be suspended (i.e. quiesced and put into a low-power state) +at run time. + +This callback is responsible for freezing the device and preparing it to be +put into a low-power state, but it must allow the PCI subsystem to perform all +of the PCI-specific actions necessary for suspending the device. + +3.1.16. runtime_resume() +^^^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_resume() callback is specific to device runtime PM. It is executed +by the PM core's runtime PM framework when the device is about to be resumed +(i.e. put into the full-power state and programmed to process I/O normally) at +run time. + +This callback is responsible for restoring the normal functionality of the +device after it has been put into the full-power state by the PCI subsystem. +The device is expected to be able to process I/O in the usual way after +runtime_resume() has returned. + +3.1.17. runtime_idle() +^^^^^^^^^^^^^^^^^^^^^^ + +The runtime_idle() callback is specific to device runtime PM. It is executed +by the PM core's runtime PM framework whenever it may be desirable to suspend +the device according to the PM core's information. In particular, it is +automatically executed right after runtime_resume() has returned in case the +resume of the device has happened as a result of a spurious event. 
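+
+A minimal sketch of what such a callback might look like (the foo_* names are
+made up for this example)::
+
+	static int foo_runtime_idle(struct device *dev)
+	{
+		struct foo_priv *priv = pci_get_drvdata(to_pci_dev(dev));
+
+		/* A non-zero return value blocks the suspend. */
+		if (foo_has_pending_work(priv))
+			return -EBUSY;
+
+		/* Returning 0 lets the PCI subsystem suspend the device. */
+		return 0;
+	}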
+
+The runtime_idle() callback is optional; if it is not implemented, or if it
+returns 0, the PCI subsystem will call pm_runtime_suspend() for the device,
+which in turn will cause the driver's runtime_suspend() callback to be
+executed.
+
+3.1.18. Pointing Multiple Callback Pointers to One Routine
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Although in principle each of the callbacks described in the previous
+subsections can be defined as a separate function, it often is convenient to
+point two or more members of struct dev_pm_ops to the same routine.  There are
+a few convenience macros that can be used for this purpose.
+
+The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
+suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
+members and one resume routine pointed to by the .resume(), .thaw(), and
+.restore() members.  The other function pointers in this struct dev_pm_ops are
+unset.
+
+The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
+additionally sets the .runtime_resume() pointer to the same value as
+.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
+the same value as .suspend() (and .freeze() and .poweroff()).
+
+The SET_SYSTEM_SLEEP_PM_OPS macro can be used inside of a declaration of
+struct dev_pm_ops to indicate that one suspend routine is to be pointed to by
+the .suspend(), .freeze(), and .poweroff() members and one resume routine is
+to be pointed to by the .resume(), .thaw(), and .restore() members.
+
+3.1.19. Driver Flags for Power Management
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The PM core allows device drivers to set flags that influence the handling of
+power management for the devices by the core itself and by middle layer code
+including the PCI bus type.  The flags should be set once at the driver probe
+time with the help of the dev_pm_set_driver_flags() function and they should
+not be updated directly afterwards.
+
+The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the
+direct-complete mechanism, which allows device suspend/resume callbacks to be
+skipped if the device is in runtime suspend when the system suspend starts.
+That also affects all of the ancestors of the device, so this flag should only
+be used if absolutely necessary.
+
+The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
+positive value from pci_pm_prepare() if the ->prepare callback provided by the
+driver of the device returns a positive value.  That allows the driver to opt
+out from using the direct-complete mechanism dynamically.
+
+The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
+perspective the device can be safely left in runtime suspend during system
+suspend.  That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
+to skip resuming the device from runtime suspend unless there are PCI-specific
+reasons for doing that.  Also, it causes pci_pm_suspend_late/noirq(),
+pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
+if the device remains in runtime suspend at the beginning of the "late" phase
+of the system-wide transition under way.  Moreover, if the device is in
+runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its
+runtime power management status will be changed to "active" (as it is going to
+be put into D0 going forward), but if it is in runtime suspend in
+pci_pm_thaw_noirq(), the function will set the power.direct_complete flag for
+it (to make the PM core skip the subsequent "thaw" callbacks for it) and
+return.
+
+Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
+device to be left in suspend after system-wide transitions to the working
+state.  This flag is checked by the PM core, but the PCI bus type informs the
+PM core which devices may be left in suspend from its perspective (that
+happens during the "noirq" phase of system-wide suspend and analogous
+transitions).  Next, it uses the dev_pm_may_skip_resume() helper to decide
+whether or not to return from pci_pm_resume_noirq() early: the PM core will
+skip the remaining resume callbacks for the device during the transition under
+way and will set its runtime PM status to "suspended" if
+dev_pm_may_skip_resume() returns "true" for it.
+
+3.2. Device Runtime Power Management
+------------------------------------
+
+In addition to providing device power management callbacks PCI device drivers
+are responsible for controlling the runtime power management (runtime PM) of
+their devices.
+
+The PCI device runtime PM is optional, but it is recommended that PCI device
+drivers implement it at least in the cases where there is a reliable way of
+verifying that the device is not used (like when the network cable is detached
+from an Ethernet adapter or there are no devices attached to a USB controller).
+
+To support the PCI runtime PM the driver first needs to implement the
+runtime_suspend() and runtime_resume() callbacks.  It also may need to
+implement the runtime_idle() callback to prevent the device from being
+suspended again every time right after the runtime_resume() callback has
+returned (alternatively, the runtime_suspend() callback will have to check if
+the device should really be suspended and return -EAGAIN if that is not the
+case).
+
+The runtime PM of PCI devices is enabled by default by the PCI core.  PCI
+device drivers do not need to enable it and should not attempt to do so.
+However, it is blocked by pci_pm_init(), which runs the pm_runtime_forbid()
+helper function.  In addition to that, the runtime PM usage counter of
+each PCI device is incremented by local_pci_probe() before executing the
+probe callback provided by the device's driver.
+
+If a PCI driver implements the runtime PM callbacks and intends to use the
+runtime PM framework provided by the PM core and the PCI subsystem, it needs
+to decrement the device's runtime PM usage counter in its probe callback
+function.  If it doesn't do that, the counter will always be different from
+zero for the device and it will never be runtime-suspended.  The simplest
+way to do that is by calling pm_runtime_put_noidle(), but if the driver
+wants to schedule an autosuspend right away, for example, it may call
+pm_runtime_put_autosuspend() instead for this purpose.  Generally, it
+just needs to call a function that decrements the device's usage counter
+from its probe routine to make runtime PM work for the device, as shown in
+the sketch below.
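+
+A sketch of a probe routine doing that (the foo_* names are made up; the
+dev_pm_set_driver_flags() line is optional and only included as an example of
+setting driver flags at probe time, as described in Section 3.1.19)::
+
+	static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+	{
+		int error;
+
+		error = foo_init_device(pdev);	/* driver-specific setup (made up) */
+		if (error)
+			return error;
+
+		dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_SUSPEND);
+
+		/* Balance the usage counter incremented by local_pci_probe()
+		 * so that the device can be runtime-suspended later. */
+		pm_runtime_put_noidle(&pdev->dev);
+		return 0;
+	}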
+
+It is important to remember that the driver's runtime_suspend() callback
+may be executed right after the usage counter has been decremented, because
+user space may already have used sysfs to run the pm_runtime_allow() helper
+function, which unblocks the runtime PM of the device, so the driver must
+be prepared to cope with that.
+
+The driver itself should not call pm_runtime_allow(), though.  Instead, it
+should let user space or some platform-specific code do that (user space can
+do it via sysfs as stated above), but it must be prepared to handle the
+runtime PM of the device correctly as soon as pm_runtime_allow() is called
+(which may happen at any time, even before the driver is loaded).
+
+When the driver's remove callback runs, it has to balance the decrement of
+the device's runtime PM usage counter made at probe time.  For this reason,
+if it has decremented the counter in its probe callback, it must run
+pm_runtime_get_noresume() in its remove callback.  [Since the core carries
+out a runtime resume of the device and bumps up the device's usage counter
+before running the driver's remove callback, the runtime PM of the device
+is effectively disabled for the duration of the remove execution and all
+runtime PM helper functions incrementing the device's usage counter are
+then effectively equivalent to pm_runtime_get_noresume().]
+
+The runtime PM framework works by processing requests to suspend or resume
+devices, or to check if they are idle (in which case it is reasonable to
+subsequently request that they be suspended).  These requests are represented
+by work items put into the power management workqueue, pm_wq.  Although there
+are a few situations in which power management requests are automatically
+queued by the PM core (for example, after processing a request to resume a
+device the PM core automatically queues a request to check if the device is
+idle), device drivers are generally responsible for queuing power management
+requests for their devices.  For this purpose they should use the runtime PM
+helper functions provided by the PM core, discussed in
+Documentation/power/runtime_pm.rst.
+
+Devices can also be suspended and resumed synchronously, without placing a
+request into pm_wq.  In the majority of cases this also is done by their
+drivers, using helper functions provided by the PM core for this purpose.
+
+For more information on the runtime PM of devices refer to
+Documentation/power/runtime_pm.rst.
+
+
+4. Resources
+============
+
+PCI Local Bus Specification, Rev. 3.0
+
+PCI Bus Power Management Interface Specification, Rev. 1.2
+
+Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
+
+PCI Express Base Specification, Rev. 2.0
+
+Documentation/driver-api/pm/devices.rst
+
+Documentation/power/runtime_pm.rst
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
deleted file mode 100644
index 8eaf9ee24d43..000000000000
--- a/Documentation/power/pci.txt
+++ /dev/null
@@ -1,1094 +0,0 @@
-PCI Power Management
-
-Copyright (c) 2010 Rafael J. Wysocki, Novell Inc.
-
-An overview of concepts and the Linux kernel's interfaces related to PCI power
-management.  Based on previous work by Patrick Mochel
-(and others).
-
-This document only covers the aspects of power management specific to PCI
-devices.  For general description of the kernel's interfaces related to device
-power management refer to Documentation/driver-api/pm/devices.rst and
-Documentation/power/runtime_pm.txt.
- ---------------------------------------------------------------------------- - -1. Hardware and Platform Support for PCI Power Management -2. PCI Subsystem and Device Power Management -3. PCI Device Drivers and Power Management -4. Resources - - -1. Hardware and Platform Support for PCI Power Management -========================================================= - -1.1. Native and Platform-Based Power Management ------------------------------------------------ -In general, power management is a feature allowing one to save energy by putting -devices into states in which they draw less power (low-power states) at the -price of reduced functionality or performance. - -Usually, a device is put into a low-power state when it is underutilized or -completely inactive. However, when it is necessary to use the device once -again, it has to be put back into the "fully functional" state (full-power -state). This may happen when there are some data for the device to handle or -as a result of an external event requiring the device to be active, which may -be signaled by the device itself. - -PCI devices may be put into low-power states in two ways, by using the device -capabilities introduced by the PCI Bus Power Management Interface Specification, -or with the help of platform firmware, such as an ACPI BIOS. In the first -approach, that is referred to as the native PCI power management (native PCI PM) -in what follows, the device power state is changed as a result of writing a -specific value into one of its standard configuration registers. The second -approach requires the platform firmware to provide special methods that may be -used by the kernel to change the device's power state. - -Devices supporting the native PCI PM usually can generate wakeup signals called -Power Management Events (PMEs) to let the kernel know about external events -requiring the device to be active. After receiving a PME the kernel is supposed -to put the device that sent it into the full-power state. However, the PCI Bus -Power Management Interface Specification doesn't define any standard method of -delivering the PME from the device to the CPU and the operating system kernel. -It is assumed that the platform firmware will perform this task and therefore, -even though a PCI device is set up to generate PMEs, it also may be necessary to -prepare the platform firmware for notifying the CPU of the PMEs coming from the -device (e.g. by generating interrupts). - -In turn, if the methods provided by the platform firmware are used for changing -the power state of a device, usually the platform also provides a method for -preparing the device to generate wakeup signals. In that case, however, it -often also is necessary to prepare the device for generating PMEs using the -native PCI PM mechanism, because the method provided by the platform depends on -that. - -Thus in many situations both the native and the platform-based power management -mechanisms have to be used simultaneously to obtain the desired result. - -1.2. Native PCI Power Management --------------------------------- -The PCI Bus Power Management Interface Specification (PCI PM Spec) was -introduced between the PCI 2.1 and PCI 2.2 Specifications. It defined a -standard interface for performing various operations related to power -management. - -The implementation of the PCI PM Spec is optional for conventional PCI devices, -but it is mandatory for PCI Express devices. 
If a device supports the PCI PM -Spec, it has an 8 byte power management capability field in its PCI -configuration space. This field is used to describe and control the standard -features related to the native PCI power management. - -The PCI PM Spec defines 4 operating states for devices (D0-D3) and for buses -(B0-B3). The higher the number, the less power is drawn by the device or bus -in that state. However, the higher the number, the longer the latency for -the device or bus to return to the full-power state (D0 or B0, respectively). - -There are two variants of the D3 state defined by the specification. The first -one is D3hot, referred to as the software accessible D3, because devices can be -programmed to go into it. The second one, D3cold, is the state that PCI devices -are in when the supply voltage (Vcc) is removed from them. It is not possible -to program a PCI device to go into D3cold, although there may be a programmable -interface for putting the bus the device is on into a state in which Vcc is -removed from all devices on the bus. - -PCI bus power management, however, is not supported by the Linux kernel at the -time of this writing and therefore it is not covered by this document. - -Note that every PCI device can be in the full-power state (D0) or in D3cold, -regardless of whether or not it implements the PCI PM Spec. In addition to -that, if the PCI PM Spec is implemented by the device, it must support D3hot -as well as D0. The support for the D1 and D2 power states is optional. - -PCI devices supporting the PCI PM Spec can be programmed to go to any of the -supported low-power states (except for D3cold). While in D1-D3hot the -standard configuration registers of the device must be accessible to software -(i.e. the device is required to respond to PCI configuration accesses), although -its I/O and memory spaces are then disabled. This allows the device to be -programmatically put into D0. Thus the kernel can switch the device back and -forth between D0 and the supported low-power states (except for D3cold) and the -possible power state transitions the device can undergo are the following: - -+----------------------------+ -| Current State | New State | -+----------------------------+ -| D0 | D1, D2, D3 | -+----------------------------+ -| D1 | D2, D3 | -+----------------------------+ -| D2 | D3 | -+----------------------------+ -| D1, D2, D3 | D0 | -+----------------------------+ - -The transition from D3cold to D0 occurs when the supply voltage is provided to -the device (i.e. power is restored). In that case the device returns to D0 with -a full power-on reset sequence and the power-on defaults are restored to the -device by hardware just as at initial power up. - -PCI devices supporting the PCI PM Spec can be programmed to generate PMEs -while in a low-power state (D1-D3), but they are not required to be capable -of generating PMEs from all supported low-power states. In particular, the -capability of generating PMEs from D3cold is optional and depends on the -presence of additional voltage (3.3Vaux) allowing the device to remain -sufficiently active to generate a wakeup signal. - -1.3. ACPI Device Power Management ---------------------------------- -The platform firmware support for the power management of PCI devices is -system-specific. 
However, if the system in question is compliant with the -Advanced Configuration and Power Interface (ACPI) Specification, like the -majority of x86-based systems, it is supposed to implement device power -management interfaces defined by the ACPI standard. - -For this purpose the ACPI BIOS provides special functions called "control -methods" that may be executed by the kernel to perform specific tasks, such as -putting a device into a low-power state. These control methods are encoded -using special byte-code language called the ACPI Machine Language (AML) and -stored in the machine's BIOS. The kernel loads them from the BIOS and executes -them as needed using an AML interpreter that translates the AML byte code into -computations and memory or I/O space accesses. This way, in theory, a BIOS -writer can provide the kernel with a means to perform actions depending -on the system design in a system-specific fashion. - -ACPI control methods may be divided into global control methods, that are not -associated with any particular devices, and device control methods, that have -to be defined separately for each device supposed to be handled with the help of -the platform. This means, in particular, that ACPI device control methods can -only be used to handle devices that the BIOS writer knew about in advance. The -ACPI methods used for device power management fall into that category. - -The ACPI specification assumes that devices can be in one of four power states -labeled as D0, D1, D2, and D3 that roughly correspond to the native PCI PM -D0-D3 states (although the difference between D3hot and D3cold is not taken -into account by ACPI). Moreover, for each power state of a device there is a -set of power resources that have to be enabled for the device to be put into -that state. These power resources are controlled (i.e. enabled or disabled) -with the help of their own control methods, _ON and _OFF, that have to be -defined individually for each of them. - -To put a device into the ACPI power state Dx (where x is a number between 0 and -3 inclusive) the kernel is supposed to (1) enable the power resources required -by the device in this state using their _ON control methods and (2) execute the -_PSx control method defined for the device. In addition to that, if the device -is going to be put into a low-power state (D1-D3) and is supposed to generate -wakeup signals from that state, the _DSW (or _PSW, replaced with _DSW by ACPI -3.0) control method defined for it has to be executed before _PSx. Power -resources that are not required by the device in the target power state and are -not required any more by any other device should be disabled (by executing their -_OFF control methods). If the current power state of the device is D3, it can -only be put into D0 this way. - -However, quite often the power states of devices are changed during a -system-wide transition into a sleep state or back into the working state. ACPI -defines four system sleep states, S1, S2, S3, and S4, and denotes the system -working state as S0. In general, the target system sleep (or working) state -determines the highest power (lowest number) state the device can be put -into and the kernel is supposed to obtain this information by executing the -device's _SxD control method (where x is a number between 0 and 4 inclusive). -If the device is required to wake up the system from the target sleep state, the -lowest power (highest number) state it can be put into is also determined by the -target state of the system. 
The kernel is then supposed to use the device's -_SxW control method to obtain the number of that state. It also is supposed to -use the device's _PRW control method to learn which power resources need to be -enabled for the device to be able to generate wakeup signals. - -1.4. Wakeup Signaling ---------------------- -Wakeup signals generated by PCI devices, either as native PCI PMEs, or as -a result of the execution of the _DSW (or _PSW) ACPI control method before -putting the device into a low-power state, have to be caught and handled as -appropriate. If they are sent while the system is in the working state -(ACPI S0), they should be translated into interrupts so that the kernel can -put the devices generating them into the full-power state and take care of the -events that triggered them. In turn, if they are sent while the system is -sleeping, they should cause the system's core logic to trigger wakeup. - -On ACPI-based systems wakeup signals sent by conventional PCI devices are -converted into ACPI General-Purpose Events (GPEs) which are hardware signals -from the system core logic generated in response to various events that need to -be acted upon. Every GPE is associated with one or more sources of potentially -interesting events. In particular, a GPE may be associated with a PCI device -capable of signaling wakeup. The information on the connections between GPEs -and event sources is recorded in the system's ACPI BIOS from where it can be -read by the kernel. - -If a PCI device known to the system's ACPI BIOS signals wakeup, the GPE -associated with it (if there is one) is triggered. The GPEs associated with PCI -bridges may also be triggered in response to a wakeup signal from one of the -devices below the bridge (this also is the case for root bridges) and, for -example, native PCI PMEs from devices unknown to the system's ACPI BIOS may be -handled this way. - -A GPE may be triggered when the system is sleeping (i.e. when it is in one of -the ACPI S1-S4 states), in which case system wakeup is started by its core logic -(the device that was the source of the signal causing the system wakeup to occur -may be identified later). The GPEs used in such situations are referred to as -wakeup GPEs. - -Usually, however, GPEs are also triggered when the system is in the working -state (ACPI S0) and in that case the system's core logic generates a System -Control Interrupt (SCI) to notify the kernel of the event. Then, the SCI -handler identifies the GPE that caused the interrupt to be generated which, -in turn, allows the kernel to identify the source of the event (that may be -a PCI device signaling wakeup). The GPEs used for notifying the kernel of -events occurring while the system is in the working state are referred to as -runtime GPEs. - -Unfortunately, there is no standard way of handling wakeup signals sent by -conventional PCI devices on systems that are not ACPI-based, but there is one -for PCI Express devices. Namely, the PCI Express Base Specification introduced -a native mechanism for converting native PCI PMEs into interrupts generated by -root ports. For conventional PCI devices native PMEs are out-of-band, so they -are routed separately and they need not pass through bridges (in principle they -may be routed directly to the system's core logic), but for PCI Express devices -they are in-band messages that have to pass through the PCI Express hierarchy, -including the root port on the path from the device to the Root Complex. 
Thus -it was possible to introduce a mechanism by which a root port generates an -interrupt whenever it receives a PME message from one of the devices below it. -The PCI Express Requester ID of the device that sent the PME message is then -recorded in one of the root port's configuration registers from where it may be -read by the interrupt handler allowing the device to be identified. [PME -messages sent by PCI Express endpoints integrated with the Root Complex don't -pass through root ports, but instead they cause a Root Complex Event Collector -(if there is one) to generate interrupts.] - -In principle the native PCI Express PME signaling may also be used on ACPI-based -systems along with the GPEs, but to use it the kernel has to ask the system's -ACPI BIOS to release control of root port configuration registers. The ACPI -BIOS, however, is not required to allow the kernel to control these registers -and if it doesn't do that, the kernel must not modify their contents. Of course -the native PCI Express PME signaling cannot be used by the kernel in that case. - - -2. PCI Subsystem and Device Power Management -============================================ - -2.1. Device Power Management Callbacks --------------------------------------- -The PCI Subsystem participates in the power management of PCI devices in a -number of ways. First of all, it provides an intermediate code layer between -the device power management core (PM core) and PCI device drivers. -Specifically, the pm field of the PCI subsystem's struct bus_type object, -pci_bus_type, points to a struct dev_pm_ops object, pci_dev_pm_ops, containing -pointers to several device power management callbacks: - -const struct dev_pm_ops pci_dev_pm_ops = { - .prepare = pci_pm_prepare, - .complete = pci_pm_complete, - .suspend = pci_pm_suspend, - .resume = pci_pm_resume, - .freeze = pci_pm_freeze, - .thaw = pci_pm_thaw, - .poweroff = pci_pm_poweroff, - .restore = pci_pm_restore, - .suspend_noirq = pci_pm_suspend_noirq, - .resume_noirq = pci_pm_resume_noirq, - .freeze_noirq = pci_pm_freeze_noirq, - .thaw_noirq = pci_pm_thaw_noirq, - .poweroff_noirq = pci_pm_poweroff_noirq, - .restore_noirq = pci_pm_restore_noirq, - .runtime_suspend = pci_pm_runtime_suspend, - .runtime_resume = pci_pm_runtime_resume, - .runtime_idle = pci_pm_runtime_idle, -}; - -These callbacks are executed by the PM core in various situations related to -device power management and they, in turn, execute power management callbacks -provided by PCI device drivers. They also perform power management operations -involving some standard configuration registers of PCI devices that device -drivers need not know or care about. - -The structure representing a PCI device, struct pci_dev, contains several fields -that these callbacks operate on: - -struct pci_dev { - ... - pci_power_t current_state; /* Current operating state. */ - int pm_cap; /* PM capability offset in the - configuration space */ - unsigned int pme_support:5; /* Bitmask of states from which PME# - can be generated */ - unsigned int pme_interrupt:1;/* Is native PCIe PME signaling used? */ - unsigned int d1_support:1; /* Low power state D1 is supported */ - unsigned int d2_support:1; /* Low power state D2 is supported */ - unsigned int no_d1d2:1; /* D1 and D2 are forbidden */ - unsigned int wakeup_prepared:1; /* Device prepared for wake up */ - unsigned int d3_delay; /* D3->D0 transition time in ms */ - ... -}; - -They also indirectly use some fields of the struct device that is embedded in -struct pci_dev. - -2.2. 
Device Initialization --------------------------- -The PCI subsystem's first task related to device power management is to -prepare the device for power management and initialize the fields of struct -pci_dev used for this purpose. This happens in two functions defined in -drivers/pci/pci.c, pci_pm_init() and platform_pci_wakeup_init(). - -The first of these functions checks if the device supports native PCI PM -and if that's the case the offset of its power management capability structure -in the configuration space is stored in the pm_cap field of the device's struct -pci_dev object. Next, the function checks which PCI low-power states are -supported by the device and from which low-power states the device can generate -native PCI PMEs. The power management fields of the device's struct pci_dev and -the struct device embedded in it are updated accordingly and the generation of -PMEs by the device is disabled. - -The second function checks if the device can be prepared to signal wakeup with -the help of the platform firmware, such as the ACPI BIOS. If that is the case, -the function updates the wakeup fields in struct device embedded in the -device's struct pci_dev and uses the firmware-provided method to prevent the -device from signaling wakeup. - -At this point the device is ready for power management. For driverless devices, -however, this functionality is limited to a few basic operations carried out -during system-wide transitions to a sleep state and back to the working state. - -2.3. Runtime Device Power Management ------------------------------------- -The PCI subsystem plays a vital role in the runtime power management of PCI -devices. For this purpose it uses the general runtime power management -(runtime PM) framework described in Documentation/power/runtime_pm.txt. -Namely, it provides subsystem-level callbacks: - - pci_pm_runtime_suspend() - pci_pm_runtime_resume() - pci_pm_runtime_idle() - -that are executed by the core runtime PM routines. It also implements the -entire mechanics necessary for handling runtime wakeup signals from PCI devices -in low-power states, which at the time of this writing works for both the native -PCI Express PME signaling and the ACPI GPE-based wakeup signaling described in -Section 1. - -First, a PCI device is put into a low-power state, or suspended, with the help -of pm_schedule_suspend() or pm_runtime_suspend() which for PCI devices call -pci_pm_runtime_suspend() to do the actual job. For this to work, the device's -driver has to provide a pm->runtime_suspend() callback (see below), which is -run by pci_pm_runtime_suspend() as the first action. If the driver's callback -returns successfully, the device's standard configuration registers are saved, -the device is prepared to generate wakeup signals and, finally, it is put into -the target low-power state. - -The low-power state to put the device into is the lowest-power (highest number) -state from which it can signal wakeup. The exact method of signaling wakeup is -system-dependent and is determined by the PCI subsystem on the basis of the -reported capabilities of the device and the platform firmware. To prepare the -device for signaling wakeup and put it into the selected low-power state, the -PCI subsystem can use the platform firmware as well as the device's native PCI -PM capabilities, if supported. - -It is expected that the device driver's pm->runtime_suspend() callback will -not attempt to prepare the device for signaling wakeup or to put it into a -low-power state. 
The driver ought to leave these tasks to the PCI subsystem
-that has all of the information necessary to perform them.
-
-A suspended device is brought back into the "active" state, or resumed,
-with the help of pm_request_resume() or pm_runtime_resume() which both call
-pci_pm_runtime_resume() for PCI devices. Again, this only works if the device's
-driver provides a pm->runtime_resume() callback (see below). However, before
-the driver's callback is executed, pci_pm_runtime_resume() brings the device
-back into the full-power state, prevents it from signaling wakeup while in that
-state and restores its standard configuration registers. Thus the driver's
-callback need not worry about the PCI-specific aspects of the device resume.
-
-Note that generally pci_pm_runtime_resume() may be called in two different
-situations. First, it may be called at the request of the device's driver, for
-example if there are some data for it to process. Second, it may be called
-as a result of a wakeup signal from the device itself (this sometimes is
-referred to as "remote wakeup"). Of course, for this purpose the wakeup signal
-is handled in one of the ways described in Section 1 and finally converted into
-a notification for the PCI subsystem after the source device has been
-identified.
-
-The pci_pm_runtime_idle() function, called for PCI devices by pm_runtime_idle()
-and pm_request_idle(), executes the device driver's pm->runtime_idle()
-callback, if defined, and if that callback doesn't return an error code (or is
-not present at all), suspends the device with the help of pm_runtime_suspend().
-Sometimes pci_pm_runtime_idle() is called automatically by the PM core (for
-example, it is called right after the device has been resumed), in which
-cases it is expected to suspend the device if that makes sense. Usually,
-however, the PCI subsystem doesn't know whether the device really can be
-suspended, so it lets the device's driver decide by running its
-pm->runtime_idle() callback.
-
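-As an illustration of this flow, a driver's I/O submission path might bracket
-hardware accesses with the runtime PM helpers as in the sketch below. This is
-only a sketch: the mydrv_* and priv names are hypothetical, error handling is
-minimal, and the helpers are documented in Documentation/power/runtime_pm.txt.
-
-	static int mydrv_start_io(struct mydrv_priv *priv)
-	{
-		struct device *dev = &priv->pdev->dev;
-		int err;
-
-		/* Resume the device if it is suspended; this ends up in
-		 * pci_pm_runtime_resume() and hence in the driver's
-		 * pm->runtime_resume() callback. */
-		err = pm_runtime_get_sync(dev);
-		if (err < 0) {
-			pm_runtime_put_noidle(dev);
-			return err;
-		}
-
-		/* ... program the hardware and start the transfer ... */
-
-		/* Drop the reference when done; the PM core may then run
-		 * pci_pm_runtime_idle() and suspend the device again. */
-		pm_runtime_put(dev);
-		return 0;
-	}
-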
-2.4. System-Wide Power Transitions
-----------------------------------
-There are a few different types of system-wide power transitions, described in
-Documentation/driver-api/pm/devices.rst. Each of them requires devices to be handled
-in a specific way and the PM core executes subsystem-level power management
-callbacks for this purpose. They are executed in phases such that each phase
-involves executing the same subsystem-level callback for every device belonging
-to the given subsystem before the next phase begins. These phases always run
-after tasks have been frozen.
-
-2.4.1. System Suspend
-
-When the system is going into a sleep state in which the contents of memory will
-be preserved, such as one of the ACPI sleep states S1-S3, the phases are:
-
-	prepare, suspend, suspend_noirq.
-
-The following PCI bus type's callbacks, respectively, are used in these phases:
-
-	pci_pm_prepare()
-	pci_pm_suspend()
-	pci_pm_suspend_noirq()
-
-The pci_pm_prepare() routine first puts the device into the "fully functional"
-state with the help of pm_runtime_resume(). Then, it executes the device
-driver's pm->prepare() callback if defined (i.e. if the driver's struct
-dev_pm_ops object is present and the prepare pointer in that object is valid).
-
-The pci_pm_suspend() routine first checks if the device's driver implements
-legacy PCI suspend routines (see Section 3), in which case the driver's legacy
-suspend callback is executed, if present, and its result is returned. Next, if
-the device's driver doesn't provide a struct dev_pm_ops object (containing
-pointers to the driver's callbacks), pci_pm_default_suspend() is called, which
-simply turns off the device's bus master capability and runs
-pcibios_disable_device() to disable it, unless the device is a bridge (PCI
-bridges are ignored by this routine). Then, the device driver's pm->suspend()
-callback is executed, if defined, and its result is returned if it fails.
-Finally, pci_fixup_device() is called to apply hardware suspend quirks related
-to the device if necessary.
-
-Note that the suspend phase is carried out asynchronously for PCI devices, so
-the pci_pm_suspend() callback may be executed in parallel for any pair of PCI
-devices that don't depend on each other in a known way (i.e. none of the paths
-in the device tree from the root bridge to a leaf device contains both of them).
-
-The pci_pm_suspend_noirq() routine is executed after suspend_device_irqs() has
-been called, which means that the device driver's interrupt handler won't be
-invoked while this routine is running. It first checks if the device's driver
-implements legacy PCI suspend routines (see Section 3), in which case the legacy
-late suspend routine is called and its result is returned (the standard
-configuration registers of the device are saved if the driver's callback hasn't
-done that). Second, if the device driver's struct dev_pm_ops object is not
-present, the device's standard configuration registers are saved and the routine
-returns success. Otherwise the device driver's pm->suspend_noirq() callback is
-executed, if present, and its result is returned if it fails. Next, if the
-device's standard configuration registers haven't been saved yet (one of the
-device driver's callbacks executed before might do that), pci_pm_suspend_noirq()
-saves them, prepares the device to signal wakeup (if necessary) and puts it into
-a low-power state.
-
-The low-power state to put the device into is the lowest-power (highest number)
-state from which it can signal wakeup while the system is in the target sleep
-state. Just like in the runtime PM case described above, the mechanism of
-signaling wakeup is system-dependent and determined by the PCI subsystem, which
-is also responsible for preparing the device to signal wakeup from the system's
-target sleep state as appropriate.
-
-PCI device drivers (that don't implement legacy power management callbacks) are
-generally not expected to prepare devices for signaling wakeup or to put them
-into low-power states. However, if one of the driver's suspend callbacks
-(pm->suspend() or pm->suspend_noirq()) saves the device's standard configuration
-registers, pci_pm_suspend_noirq() will assume that the device has been prepared
-to signal wakeup and put into a low-power state by the driver (the driver is
-then assumed to have used the helper functions provided by the PCI subsystem for
-this purpose). PCI device drivers are not encouraged to do that, but in some
-rare cases doing that in the driver may be the optimum approach.
-
-2.4.2. System Resume
-
-When the system is undergoing a transition from a sleep state in which the
-contents of memory have been preserved, such as one of the ACPI sleep states
-S1-S3, into the working state (ACPI S0), the phases are:
-
-	resume_noirq, resume, complete.
-
-The following PCI bus type's callbacks, respectively, are executed in these
-phases:
-
-	pci_pm_resume_noirq()
-	pci_pm_resume()
-	pci_pm_complete()
-
-The pci_pm_resume_noirq() routine first puts the device into the full-power
-state, restores its standard configuration registers and applies early resume
-hardware quirks related to the device, if necessary. This is done
-unconditionally, regardless of whether or not the device's driver implements
-legacy PCI power management callbacks (this way all PCI devices are in the
-full-power state and their standard configuration registers have been restored
-when their interrupt handlers are invoked for the first time during resume,
-which allows the kernel to avoid problems with the handling of shared interrupts
-by drivers whose devices are still suspended). If legacy PCI power management
-callbacks (see Section 3) are implemented by the device's driver, the legacy
-early resume callback is executed and its result is returned. Otherwise, the
-device driver's pm->resume_noirq() callback is executed, if defined, and its
-result is returned.
-
-The pci_pm_resume() routine first checks if the device's standard configuration
-registers have been restored and restores them if that's not the case (this
-is only necessary in the error path during a failing suspend). Next, resume
-hardware quirks related to the device are applied, if necessary, and if the
-device's driver implements legacy PCI power management callbacks (see
-Section 3), the driver's legacy resume callback is executed and its result is
-returned. Otherwise, the device's wakeup signaling mechanisms are blocked and
-its driver's pm->resume() callback is executed, if defined (the callback's
-result is then returned).
-
-The resume phase is carried out asynchronously for PCI devices, like the
-suspend phase described above, which means that if two PCI devices don't depend
-on each other in a known way, the pci_pm_resume() routine may be executed for
-both of them in parallel.
-
-The pci_pm_complete() routine only executes the device driver's pm->complete()
-callback, if defined.
-
-2.4.3. System Hibernation
-
-System hibernation is more complicated than system suspend, because it requires
-a system image to be created and written into a persistent storage medium. The
-image is created atomically and all devices are quiesced, or frozen, before that
-happens.
-
-The freezing of devices is carried out after enough memory has been freed (at
-the time of this writing the image creation requires at least 50% of system RAM
-to be free) in the following three phases:
-
-	prepare, freeze, freeze_noirq
-
-that correspond to the PCI bus type's callbacks:
-
-	pci_pm_prepare()
-	pci_pm_freeze()
-	pci_pm_freeze_noirq()
-
-This means that the prepare phase is exactly the same as for system suspend.
-The other two phases, however, are different.
-
-The pci_pm_freeze() routine is quite similar to pci_pm_suspend(), but it runs
-the device driver's pm->freeze() callback, if defined, instead of pm->suspend(),
-and it doesn't apply the suspend-related hardware quirks. It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The pci_pm_freeze_noirq() routine, in turn, is similar to
-pci_pm_suspend_noirq(), but it calls the device driver's pm->freeze_noirq()
-routine instead of pm->suspend_noirq(). It also doesn't attempt to prepare the
-device for signaling wakeup or to put it into a low-power state.
Still, it saves
-the device's standard configuration registers if they haven't been saved by one
-of the driver's callbacks.
-
-Once the image has been created, it has to be saved. However, at this point all
-devices are frozen and they cannot handle I/O, while their ability to handle
-I/O is obviously necessary for saving the image. Thus they have to be brought
-back to the fully functional state and this is done in the following phases:
-
-	thaw_noirq, thaw, complete
-
-using the following PCI bus type's callbacks:
-
-	pci_pm_thaw_noirq()
-	pci_pm_thaw()
-	pci_pm_complete()
-
-respectively.
-
-The first of them, pci_pm_thaw_noirq(), is analogous to pci_pm_resume_noirq(),
-but it doesn't put the device into the full power state and doesn't attempt to
-restore its standard configuration registers. It also executes the device
-driver's pm->thaw_noirq() callback, if defined, instead of pm->resume_noirq().
-
-The pci_pm_thaw() routine is similar to pci_pm_resume(), but it runs the device
-driver's pm->thaw() callback instead of pm->resume(). It is executed
-asynchronously for different PCI devices that don't depend on each other in a
-known way.
-
-The complete phase is the same as for system resume.
-
-After saving the image, devices need to be powered down before the system can
-enter the target sleep state (ACPI S4 for ACPI-based systems). This is done in
-three phases:
-
-	prepare, poweroff, poweroff_noirq
-
-where the prepare phase is exactly the same as for system suspend. The other
-two phases are analogous to the suspend and suspend_noirq phases, respectively.
-The PCI subsystem-level callbacks they correspond to,
-
-	pci_pm_poweroff()
-	pci_pm_poweroff_noirq()
-
-work in analogy with pci_pm_suspend() and pci_pm_suspend_noirq(), respectively,
-although they don't attempt to save the device's standard configuration
-registers.
-
-2.4.4. System Restore
-
-System restore requires a hibernation image to be loaded into memory and the
-pre-hibernation memory contents to be restored before the pre-hibernation system
-activity can be resumed.
-
-As described in Documentation/driver-api/pm/devices.rst, the hibernation image is loaded
-into memory by a fresh instance of the kernel, called the boot kernel, which in
-turn is loaded and run by a boot loader in the usual way. After the boot kernel
-has loaded the image, it needs to replace its own code and data with the code
-and data of the "hibernated" kernel stored within the image, called the image
-kernel. For this purpose all devices are frozen just like before creating
-the image during hibernation, in the
-
-	prepare, freeze, freeze_noirq
-
-phases described above. However, the devices affected by these phases are only
-those having drivers in the boot kernel; other devices will still be in whatever
-state the boot loader left them.
-
-Should the restoration of the pre-hibernation memory contents fail, the boot
-kernel would go through the "thawing" procedure described above, using the
-thaw_noirq, thaw, and complete phases (that will only affect the devices having
-drivers in the boot kernel), and then continue running normally.
-
-If the pre-hibernation memory contents are restored successfully, which is the
-usual situation, control is passed to the image kernel, which then becomes
-responsible for bringing the system back to the working state.
To achieve this, -it must restore the devices' pre-hibernation functionality, which is done much -like waking up from the memory sleep state, although it involves different -phases: - - restore_noirq, restore, complete - -The first two of these are analogous to the resume_noirq and resume phases -described above, respectively, and correspond to the following PCI subsystem -callbacks: - - pci_pm_restore_noirq() - pci_pm_restore() - -These callbacks work in analogy with pci_pm_resume_noirq() and pci_pm_resume(), -respectively, but they execute the device driver's pm->restore_noirq() and -pm->restore() callbacks, if available. - -The complete phase is carried out in exactly the same way as during system -resume. - - -3. PCI Device Drivers and Power Management -========================================== - -3.1. Power Management Callbacks -------------------------------- -PCI device drivers participate in power management by providing callbacks to be -executed by the PCI subsystem's power management routines described above and by -controlling the runtime power management of their devices. - -At the time of this writing there are two ways to define power management -callbacks for a PCI device driver, the recommended one, based on using a -dev_pm_ops structure described in Documentation/driver-api/pm/devices.rst, and the -"legacy" one, in which the .suspend(), .suspend_late(), .resume_early(), and -.resume() callbacks from struct pci_driver are used. The legacy approach, -however, doesn't allow one to define runtime power management callbacks and is -not really suitable for any new drivers. Therefore it is not covered by this -document (refer to the source code to learn more about it). - -It is recommended that all PCI device drivers define a struct dev_pm_ops object -containing pointers to power management (PM) callbacks that will be executed by -the PCI subsystem's PM routines in various circumstances. A pointer to the -driver's struct dev_pm_ops object has to be assigned to the driver.pm field in -its struct pci_driver object. Once that has happened, the "legacy" PM callbacks -in struct pci_driver are ignored (even if they are not NULL). - -The PM callbacks in struct dev_pm_ops are not mandatory and if they are not -defined (i.e. the respective fields of struct dev_pm_ops are unset) the PCI -subsystem will handle the device in a simplified default manner. If they are -defined, though, they are expected to behave as described in the following -subsections. - -3.1.1. prepare() - -The prepare() callback is executed during system suspend, during hibernation -(when a hibernation image is about to be created), during power-off after -saving a hibernation image and during system restore, when a hibernation image -has just been loaded into memory. - -This callback is only necessary if the driver's device has children that in -general may be registered at any time. In that case the role of the prepare() -callback is to prevent new children of the device from being registered until -one of the resume_noirq(), thaw_noirq(), or restore_noirq() callbacks is run. - -In addition to that the prepare() callback may carry out some operations -preparing the device to be suspended, although it should not allocate memory -(if additional memory is required to suspend the device, it has to be -preallocated earlier, for example in a suspend/hibernate notifier as described -in Documentation/driver-api/pm/notifiers.rst). - -3.1.2. 
suspend()
-
-The suspend() callback is only executed during system suspend, after prepare()
-callbacks have been executed for all devices in the system.
-
-This callback is expected to quiesce the device and prepare it to be put into a
-low-power state by the PCI subsystem. It is not required (in fact, it is not
-even recommended) that a PCI driver's suspend() callback save the standard
-configuration registers of the device, prepare it for waking up the system, or
-put it into a low-power state. All of these operations can very well be taken
-care of by the PCI subsystem, without the driver's participation.
-
-However, in some rare cases it is convenient to carry out these operations in
-a PCI driver. Then, pci_save_state(), pci_prepare_to_sleep(), and
-pci_set_power_state() should be used to save the device's standard configuration
-registers, to prepare it for system wakeup (if necessary), and to put it into a
-low-power state, respectively. Moreover, if the driver calls pci_save_state(),
-the PCI subsystem will not execute either pci_prepare_to_sleep(), or
-pci_set_power_state() for its device, so the driver is then responsible for
-handling the device as appropriate (a sketch of this rare, driver-managed
-variant follows after subsection 3.1.4 below).
-
-While the suspend() callback is being executed, the driver's interrupt handler
-can be invoked to handle an interrupt from the device, so all suspend-related
-operations relying on the driver's ability to handle interrupts should be
-carried out in this callback.
-
-3.1.3. suspend_noirq()
-
-The suspend_noirq() callback is only executed during system suspend, after
-suspend() callbacks have been executed for all devices in the system and
-after device interrupts have been disabled by the PM core.
-
-The difference between suspend_noirq() and suspend() is that the driver's
-interrupt handler will not be invoked while suspend_noirq() is running. Thus
-suspend_noirq() can carry out operations that would cause race conditions to
-arise if they were performed in suspend().
-
-3.1.4. freeze()
-
-The freeze() callback is hibernation-specific and is executed in two situations,
-during hibernation, after prepare() callbacks have been executed for all devices
-in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory from persistent storage and the
-prepare() callbacks have been executed for all devices.
-
-The role of this callback is analogous to the role of the suspend() callback
-described above. In fact, they only need to be different in the rare cases when
-the driver takes the responsibility for putting the device into a low-power
-state.
-
-In those cases the freeze() callback should not prepare the device for system
-wakeup or put it into a low-power state. Still, either it or freeze_noirq()
-should save the device's standard configuration registers using pci_save_state().
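-
-For the rare drivers that do take over these operations, the division of labor
-described in subsections 3.1.2 and 3.1.4 might look like the following sketch
-(the mydrv_* names are hypothetical and the device-quiescing details are
-omitted):
-
-	static int mydrv_suspend(struct device *dev)
-	{
-		struct pci_dev *pdev = to_pci_dev(dev);
-
-		/* ... quiesce the device: stop DMA, mask interrupts ... */
-
-		/* Saving the state tells the PCI subsystem that the driver
-		 * takes care of wakeup and the low-power transition itself. */
-		pci_save_state(pdev);
-		pci_prepare_to_sleep(pdev);	/* arm wakeup, enter low power */
-
-		return 0;
-	}
-
-	static int mydrv_freeze(struct device *dev)
-	{
-		struct pci_dev *pdev = to_pci_dev(dev);
-
-		/* ... quiesce the device ... */
-
-		/* Only save the registers; no wakeup preparation and no
-		 * power state change while the hibernation image is made. */
-		pci_save_state(pdev);
-
-		return 0;
-	}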
-
-3.1.5. freeze_noirq()
-
-The freeze_noirq() callback is hibernation-specific. It is executed during
-hibernation, after prepare() and freeze() callbacks have been executed for all
-devices in preparation for the creation of a system image, and during restore,
-after a system image has been loaded into memory and after prepare() and
-freeze() callbacks have been executed for all devices. It is always executed
-after device interrupts have been disabled by the PM core.
-
-The role of this callback is analogous to the role of the suspend_noirq()
-callback described above and it is very rarely necessary to define
-freeze_noirq().
-
-The difference between freeze_noirq() and freeze() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.6. poweroff()
-
-The poweroff() callback is hibernation-specific. It is executed when the system
-is about to be powered off after saving a hibernation image to persistent
-storage. prepare() callbacks are executed for all devices before poweroff() is
-called.
-
-The role of this callback is analogous to the role of the suspend() and freeze()
-callbacks described above, although it does not need to save the contents of
-the device's registers. In particular, if the driver wants to put the device
-into a low-power state itself instead of allowing the PCI subsystem to do that,
-the poweroff() callback should use pci_prepare_to_sleep() and
-pci_set_power_state() to prepare the device for system wakeup and to put it
-into a low-power state, respectively, but it need not save the device's standard
-configuration registers.
-
-3.1.7. poweroff_noirq()
-
-The poweroff_noirq() callback is hibernation-specific. It is executed after
-poweroff() callbacks have been executed for all devices in the system.
-
-The role of this callback is analogous to the role of the suspend_noirq() and
-freeze_noirq() callbacks described above, but it does not need to save the
-contents of the device's registers.
-
-The difference between poweroff_noirq() and poweroff() is analogous to the
-difference between suspend_noirq() and suspend().
-
-3.1.8. resume_noirq()
-
-The resume_noirq() callback is only executed during system resume, after the
-PM core has enabled the non-boot CPUs. The driver's interrupt handler will not
-be invoked while resume_noirq() is running, so this callback can carry out
-operations that might race with the interrupt handler.
-
-Since the PCI subsystem unconditionally puts all devices into the full power
-state in the resume_noirq phase of system resume and restores their standard
-configuration registers, resume_noirq() is usually not necessary. In general
-it should only be used for performing operations that would lead to race
-conditions if carried out by resume().
-
-3.1.9. resume()
-
-The resume() callback is only executed during system resume, after
-resume_noirq() callbacks have been executed for all devices in the system and
-device interrupts have been enabled by the PM core.
-
-This callback is responsible for restoring the pre-suspend configuration of the
-device and bringing it back to the fully functional state. The device should be
-able to process I/O in a usual way after resume() has returned.
-
-3.1.10. thaw_noirq()
-
-The thaw_noirq() callback is hibernation-specific. It is executed after a
-system image has been created and the non-boot CPUs have been enabled by the PM
-core, in the thaw_noirq phase of hibernation. It also may be executed if the
-loading of a hibernation image fails during system restore (it is then executed
-after enabling the non-boot CPUs). The driver's interrupt handler will not be
-invoked while thaw_noirq() is running.
-
-The role of this callback is analogous to the role of resume_noirq(). The
-difference between these two callbacks is that thaw_noirq() is executed after
-freeze() and freeze_noirq(), so in general it does not need to modify the
-contents of the device's registers.
-
-3.1.11. thaw()
-
-The thaw() callback is hibernation-specific. It is executed after thaw_noirq()
-callbacks have been executed for all devices in the system and after device
-interrupts have been enabled by the PM core.
- -This callback is responsible for restoring the pre-freeze configuration of -the device, so that it will work in a usual way after thaw() has returned. - -3.1.12. restore_noirq() - -The restore_noirq() callback is hibernation-specific. It is executed in the -restore_noirq phase of hibernation, when the boot kernel has passed control to -the image kernel and the non-boot CPUs have been enabled by the image kernel's -PM core. - -This callback is analogous to resume_noirq() with the exception that it cannot -make any assumption on the previous state of the device, even if the BIOS (or -generally the platform firmware) is known to preserve that state over a -suspend-resume cycle. - -For the vast majority of PCI device drivers there is no difference between -resume_noirq() and restore_noirq(). - -3.1.13. restore() - -The restore() callback is hibernation-specific. It is executed after -restore_noirq() callbacks have been executed for all devices in the system and -after the PM core has enabled device drivers' interrupt handlers to be invoked. - -This callback is analogous to resume(), just like restore_noirq() is analogous -to resume_noirq(). Consequently, the difference between restore_noirq() and -restore() is analogous to the difference between resume_noirq() and resume(). - -For the vast majority of PCI device drivers there is no difference between -resume() and restore(). - -3.1.14. complete() - -The complete() callback is executed in the following situations: - - during system resume, after resume() callbacks have been executed for all - devices, - - during hibernation, before saving the system image, after thaw() callbacks - have been executed for all devices, - - during system restore, when the system is going back to its pre-hibernation - state, after restore() callbacks have been executed for all devices. -It also may be executed if the loading of a hibernation image into memory fails -(in that case it is run after thaw() callbacks have been executed for all -devices that have drivers in the boot kernel). - -This callback is entirely optional, although it may be necessary if the -prepare() callback performs operations that need to be reversed. - -3.1.15. runtime_suspend() - -The runtime_suspend() callback is specific to device runtime power management -(runtime PM). It is executed by the PM core's runtime PM framework when the -device is about to be suspended (i.e. quiesced and put into a low-power state) -at run time. - -This callback is responsible for freezing the device and preparing it to be -put into a low-power state, but it must allow the PCI subsystem to perform all -of the PCI-specific actions necessary for suspending the device. - -3.1.16. runtime_resume() - -The runtime_resume() callback is specific to device runtime PM. It is executed -by the PM core's runtime PM framework when the device is about to be resumed -(i.e. put into the full-power state and programmed to process I/O normally) at -run time. - -This callback is responsible for restoring the normal functionality of the -device after it has been put into the full-power state by the PCI subsystem. -The device is expected to be able to process I/O in the usual way after -runtime_resume() has returned. - -3.1.17. runtime_idle() - -The runtime_idle() callback is specific to device runtime PM. It is executed -by the PM core's runtime PM framework whenever it may be desirable to suspend -the device according to the PM core's information. 
In particular, it is
-automatically executed right after runtime_resume() has returned in case the
-resume of the device has happened as a result of a spurious event.
-
-This callback is optional, but if it is not implemented or if it returns 0, the
-PCI subsystem will call pm_runtime_suspend() for the device, which in turn will
-cause the driver's runtime_suspend() callback to be executed.
-
-3.1.18. Pointing Multiple Callback Pointers to One Routine
-
-Although in principle each of the callbacks described in the previous
-subsections can be defined as a separate function, it often is convenient to
-point two or more members of struct dev_pm_ops to the same routine. There are
-a few convenience macros that can be used for this purpose.
-
-The SIMPLE_DEV_PM_OPS macro declares a struct dev_pm_ops object with one
-suspend routine pointed to by the .suspend(), .freeze(), and .poweroff()
-members and one resume routine pointed to by the .resume(), .thaw(), and
-.restore() members. The other function pointers in this struct dev_pm_ops are
-unset.
-
-The UNIVERSAL_DEV_PM_OPS macro is similar to SIMPLE_DEV_PM_OPS, but it
-additionally sets the .runtime_resume() pointer to the same value as
-.resume() (and .thaw(), and .restore()) and the .runtime_suspend() pointer to
-the same value as .suspend() (and .freeze() and .poweroff()).
-
-The SET_SYSTEM_SLEEP_PM_OPS macro can be used inside a declaration of struct
-dev_pm_ops to indicate that one suspend routine is to be pointed to by the
-.suspend(), .freeze(), and .poweroff() members and one resume routine is to
-be pointed to by the .resume(), .thaw(), and .restore() members.
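-
-For example, a driver that only distinguishes between "suspend" and "resume"
-might wire up its callbacks and its struct pci_driver object as in the sketch
-below (the mydrv_* names are hypothetical):
-
-	static SIMPLE_DEV_PM_OPS(mydrv_pm_ops, mydrv_suspend, mydrv_resume);
-
-	static struct pci_driver mydrv_pci_driver = {
-		.name		= "mydrv",
-		.id_table	= mydrv_id_table,
-		.probe		= mydrv_probe,
-		.remove		= mydrv_remove,
-		.driver		= {
-			.pm	= &mydrv_pm_ops,
-		},
-	};
-
-With this, mydrv_suspend() also serves as the freeze() and poweroff() callback
-and mydrv_resume() as the thaw() and restore() callback, which is sufficient
-for drivers that don't need to treat hibernation specially.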
-
-3.1.19. Driver Flags for Power Management
-
-The PM core allows device drivers to set flags that influence the handling of
-power management for the devices by the core itself and by middle layer code
-including the PCI bus type. The flags should be set once at driver probe
-time with the help of the dev_pm_set_driver_flags() function and they should not
-be updated directly afterwards.
-
-The DPM_FLAG_NEVER_SKIP flag prevents the PM core from using the direct-complete
-mechanism, which allows device suspend/resume callbacks to be skipped if the
-device is in runtime suspend when the system suspend starts. That also affects
-all of the ancestors of the device, so this flag should only be used if
-absolutely necessary.
-
-The DPM_FLAG_SMART_PREPARE flag instructs the PCI bus type to only return a
-positive value from pci_pm_prepare() if the ->prepare callback provided by the
-driver of the device returns a positive value. That allows the driver to opt
-out from using the direct-complete mechanism dynamically.
-
-The DPM_FLAG_SMART_SUSPEND flag tells the PCI bus type that from the driver's
-perspective the device can be safely left in runtime suspend during system
-suspend. That causes pci_pm_suspend(), pci_pm_freeze() and pci_pm_poweroff()
-to skip resuming the device from runtime suspend unless there are PCI-specific
-reasons for doing that. Also, it causes pci_pm_suspend_late/noirq(),
-pci_pm_freeze_late/noirq() and pci_pm_poweroff_late/noirq() to return early
-if the device remains in runtime suspend in the beginning of the "late" phase
-of the system-wide transition under way. Moreover, if the device is in
-runtime suspend in pci_pm_resume_noirq() or pci_pm_restore_noirq(), its runtime
-power management status will be changed to "active" (as it is going to be put
-into D0 going forward), but if it is in runtime suspend in pci_pm_thaw_noirq(),
-the function will set the power.direct_complete flag for it (to make the PM core
-skip the subsequent "thaw" callbacks for it) and return.
-
-Setting the DPM_FLAG_LEAVE_SUSPENDED flag means that the driver prefers the
-device to be left in suspend after system-wide transitions to the working state.
-This flag is checked by the PM core, but the PCI bus type informs the PM core
-which devices may be left in suspend from its perspective (that happens during
-the "noirq" phase of system-wide suspend and analogous transitions). Next, it
-uses the dev_pm_may_skip_resume() helper to decide whether or not to return from
-pci_pm_resume_noirq() early, as the PM core will skip the remaining resume
-callbacks for the device during the transition under way and will set its
-runtime PM status to "suspended" if dev_pm_may_skip_resume() returns "true" for
-it.
-
-3.2. Device Runtime Power Management
-------------------------------------
-In addition to providing device power management callbacks, PCI device drivers
-are responsible for controlling the runtime power management (runtime PM) of
-their devices.
-
-The PCI device runtime PM is optional, but it is recommended that PCI device
-drivers implement it at least in the cases where there is a reliable way of
-verifying that the device is not used (like when the network cable is detached
-from an Ethernet adapter or there are no devices attached to a USB controller).
-
-To support PCI runtime PM the driver first needs to implement the
-runtime_suspend() and runtime_resume() callbacks. It also may need to implement
-the runtime_idle() callback to prevent the device from being suspended again
-immediately after the runtime_resume() callback has returned
-(alternatively, the runtime_suspend() callback will have to check if the
-device should really be suspended and return -EAGAIN if that is not the case).
-
-The runtime PM of PCI devices is enabled by default by the PCI core. PCI
-device drivers do not need to enable it and should not attempt to do so.
-However, it is blocked by pci_pm_init(), which runs the pm_runtime_forbid()
-helper function. In addition to that, the runtime PM usage counter of
-each PCI device is incremented by local_pci_probe() before executing the
-probe callback provided by the device's driver.
-
-If a PCI driver implements the runtime PM callbacks and intends to use the
-runtime PM framework provided by the PM core and the PCI subsystem, it needs
-to decrement the device's runtime PM usage counter in its probe callback
-function. If it doesn't do that, the counter will always be different from
-zero for the device and it will never be runtime-suspended. The simplest
-way to do that is by calling pm_runtime_put_noidle(), but if the driver
-wants to schedule an autosuspend right away, for example, it may call
-pm_runtime_put_autosuspend() instead for this purpose. Generally, it
-just needs to call a function that decrements the device's usage counter
-from its probe routine to make runtime PM work for the device.
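-
-For instance, a probe/remove pair honoring these rules might look like the
-sketch below (hypothetical mydrv_* names; device setup and teardown details
-omitted):
-
-	static int mydrv_probe(struct pci_dev *pdev,
-			       const struct pci_device_id *id)
-	{
-		int err;
-
-		err = pci_enable_device(pdev);
-		if (err)
-			return err;
-
-		/* ... map BARs, request the IRQ, initialize the device ... */
-
-		/* Balance the increment done by local_pci_probe() so that
-		 * runtime PM can work once user space allows it. */
-		pm_runtime_put_noidle(&pdev->dev);
-		return 0;
-	}
-
-	static void mydrv_remove(struct pci_dev *pdev)
-	{
-		/* Balance the decrement made in mydrv_probe(). */
-		pm_runtime_get_noresume(&pdev->dev);
-
-		/* ... tear down the device ... */
-		pci_disable_device(pdev);
-	}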
-
-It is important to remember that the driver's runtime_suspend() callback
-may be executed right after the usage counter has been decremented, because
-user space may already have run the pm_runtime_allow() helper function,
-which unblocks the runtime PM of the device, via sysfs, so the driver must
-be prepared to cope with that.
-
-The driver itself should not call pm_runtime_allow(), though. Instead, it
-should let user space or some platform-specific code do that (user space can
-do it via sysfs as stated above), but it must be prepared to handle the
-runtime PM of the device correctly as soon as pm_runtime_allow() is called
-(which may happen at any time, even before the driver is loaded).
-
-When the driver's remove callback runs, it has to balance the decrement of
-the device's runtime PM usage counter made at probe time. For this reason,
-if it has decremented the counter in its probe callback, it must run
-pm_runtime_get_noresume() in its remove callback. [Since the core carries
-out a runtime resume of the device and bumps up the device's usage counter
-before running the driver's remove callback, the runtime PM of the device
-is effectively disabled for the duration of the remove execution and all
-runtime PM helper functions incrementing the device's usage counter are
-then effectively equivalent to pm_runtime_get_noresume().]
-
-The runtime PM framework works by processing requests to suspend or resume
-devices, or to check if they are idle (in which cases it is reasonable to
-subsequently request that they be suspended). These requests are represented
-by work items put into the power management workqueue, pm_wq. Although there
-are a few situations in which power management requests are automatically
-queued by the PM core (for example, after processing a request to resume a
-device the PM core automatically queues a request to check if the device is
-idle), device drivers are generally responsible for queuing power management
-requests for their devices. For this purpose they should use the runtime PM
-helper functions provided by the PM core, discussed in
-Documentation/power/runtime_pm.txt.
-
-Devices can also be suspended and resumed synchronously, without placing a
-request into pm_wq. In the majority of cases this is also done by their
-drivers, using helper functions provided by the PM core for this purpose.
-
-For more information on the runtime PM of devices refer to
-Documentation/power/runtime_pm.txt.
-
-
-4. Resources
-============
-
-PCI Local Bus Specification, Rev. 3.0
-PCI Bus Power Management Interface Specification, Rev. 1.2
-Advanced Configuration and Power Interface (ACPI) Specification, Rev. 3.0b
-PCI Express Base Specification, Rev. 2.0
-Documentation/driver-api/pm/devices.rst
-Documentation/power/runtime_pm.txt
diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst
new file mode 100644
index 000000000000..945fc6d760c9
--- /dev/null
+++ b/Documentation/power/pm_qos_interface.rst
@@ -0,0 +1,225 @@
+===============================
+PM Quality Of Service Interface
+===============================
+
+This interface provides a kernel and user mode interface for registering
+performance expectations by drivers, subsystems and user space applications on
+one of the parameters.
+
+Two different PM QoS frameworks are available:
+1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput,
+memory_bandwidth.
+2. 
the per-device PM QoS framework, which provides the API to manage per-device
+latency constraints and PM QoS flags.
+
+Each parameter has defined units:
+
+ * latency: usec
+ * timeout: usec
+ * throughput: kbs (kilo bit / sec)
+ * memory bandwidth: mbs (mega bit / sec)
+
+
+1. PM QoS framework
+===================
+
+The infrastructure exposes multiple misc device nodes, one per implemented
+parameter. The set of parameters implemented is defined by pm_qos_power_init()
+and pm_qos_params.h. This is done because having the available parameters
+runtime configurable or changeable from a driver was seen as too easy to
+abuse.
+
+For each parameter a list of performance requests is maintained along with
+an aggregated target value. The aggregated target value is updated with
+changes to the request list or elements of the list. Typically the
+aggregated target value is simply the max or min of the request values held
+in the parameter list elements.
+Note: the aggregated target value is implemented as an atomic variable so that
+reading the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is simple:
+
+void pm_qos_add_request(handle, param_class, target_value):
+  Will insert an element into the list for that identified PM QoS class with the
+  target value. Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of pm_qos need to save the returned handle for future use in other
+  pm_qos API functions.
+
+void pm_qos_update_request(handle, new_target_value):
+  Will update the list element pointed to by the handle with the new target value
+  and recompute the new aggregated target, calling the notification tree if the
+  target is changed.
+
+void pm_qos_remove_request(handle):
+  Will remove the element. After removal it will update the aggregate target and
+  call the notification tree if the target was changed as a result of removing
+  the request.
+
+int pm_qos_request(param_class):
+  Returns the aggregated value for a given PM QoS class.
+
+int pm_qos_request_active(handle):
+  Returns whether the request is still active, i.e. it has not been removed from
+  a PM QoS class constraints list.
+
+int pm_qos_add_notifier(param_class, notifier):
+  Adds a notification callback function to the PM QoS class. The callback is
+  called when the aggregated value for the PM QoS class is changed.
+
+int pm_qos_remove_notifier(int param_class, notifier):
+  Removes the notification callback function for the PM QoS class.
+
+
+From user mode:
+
+Only processes can register a pm_qos request. To provide for automatic
+cleanup of a process, the interface requires the process to register its
+parameter requests in the following way:
+
+To register the default pm_qos target for the specific parameter, the process
+must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
+
+As long as the device node is held open that process has a registered
+request on the parameter.
+
+To change the requested target value the process needs to write an s32 value to
+the open device node. Alternatively the user mode program could write a hex
+string for the value using the 10 character long format, e.g. "0x12345678".
+This translates to a pm_qos_update_request call.
+
+To remove the user mode request for a target value simply close the device
+node.
+
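+
+For example, kernel code that wants to keep CPU/DMA latency low while its
+device is actively streaming might use the class interface described above as
+in the following sketch (the mydrv_* names are hypothetical; values are in
+microseconds)::
+
+  static struct pm_qos_request mydrv_qos_req;
+
+  static void mydrv_start_streaming(void)
+  {
+          /* keep CPU/DMA latency at or below 20 usec while streaming */
+          pm_qos_add_request(&mydrv_qos_req, PM_QOS_CPU_DMA_LATENCY, 20);
+  }
+
+  static void mydrv_stop_streaming(void)
+  {
+          /* drop the request when it is no longer needed */
+          pm_qos_remove_request(&mydrv_qos_req);
+  }
+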
+2. PM QoS per-device latency and flags framework
+================================================
+
+For each device, there are three lists of PM QoS requests. Two of them are
+maintained along with the aggregated targets of resume latency and active
+state latency tolerance (in microseconds) and the third one is for PM QoS flags.
+Values are updated in response to changes of the request list.
+
+The target values of resume latency and active state latency tolerance are
+simply the minimum of the request values held in the parameter list elements.
+The PM QoS flags aggregate value is the bitwise OR of all list elements'
+values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF.
+
+Note: The aggregated target values are implemented in such a way that reading
+the aggregated value does not require any locking mechanism.
+
+
+From kernel mode the use of this interface is the following:
+
+int dev_pm_qos_add_request(device, handle, type, value):
+  Will insert an element into the list for that identified device with the
+  target value. Upon change to this list the new target is recomputed and any
+  registered notifiers are called only if the target value is now different.
+  Clients of dev_pm_qos need to save the handle for future use in other
+  dev_pm_qos API functions.
+
+int dev_pm_qos_update_request(handle, new_value):
+  Will update the list element pointed to by the handle with the new target
+  value and recompute the new aggregated target, calling the notification
+  trees if the target is changed.
+
+int dev_pm_qos_remove_request(handle):
+  Will remove the element. After removal it will update the aggregate target
+  and call the notification trees if the target was changed as a result of
+  removing the request.
+
+s32 dev_pm_qos_read_value(device):
+  Returns the aggregated value for a given device's constraints list.
+
+enum pm_qos_flags_status dev_pm_qos_flags(device, mask)
+  Check PM QoS flags of the given device against the given mask of flags.
+  The meaning of the return values is as follows:
+
+	PM_QOS_FLAGS_ALL:
+		All flags from the mask are set
+	PM_QOS_FLAGS_SOME:
+		Some flags from the mask are set
+	PM_QOS_FLAGS_NONE:
+		No flags from the mask are set
+	PM_QOS_FLAGS_UNDEFINED:
+		The device's PM QoS structure has not been initialized
+		or the list of requests is empty.
+
+int dev_pm_qos_add_ancestor_request(dev, handle, type, value)
+  Add a PM QoS request for the first direct ancestor of the given device whose
+  power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests)
+  or whose power.set_latency_tolerance callback pointer is not NULL (for
+  DEV_PM_QOS_LATENCY_TOLERANCE requests).
+
+int dev_pm_qos_expose_latency_limit(device, value)
+  Add a request to the device's PM QoS list of resume latency constraints and
+  create a sysfs attribute pm_qos_resume_latency_us under the device's power
+  directory allowing user space to manipulate that request.
+
+void dev_pm_qos_hide_latency_limit(device)
+  Drop the request added by dev_pm_qos_expose_latency_limit() from the device's
+  PM QoS list of resume latency constraints and remove sysfs attribute
+  pm_qos_resume_latency_us from the device's power directory.
+
+int dev_pm_qos_expose_flags(device, value)
+  Add a request to the device's PM QoS list of flags and create sysfs attribute
+  pm_qos_no_power_off under the device's power directory allowing user space to
+  change the value of the PM_QOS_FLAG_NO_POWER_OFF flag.
+
+void dev_pm_qos_hide_flags(device)
+  Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS
+  list of flags and remove sysfs attribute pm_qos_no_power_off from the
+  device's power directory.
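+
+For instance, code managing a device might cap that device's resume latency
+with the calls above, along the lines of the following sketch (the req
+variable is hypothetical and error handling is omitted)::
+
+  static struct dev_pm_qos_request req;
+
+  /* do not let resume of this device take more than 100 usec */
+  dev_pm_qos_add_request(dev, &req, DEV_PM_QOS_RESUME_LATENCY, 100);
+
+  /* ... later, relax the constraint, and finally drop it ... */
+  dev_pm_qos_update_request(&req, 500);
+  dev_pm_qos_remove_request(&req);
+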
+
+Notification mechanisms:
+
+The per-device PM QoS framework has a per-device notification tree.
+
+int dev_pm_qos_add_notifier(device, notifier):
+  Adds a notification callback function for the device.
+  The callback is called when the aggregated value of the device constraints list
+  is changed (for resume latency device PM QoS only).
+
+int dev_pm_qos_remove_notifier(device, notifier):
+  Removes the notification callback function for the device.
+
+
+Active state latency tolerance
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This device PM QoS type is used to support systems in which hardware may switch
+to energy-saving operation modes on the fly. In those systems, if the operation
+mode chosen by the hardware attempts to save energy in an overly aggressive way,
+it may cause excess latencies to be visible to software, causing it to miss
+certain protocol requirements or target frame or sample rates, etc.
+
+If there is a latency tolerance control mechanism for a given device available
+to software, the .set_latency_tolerance callback in that device's dev_pm_info
+structure should be populated. The routine pointed to by it should implement
+whatever is necessary to transfer the effective requirement value to the
+hardware.
+
+Whenever the effective latency tolerance changes for the device, its
+.set_latency_tolerance() callback will be executed and the effective value will
+be passed to it. If that value is negative, which means that the list of
+latency tolerance requirements for the device is empty, the callback is expected
+to switch the underlying hardware latency tolerance control mechanism to an
+autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and
+the hardware supports a special "no requirement" setting, the callback is
+expected to use it. That allows software to prevent the hardware from
+automatically updating the device's latency tolerance in response to its power
+state changes (e.g. during transitions from D3cold to D0), which generally may
+be done in the autonomous latency tolerance control mode.
+
+If .set_latency_tolerance() is present for the device, sysfs attribute
+pm_qos_latency_tolerance_us will be present in the device's power directory.
+Then, user space can use that attribute to specify its latency tolerance
+requirement for the device, if any. Writing "any" to it means "no requirement,
+but do not let the hardware control latency tolerance" and writing "auto" to it
+allows the hardware to be switched to the autonomous mode if there are no other
+requirements from the kernel side in the device's list.
+
+Kernel code can use the functions described above along with the
+DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
+latency tolerance requirements for devices.
diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
deleted file mode 100644
index 19c5f7b1a7ba..000000000000
--- a/Documentation/power/pm_qos_interface.txt
+++ /dev/null
@@ -1,212 +0,0 @@
-PM Quality Of Service Interface.
-
-This interface provides a kernel and user mode interface for registering
-performance expectations by drivers, subsystems and user space applications on
-one of the parameters.
-
-Two different PM QoS frameworks are available:
-1. 
PM QoS classes for cpu_dma_latency, network_latency, network_throughput, -memory_bandwidth. -2. the per-device PM QoS framework provides the API to manage the per-device latency -constraints and PM QoS flags. - -Each parameters have defined units: - * latency: usec - * timeout: usec - * throughput: kbs (kilo bit / sec) - * memory bandwidth: mbs (mega bit / sec) - - -1. PM QoS framework - -The infrastructure exposes multiple misc device nodes one per implemented -parameter. The set of parameters implement is defined by pm_qos_power_init() -and pm_qos_params.h. This is done because having the available parameters -being runtime configurable or changeable from a driver was seen as too easy to -abuse. - -For each parameter a list of performance requests is maintained along with -an aggregated target value. The aggregated target value is updated with -changes to the request list or elements of the list. Typically the -aggregated target value is simply the max or min of the request values held -in the parameter list elements. -Note: the aggregated target value is implemented as an atomic variable so that -reading the aggregated value does not require any locking mechanism. - - -From kernel mode the use of this interface is simple: - -void pm_qos_add_request(handle, param_class, target_value): -Will insert an element into the list for that identified PM QoS class with the -target value. Upon change to this list the new target is recomputed and any -registered notifiers are called only if the target value is now different. -Clients of pm_qos need to save the returned handle for future use in other -pm_qos API functions. - -void pm_qos_update_request(handle, new_target_value): -Will update the list element pointed to by the handle with the new target value -and recompute the new aggregated target, calling the notification tree if the -target is changed. - -void pm_qos_remove_request(handle): -Will remove the element. After removal it will update the aggregate target and -call the notification tree if the target was changed as a result of removing -the request. - -int pm_qos_request(param_class): -Returns the aggregated value for a given PM QoS class. - -int pm_qos_request_active(handle): -Returns if the request is still active, i.e. it has not been removed from a -PM QoS class constraints list. - -int pm_qos_add_notifier(param_class, notifier): -Adds a notification callback function to the PM QoS class. The callback is -called when the aggregated value for the PM QoS class is changed. - -int pm_qos_remove_notifier(int param_class, notifier): -Removes the notification callback function for the PM QoS class. - - -From user mode: -Only processes can register a pm_qos request. To provide for automatic -cleanup of a process, the interface requires the process to register its -parameter requests in the following way: - -To register the default pm_qos target for the specific parameter, the process -must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] - -As long as the device node is held open that process has a registered -request on the parameter. - -To change the requested target value the process needs to write an s32 value to -the open device node. Alternatively the user mode program could write a hex -string for the value using 10 char long format e.g. "0x12345678". This -translates to a pm_qos_update_request call. - -To remove the user mode request for a target value simply close the device -node. - - -2. 
PM QoS per-device latency and flags framework - -For each device, there are three lists of PM QoS requests. Two of them are -maintained along with the aggregated targets of resume latency and active -state latency tolerance (in microseconds) and the third one is for PM QoS flags. -Values are updated in response to changes of the request list. - -The target values of resume latency and active state latency tolerance are -simply the minimum of the request values held in the parameter list elements. -The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements' -values. One device PM QoS flag is defined currently: PM_QOS_FLAG_NO_POWER_OFF. - -Note: The aggregated target values are implemented in such a way that reading -the aggregated value does not require any locking mechanism. - - -From kernel mode the use of this interface is the following: - -int dev_pm_qos_add_request(device, handle, type, value): -Will insert an element into the list for that identified device with the -target value. Upon change to this list the new target is recomputed and any -registered notifiers are called only if the target value is now different. -Clients of dev_pm_qos need to save the handle for future use in other -dev_pm_qos API functions. - -int dev_pm_qos_update_request(handle, new_value): -Will update the list element pointed to by the handle with the new target value -and recompute the new aggregated target, calling the notification trees if the -target is changed. - -int dev_pm_qos_remove_request(handle): -Will remove the element. After removal it will update the aggregate target and -call the notification trees if the target was changed as a result of removing -the request. - -s32 dev_pm_qos_read_value(device): -Returns the aggregated value for a given device's constraints list. - -enum pm_qos_flags_status dev_pm_qos_flags(device, mask) -Check PM QoS flags of the given device against the given mask of flags. -The meaning of the return values is as follows: - PM_QOS_FLAGS_ALL: All flags from the mask are set - PM_QOS_FLAGS_SOME: Some flags from the mask are set - PM_QOS_FLAGS_NONE: No flags from the mask are set - PM_QOS_FLAGS_UNDEFINED: The device's PM QoS structure has not been - initialized or the list of requests is empty. - -int dev_pm_qos_add_ancestor_request(dev, handle, type, value) -Add a PM QoS request for the first direct ancestor of the given device whose -power.ignore_children flag is unset (for DEV_PM_QOS_RESUME_LATENCY requests) -or whose power.set_latency_tolerance callback pointer is not NULL (for -DEV_PM_QOS_LATENCY_TOLERANCE requests). - -int dev_pm_qos_expose_latency_limit(device, value) -Add a request to the device's PM QoS list of resume latency constraints and -create a sysfs attribute pm_qos_resume_latency_us under the device's power -directory allowing user space to manipulate that request. - -void dev_pm_qos_hide_latency_limit(device) -Drop the request added by dev_pm_qos_expose_latency_limit() from the device's -PM QoS list of resume latency constraints and remove sysfs attribute -pm_qos_resume_latency_us from the device's power directory. - -int dev_pm_qos_expose_flags(device, value) -Add a request to the device's PM QoS list of flags and create sysfs attribute -pm_qos_no_power_off under the device's power directory allowing user space to -change the value of the PM_QOS_FLAG_NO_POWER_OFF flag. 
-
-void dev_pm_qos_hide_flags(device)
-Drop the request added by dev_pm_qos_expose_flags() from the device's PM QoS list
-of flags and remove sysfs attribute pm_qos_no_power_off from the device's power
-directory.
-
-Notification mechanisms:
-The per-device PM QoS framework has a per-device notification tree.
-
-int dev_pm_qos_add_notifier(device, notifier):
-Adds a notification callback function for the device.
-The callback is called when the aggregated value of the device constraints list
-is changed (for resume latency device PM QoS only).
-
-int dev_pm_qos_remove_notifier(device, notifier):
-Removes the notification callback function for the device.
-
-
-Active state latency tolerance
-
-This device PM QoS type is used to support systems in which hardware may switch
-to energy-saving operation modes on the fly. In those systems, if the operation
-mode chosen by the hardware attempts to save energy in an overly aggressive way,
-it may cause excess latencies to be visible to software, causing it to miss
-certain protocol requirements or target frame or sample rates etc.
-
-If there is a latency tolerance control mechanism for a given device available
-to software, the .set_latency_tolerance callback in that device's dev_pm_info
-structure should be populated. The routine pointed to by it is should implement
-whatever is necessary to transfer the effective requirement value to the
-hardware.
-
-Whenever the effective latency tolerance changes for the device, its
-.set_latency_tolerance() callback will be executed and the effective value will
-be passed to it. If that value is negative, which means that the list of
-latency tolerance requirements for the device is empty, the callback is expected
-to switch the underlying hardware latency tolerance control mechanism to an
-autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and
-the hardware supports a special "no requirement" setting, the callback is
-expected to use it. That allows software to prevent the hardware from
-automatically updating the device's latency tolerance in response to its power
-state changes (e.g. during transitions from D3cold to D0), which generally may
-be done in the autonomous latency tolerance control mode.
-
-If .set_latency_tolerance() is present for the device, sysfs attribute
-pm_qos_latency_tolerance_us will be present in the devivce's power directory.
-Then, user space can use that attribute to specify its latency tolerance
-requirement for the device, if any. Writing "any" to it means "no requirement,
-but do not let the hardware control latency tolerance" and writing "auto" to it
-allows the hardware to be switched to the autonomous mode if there are no other
-requirements from the kernel side in the device's list.
-
-Kernel code can use the functions described above along with the
-DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update
-latency tolerance requirements for devices.
diff --git a/Documentation/power/power_supply_class.rst b/Documentation/power/power_supply_class.rst
new file mode 100644
index 000000000000..3f2c3fe38a61
--- /dev/null
+++ b/Documentation/power/power_supply_class.rst
@@ -0,0 +1,282 @@
+========================
+Linux power supply class
+========================
+
+Synopsis
+~~~~~~~~
+The power supply class is used to represent battery, UPS, AC or DC power
+supply properties to user-space.
+
+It defines a core set of attributes, which should be applicable to (almost)
+every power supply out there.
Attributes are available via sysfs and uevent
+interfaces.
+
+Each attribute has a well defined meaning, down to the unit of measure
+used. While the attributes provided are believed to be universally
+applicable to any power supply, specific monitoring hardware may not be
+able to provide them all, so any of them may be skipped.
+
+The power supply class is extensible and allows drivers to define their own
+attributes. The core attribute set is subject to the standard Linux
+evolution (i.e. if some attribute turns out to be applicable to many power
+supply types or their drivers, it can be added to the core set).
+
+It also integrates with the LED framework, for the purpose of providing
+typically expected feedback of battery charging/fully charged status and
+AC/USB power supply online status. (Note that specific details of the
+indication (including whether to use it at all) are fully controllable by
+the user and/or specific machine defaults, per the design principles of the
+LED framework).
+
+
+Attributes/properties
+~~~~~~~~~~~~~~~~~~~~~
+The power supply class has a predefined set of attributes; this eliminates
+code duplication across drivers. The class insists on reusing its
+predefined attributes *and* their units.
+
+So, userspace gets a predictable set of attributes and their units for any
+kind of power supply, and can process/present them to a user in a
+consistent manner. Results for different power supplies and machines are
+also directly comparable.
+
+See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c
+for examples of how to declare and handle attributes.
+
+
+Units
+~~~~~
+Quoting include/linux/power_supply.h:
+
+  All voltages, currents, charges, energies, time and temperatures in µV,
+  µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise
+  stated. It's driver's job to convert its raw values to units in which
+  this class operates.
+
+
+Attributes/properties detailed
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++--------------------------------------------------------------------------+
+| **Charge/Energy/Capacity - how to not confuse**                          |
++--------------------------------------------------------------------------+
+| **Because both "charge" (µAh) and "energy" (µWh) represent the "capacity"|
+| of a battery, this class distinguishes these terms. Don't mix them!**    |
+|                                                                          |
+| - `CHARGE_*`                                                             |
+|   attributes represent capacity in µAh only.                             |
+| - `ENERGY_*`                                                             |
+|   attributes represent capacity in µWh only.                             |
+| - `CAPACITY`                                                             |
+|   attribute represents capacity in *percent*, from 0 to 100.             |
++--------------------------------------------------------------------------+
+
+Postfixes:
+
+_AVG
+  *hardware* averaged value, use it if your hardware is really able to
+  report averaged values.
+_NOW
+  momentary/instantaneous values.
+
+STATUS
+  this attribute represents the operating status (charging, full,
+  discharging (i.e. powering a load), etc.). This corresponds to
+  `BATTERY_STATUS_*` values, as defined in battery.h.
+
+CHARGE_TYPE
+  batteries can typically charge at different rates.
+  This defines trickle and fast charges. For batteries that
+  are already charged or discharging, 'n/a' can be displayed (or
+  'unknown', if the status is not known).
+
+AUTHENTIC
+  indicates whether the power supply (battery or charger) connected
+  to the platform is authentic (1) or non-authentic (0).
+
+HEALTH
+  represents the health of the battery; values correspond to
+  POWER_SUPPLY_HEALTH_*, defined in battery.h.
+
+VOLTAGE_OCV
+  open circuit voltage of the battery.
+
+VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN
+  design values for maximal and minimal power supply voltages.
+  Maximal/minimal means the voltage values when the battery is considered
+  "full"/"empty" under normal conditions. Yes, there is no direct relation
+  between voltage and battery capacity, but some dumb batteries use voltage
+  for a very approximate calculation of capacity. A battery driver can also
+  use this attribute just to inform userspace about the maximal and minimal
+  voltage thresholds of a given battery.
+
+VOLTAGE_MAX, VOLTAGE_MIN
+  same as _DESIGN voltage values except that these ones should be used
+  if the hardware can only guess (measure and retain) the thresholds of a
+  given power supply.
+
+VOLTAGE_BOOT
+  Reports the voltage measured during boot.
+
+CURRENT_BOOT
+  Reports the current measured during boot.
+
+CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN
+  design charge values, when the battery is considered full/empty.
+
+ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN
+  same as above but for energy.
+
+CHARGE_FULL, CHARGE_EMPTY
+  These attributes mean "last remembered value of charge when the battery
+  became full/empty". It also could mean "value of charge when the battery
+  is considered full/empty under given conditions (temperature, age)".
+  I.e. these attributes represent real thresholds, not design values.
+
+ENERGY_FULL, ENERGY_EMPTY
+  same as above but for energy.
+
+CHARGE_COUNTER
+  the current charge counter (in µAh). This could easily
+  be negative; there is no empty or full value. It is only useful for
+  relative, time-based measurements.
+
+PRECHARGE_CURRENT
+  the maximum charge current during the precharge phase of the charge
+  cycle (typically 20% of battery capacity).
+
+CHARGE_TERM_CURRENT
+  Charge termination current. The charge cycle terminates when the battery
+  voltage is above the recharge threshold, and the charge current is below
+  this setting (typically 10% of battery capacity).
+
+CONSTANT_CHARGE_CURRENT
+  constant charge current programmed by the charger.
+CONSTANT_CHARGE_CURRENT_MAX
+  maximum charge current supported by the power supply object.
+
+CONSTANT_CHARGE_VOLTAGE
+  constant charge voltage programmed by the charger.
+CONSTANT_CHARGE_VOLTAGE_MAX
+  maximum charge voltage supported by the power supply object.
+
+INPUT_CURRENT_LIMIT
+  input current limit programmed by the charger. Indicates
+  the current drawn from a charging source.
+
+CHARGE_CONTROL_LIMIT
+  current charge control limit setting.
+CHARGE_CONTROL_LIMIT_MAX
+  maximum charge control limit setting.
+
+CALIBRATE
+  battery or coulomb counter calibration status.
+
+CAPACITY
+  capacity in percent.
+CAPACITY_ALERT_MIN
+  minimum capacity alert value in percent.
+CAPACITY_ALERT_MAX
+  maximum capacity alert value in percent.
+CAPACITY_LEVEL
+  capacity level. This corresponds to POWER_SUPPLY_CAPACITY_LEVEL_*.
+
+TEMP
+  temperature of the power supply.
+TEMP_ALERT_MIN
+  minimum battery temperature alert.
+TEMP_ALERT_MAX
+  maximum battery temperature alert.
+TEMP_AMBIENT
+  ambient temperature.
+TEMP_AMBIENT_ALERT_MIN
+  minimum ambient temperature alert.
+TEMP_AMBIENT_ALERT_MAX
+  maximum ambient temperature alert.
+TEMP_MIN
+  minimum operable temperature.
+TEMP_MAX
+  maximum operable temperature.
+
+TIME_TO_EMPTY
+  seconds left for battery to be considered empty
+  (i.e. while battery powers a load)
+TIME_TO_FULL
+  seconds left for battery to be considered full
+  (i.e.
while battery is charging)
+
+
+Battery <-> external power supply interaction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Power supplies often act as supplies and supplicants at the same time.
+Batteries are a good example. So, batteries usually care if they're
+externally powered or not.
+
+For that case, the power supply class implements a notification mechanism
+for batteries.
+
+An external power supply (AC) lists its supplicants' (batteries') names in
+the "supplied_to" struct member, and each power_supply_changed() call
+issued by the external power supply will notify supplicants via the
+external_power_changed callback.
+
+
+Devicetree battery characteristics
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Drivers should call power_supply_get_battery_info() to obtain battery
+characteristics from a devicetree battery node, defined in
+Documentation/devicetree/bindings/power/supply/battery.txt. This is
+implemented in drivers/power/supply/bq27xxx_battery.c.
+
+Properties in struct power_supply_battery_info and their counterparts in the
+battery node have names corresponding to elements in enum power_supply_property,
+for naming consistency between sysfs attributes and battery node properties.
+
+
+QA
+~~
+
+Q:
+  Where is the POWER_SUPPLY_PROP_XYZ attribute?
+A:
+  If you cannot find an attribute suitable for your driver needs, feel free
+  to add it and send a patch along with your driver.
+
+  The attributes available currently are the ones currently provided by the
+  drivers written.
+
+  Good candidates to add in future: model/part#, cycle_time, manufacturer,
+  etc.
+
+
+Q:
+  I have some very specific attribute (e.g. battery color), should I add
+  this attribute to the standard ones?
+A:
+  Most likely, no. Such an attribute can be placed in the driver itself, if
+  it is useful. Of course, if the attribute in question is applicable to a
+  large set of batteries, provided by many drivers, and/or comes from
+  some general battery specification/standard, it may be a candidate to
+  be added to the core attribute set.
+
+
+Q:
+  Suppose my battery monitoring chip/firmware does not provide capacity
+  in percent, but provides charge_{now,full,empty}. Should I calculate
+  percentage capacity manually, inside the driver, and register a CAPACITY
+  attribute? The same question applies to time_to_empty/time_to_full.
+A:
+  Most likely, no. This class is designed to export properties which are
+  directly measurable by the specific hardware available.
+
+  Inferring unavailable properties using some heuristics or a mathematical
+  model is not a job for a battery driver. Such functionality
+  should be factored out, and in fact, apm_power, the driver to serve
+  the legacy APM API on top of the power supply class, uses a simple heuristic
+  of approximating remaining battery capacity based on its charge, current,
+  voltage and so on. But a full-fledged battery model is likely not a subject
+  for the kernel at all, as it would require floating point calculations to
+  deal with things like differential equations and Kalman filters. This is
+  better handled by batteryd/libbattery, yet to be written.
diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt
deleted file mode 100644
index 300d37896e51..000000000000
--- a/Documentation/power/power_supply_class.txt
+++ /dev/null
@@ -1,231 +0,0 @@
-Linux power supply class
-========================
-
-Synopsis
-~~~~~~~~
-Power supply class used to represent battery, UPS, AC or DC power supply
-properties to user-space.
- -It defines core set of attributes, which should be applicable to (almost) -every power supply out there. Attributes are available via sysfs and uevent -interfaces. - -Each attribute has well defined meaning, up to unit of measure used. While -the attributes provided are believed to be universally applicable to any -power supply, specific monitoring hardware may not be able to provide them -all, so any of them may be skipped. - -Power supply class is extensible, and allows to define drivers own attributes. -The core attribute set is subject to the standard Linux evolution (i.e. -if it will be found that some attribute is applicable to many power supply -types or their drivers, it can be added to the core set). - -It also integrates with LED framework, for the purpose of providing -typically expected feedback of battery charging/fully charged status and -AC/USB power supply online status. (Note that specific details of the -indication (including whether to use it at all) are fully controllable by -user and/or specific machine defaults, per design principles of LED -framework). - - -Attributes/properties -~~~~~~~~~~~~~~~~~~~~~ -Power supply class has predefined set of attributes, this eliminates code -duplication across drivers. Power supply class insist on reusing its -predefined attributes *and* their units. - -So, userspace gets predictable set of attributes and their units for any -kind of power supply, and can process/present them to a user in consistent -manner. Results for different power supplies and machines are also directly -comparable. - -See drivers/power/supply/ds2760_battery.c and drivers/power/supply/pda_power.c -for the example how to declare and handle attributes. - - -Units -~~~~~ -Quoting include/linux/power_supply.h: - - All voltages, currents, charges, energies, time and temperatures in µV, - µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise - stated. It's driver's job to convert its raw values to units in which - this class operates. - - -Attributes/properties detailed -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -~ ~ ~ ~ ~ ~ ~ Charge/Energy/Capacity - how to not confuse ~ ~ ~ ~ ~ ~ ~ -~ ~ -~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity" ~ -~ of battery, this class distinguish these terms. Don't mix them! ~ -~ ~ -~ CHARGE_* attributes represents capacity in µAh only. ~ -~ ENERGY_* attributes represents capacity in µWh only. ~ -~ CAPACITY attribute represents capacity in *percents*, from 0 to 100. ~ -~ ~ -~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ - -Postfixes: -_AVG - *hardware* averaged value, use it if your hardware is really able to -report averaged values. -_NOW - momentary/instantaneous values. - -STATUS - this attribute represents operating status (charging, full, -discharging (i.e. powering a load), etc.). This corresponds to -BATTERY_STATUS_* values, as defined in battery.h. - -CHARGE_TYPE - batteries can typically charge at different rates. -This defines trickle and fast charges. For batteries that -are already charged or discharging, 'n/a' can be displayed (or -'unknown', if the status is not known). - -AUTHENTIC - indicates the power supply (battery or charger) connected -to the platform is authentic(1) or non authentic(0). - -HEALTH - represents health of the battery, values corresponds to -POWER_SUPPLY_HEALTH_*, defined in battery.h. - -VOLTAGE_OCV - open circuit voltage of the battery. - -VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and -minimal power supply voltages. 
Maximal/minimal means values of voltages -when battery considered "full"/"empty" at normal conditions. Yes, there is -no direct relation between voltage and battery capacity, but some dumb -batteries use voltage for very approximated calculation of capacity. -Battery driver also can use this attribute just to inform userspace -about maximal and minimal voltage thresholds of a given battery. - -VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that -these ones should be used if hardware could only guess (measure and -retain) the thresholds of a given power supply. - -VOLTAGE_BOOT - Reports the voltage measured during boot - -CURRENT_BOOT - Reports the current measured during boot - -CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when -battery considered full/empty. - -ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy. - -CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value -of charge when battery became full/empty". It also could mean "value of -charge when battery considered full/empty at given conditions (temperature, -age)". I.e. these attributes represents real thresholds, not design values. - -ENERGY_FULL, ENERGY_EMPTY - same as above but for energy. - -CHARGE_COUNTER - the current charge counter (in µAh). This could easily -be negative; there is no empty or full value. It is only useful for -relative, time-based measurements. - -PRECHARGE_CURRENT - the maximum charge current during precharge phase -of charge cycle (typically 20% of battery capacity). -CHARGE_TERM_CURRENT - Charge termination current. The charge cycle -terminates when battery voltage is above recharge threshold, and charge -current is below this setting (typically 10% of battery capacity). - -CONSTANT_CHARGE_CURRENT - constant charge current programmed by charger. -CONSTANT_CHARGE_CURRENT_MAX - maximum charge current supported by the -power supply object. - -CONSTANT_CHARGE_VOLTAGE - constant charge voltage programmed by charger. -CONSTANT_CHARGE_VOLTAGE_MAX - maximum charge voltage supported by the -power supply object. - -INPUT_CURRENT_LIMIT - input current limit programmed by charger. Indicates -the current drawn from a charging source. - -CHARGE_CONTROL_LIMIT - current charge control limit setting -CHARGE_CONTROL_LIMIT_MAX - maximum charge control limit setting - -CALIBRATE - battery or coulomb counter calibration status - -CAPACITY - capacity in percents. -CAPACITY_ALERT_MIN - minimum capacity alert value in percents. -CAPACITY_ALERT_MAX - maximum capacity alert value in percents. -CAPACITY_LEVEL - capacity level. This corresponds to -POWER_SUPPLY_CAPACITY_LEVEL_*. - -TEMP - temperature of the power supply. -TEMP_ALERT_MIN - minimum battery temperature alert. -TEMP_ALERT_MAX - maximum battery temperature alert. -TEMP_AMBIENT - ambient temperature. -TEMP_AMBIENT_ALERT_MIN - minimum ambient temperature alert. -TEMP_AMBIENT_ALERT_MAX - maximum ambient temperature alert. -TEMP_MIN - minimum operatable temperature -TEMP_MAX - maximum operatable temperature - -TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e. -while battery powers a load) -TIME_TO_FULL - seconds left for battery to be considered full (i.e. -while battery is charging) - - -Battery <-> external power supply interaction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Often power supplies are acting as supplies and supplicants at the same -time. Batteries are good example. So, batteries usually care if they're -externally powered or not. 
- -For that case, power supply class implements notification mechanism for -batteries. - -External power supply (AC) lists supplicants (batteries) names in -"supplied_to" struct member, and each power_supply_changed() call -issued by external power supply will notify supplicants via -external_power_changed callback. - - -Devicetree battery characteristics -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Drivers should call power_supply_get_battery_info() to obtain battery -characteristics from a devicetree battery node, defined in -Documentation/devicetree/bindings/power/supply/battery.txt. This is -implemented in drivers/power/supply/bq27xxx_battery.c. - -Properties in struct power_supply_battery_info and their counterparts in the -battery node have names corresponding to elements in enum power_supply_property, -for naming consistency between sysfs attributes and battery node properties. - - -QA -~~ -Q: Where is POWER_SUPPLY_PROP_XYZ attribute? -A: If you cannot find attribute suitable for your driver needs, feel free - to add it and send patch along with your driver. - - The attributes available currently are the ones currently provided by the - drivers written. - - Good candidates to add in future: model/part#, cycle_time, manufacturer, - etc. - - -Q: I have some very specific attribute (e.g. battery color), should I add - this attribute to standard ones? -A: Most likely, no. Such attribute can be placed in the driver itself, if - it is useful. Of course, if the attribute in question applicable to - large set of batteries, provided by many drivers, and/or comes from - some general battery specification/standard, it may be a candidate to - be added to the core attribute set. - - -Q: Suppose, my battery monitoring chip/firmware does not provides capacity - in percents, but provides charge_{now,full,empty}. Should I calculate - percentage capacity manually, inside the driver, and register CAPACITY - attribute? The same question about time_to_empty/time_to_full. -A: Most likely, no. This class is designed to export properties which are - directly measurable by the specific hardware available. - - Inferring not available properties using some heuristics or mathematical - model is not subject of work for a battery driver. Such functionality - should be factored out, and in fact, apm_power, the driver to serve - legacy APM API on top of power supply class, uses a simple heuristic of - approximating remaining battery capacity based on its charge, current, - voltage and so on. But full-fledged battery model is likely not subject - for kernel at all, as it would require floating point calculation to deal - with things like differential equations and Kalman filters. This is - better be handled by batteryd/libbattery, yet to be written. diff --git a/Documentation/power/powercap/powercap.rst b/Documentation/power/powercap/powercap.rst new file mode 100644 index 000000000000..7ae3b44c7624 --- /dev/null +++ b/Documentation/power/powercap/powercap.rst @@ -0,0 +1,257 @@ +======================= +Power Capping Framework +======================= + +The power capping framework provides a consistent interface between the kernel +and the user space that allows power capping drivers to expose the settings to +user space in a uniform way. + +Terminology +=========== + +The framework exposes power capping devices to user space via sysfs in the +form of a tree of objects. The objects at the root level of the tree represent +'control types', which correspond to different methods of power capping. 
For +example, the intel-rapl control type represents the Intel "Running Average +Power Limit" (RAPL) technology, whereas the 'idle-injection' control type +corresponds to the use of idle injection for controlling power. + +Power zones represent different parts of the system, which can be controlled and +monitored using the power capping method determined by the control type the +given zone belongs to. They each contain attributes for monitoring power, as +well as controls represented in the form of power constraints. If the parts of +the system represented by different power zones are hierarchical (that is, one +bigger part consists of multiple smaller parts that each have their own power +controls), those power zones may also be organized in a hierarchy with one +parent power zone containing multiple subzones and so on to reflect the power +control topology of the system. In that case, it is possible to apply power +capping to a set of devices together using the parent power zone and if more +fine grained control is required, it can be applied through the subzones. + + +Example sysfs interface tree:: + + /sys/devices/virtual/powercap + └──intel-rapl + ├──intel-rapl:0 + │   ├──constraint_0_name + │   ├──constraint_0_power_limit_uw + │   ├──constraint_0_time_window_us + │   ├──constraint_1_name + │   ├──constraint_1_power_limit_uw + │   ├──constraint_1_time_window_us + │   ├──device -> ../../intel-rapl + │   ├──energy_uj + │   ├──intel-rapl:0:0 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:0 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──intel-rapl:0:1 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:0 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──max_energy_range_uj + │   ├──max_power_range_uw + │   ├──name + │   ├──enabled + │   ├──power + │   │   ├──async + │   │   [] + │   ├──subsystem -> ../../../../../class/power_cap + │   ├──enabled + │   ├──uevent + ├──intel-rapl:1 + │   ├──constraint_0_name + │   ├──constraint_0_power_limit_uw + │   ├──constraint_0_time_window_us + │   ├──constraint_1_name + │   ├──constraint_1_power_limit_uw + │   ├──constraint_1_time_window_us + │   ├──device -> ../../intel-rapl + │   ├──energy_uj + │   ├──intel-rapl:1:0 + │   │   ├──constraint_0_name + │   │   ├──constraint_0_power_limit_uw + │   │   ├──constraint_0_time_window_us + │   │   ├──constraint_1_name + │   │   ├──constraint_1_power_limit_uw + │   │   ├──constraint_1_time_window_us + │   │   ├──device -> ../../intel-rapl:1 + │   │   ├──energy_uj + │   │   ├──max_energy_range_uj + │   │   ├──name + │   │   ├──enabled + │   │   ├──power + │   │   │   ├──async + │   │   │   [] + │   │   ├──subsystem -> ../../../../../../class/power_cap + │   │   └──uevent + │   ├──intel-rapl:1:1 + │   │   
├──constraint_0_name
+ │   │   ├──constraint_0_power_limit_uw
+ │   │   ├──constraint_0_time_window_us
+ │   │   ├──constraint_1_name
+ │   │   ├──constraint_1_power_limit_uw
+ │   │   ├──constraint_1_time_window_us
+ │   │   ├──device -> ../../intel-rapl:1
+ │   │   ├──energy_uj
+ │   │   ├──max_energy_range_uj
+ │   │   ├──name
+ │   │   ├──enabled
+ │   │   ├──power
+ │   │   │   ├──async
+ │   │   │   []
+ │   │   ├──subsystem -> ../../../../../../class/power_cap
+ │   │   └──uevent
+ │   ├──max_energy_range_uj
+ │   ├──max_power_range_uw
+ │   ├──name
+ │   ├──enabled
+ │   ├──power
+ │   │   ├──async
+ │   │   []
+ │   ├──subsystem -> ../../../../../class/power_cap
+ │   ├──uevent
+ ├──power
+ │   ├──async
+ │   []
+ ├──subsystem -> ../../../../class/power_cap
+ ├──enabled
+ └──uevent
+
+The above example illustrates a case in which the Intel RAPL technology,
+available in Intel® 64 and IA-32 Processor Architectures, is used. There is one
+control type called intel-rapl which contains two power zones, intel-rapl:0 and
+intel-rapl:1, representing CPU packages. Each of these power zones contains
+two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the
+"core" and the "uncore" parts of the given CPU package, respectively. All of
+the zones and subzones contain energy monitoring attributes (energy_uj,
+max_energy_range_uj) and constraint attributes (constraint_*) allowing controls
+to be applied (the constraints in the 'package' power zones apply to the whole
+CPU packages and the subzone constraints only apply to the respective parts of
+the given package individually). Since Intel RAPL doesn't provide instantaneous
+power value, there is no power_uw attribute.
+
+In addition to that, each power zone contains a name attribute, allowing the
+part of the system represented by that zone to be identified.
+For example::
+
+  cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name
+  package-0
+
+The Intel RAPL technology allows two constraints, short term and long term,
+with two different time windows to be applied to each power zone. Thus for
+each zone there are 2 attributes representing the constraint names, 2 power
+limits and 2 attributes representing the sizes of the time windows. The
+constraint_j_* attributes correspond to the j-th constraint (j = 0, 1).
+
+For example::
+
+  constraint_0_name
+  constraint_0_power_limit_uw
+  constraint_0_time_window_us
+  constraint_1_name
+  constraint_1_power_limit_uw
+  constraint_1_time_window_us
+
+Power Zone Attributes
+=====================
+
+Monitoring attributes
+---------------------
+
+energy_uj (rw)
+  Current energy counter in micro joules. Write "0" to reset.
+  If the counter cannot be reset, then this attribute is read only.
+
+max_energy_range_uj (ro)
+  Range of the above energy counter in micro-joules.
+
+power_uw (ro)
+  Current power in micro watts.
+
+max_power_range_uw (ro)
+  Range of the above power value in micro-watts.
+
+name (ro)
+  Name of this power zone.
+
+It is possible that some domains have both power ranges and energy counter ranges;
+however, only one is mandatory.
+
+Constraints
+-----------
+
+constraint_X_power_limit_uw (rw)
+  Power limit in micro watts, which should be applicable for the
+  time window specified by "constraint_X_time_window_us".
+
+constraint_X_time_window_us (rw)
+  Time window in micro seconds.
+
+constraint_X_name (ro)
+  An optional name of the constraint.
+
+constraint_X_max_power_uw (ro)
+  Maximum allowed power in micro watts.
+
+constraint_X_min_power_uw (ro)
+  Minimum allowed power in micro watts.
+
+constraint_X_max_time_window_us (ro)
+  Maximum allowed time window in micro seconds.
+
+constraint_X_min_time_window_us (ro)
+  Minimum allowed time window in micro seconds.
+
+Except for power_limit_uw and time_window_us, the other fields are optional.
+
+Common zone and control type attributes
+---------------------------------------
+
+enabled (rw): Enable/Disable controls at zone level or for all zones using
+a control type.
+
+Power Cap Client Driver Interface
+=================================
+
+The API summary:
+
+Call powercap_register_control_type() to register a control type object.
+Call powercap_register_zone() to register a power zone (under a given
+control type), either as a top-level power zone or as a subzone of another
+power zone registered earlier.
+The number of constraints in a power zone and the corresponding callbacks have
+to be defined prior to calling powercap_register_zone() to register that zone.
+
+To free a power zone, call powercap_unregister_zone().
+To free a control type object, call powercap_unregister_control_type().
+The detailed API can be generated using kernel-doc on include/linux/powercap.h.
diff --git a/Documentation/power/powercap/powercap.txt b/Documentation/power/powercap/powercap.txt
deleted file mode 100644
index 1e6ef164e07a..000000000000
--- a/Documentation/power/powercap/powercap.txt
+++ /dev/null
@@ -1,236 +0,0 @@
-Power Capping Framework
-==================================
-
-The power capping framework provides a consistent interface between the kernel
-and the user space that allows power capping drivers to expose the settings to
-user space in a uniform way.
-
-Terminology
-=========================
-The framework exposes power capping devices to user space via sysfs in the
-form of a tree of objects. The objects at the root level of the tree represent
-'control types', which correspond to different methods of power capping. For
-example, the intel-rapl control type represents the Intel "Running Average
-Power Limit" (RAPL) technology, whereas the 'idle-injection' control type
-corresponds to the use of idle injection for controlling power.
-
-Power zones represent different parts of the system, which can be controlled and
-monitored using the power capping method determined by the control type the
-given zone belongs to. They each contain attributes for monitoring power, as
-well as controls represented in the form of power constraints. If the parts of
-the system represented by different power zones are hierarchical (that is, one
-bigger part consists of multiple smaller parts that each have their own power
-controls), those power zones may also be organized in a hierarchy with one
-parent power zone containing multiple subzones and so on to reflect the power
-control topology of the system. In that case, it is possible to apply power
-capping to a set of devices together using the parent power zone and if more
-fine grained control is required, it can be applied through the subzones.
-
-
-Example sysfs interface tree:
-
-/sys/devices/virtual/powercap
-??? intel-rapl
- ??? intel-rapl:0
- ?   ??? constraint_0_name
- ?   ??? constraint_0_power_limit_uw
- ?   ??? constraint_0_time_window_us
- ?   ??? constraint_1_name
- ?   ??? constraint_1_power_limit_uw
- ?   ??? constraint_1_time_window_us
- ?   ??? device -> ../../intel-rapl
- ?   ??? energy_uj
- ?   ??? intel-rapl:0:0
- ?   ?   ??? constraint_0_name
- ?   ?   ??? constraint_0_power_limit_uw
- ?   ?   ??? 
constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:0 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? intel-rapl:0:1 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:0 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? max_energy_range_uj - ?   ??? max_power_range_uw - ?   ??? name - ?   ??? enabled - ?   ??? power - ?   ?   ??? async - ?   ?   [] - ?   ??? subsystem -> ../../../../../class/power_cap - ?   ??? enabled - ?   ??? uevent - ??? intel-rapl:1 - ?   ??? constraint_0_name - ?   ??? constraint_0_power_limit_uw - ?   ??? constraint_0_time_window_us - ?   ??? constraint_1_name - ?   ??? constraint_1_power_limit_uw - ?   ??? constraint_1_time_window_us - ?   ??? device -> ../../intel-rapl - ?   ??? energy_uj - ?   ??? intel-rapl:1:0 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:1 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? intel-rapl:1:1 - ?   ?   ??? constraint_0_name - ?   ?   ??? constraint_0_power_limit_uw - ?   ?   ??? constraint_0_time_window_us - ?   ?   ??? constraint_1_name - ?   ?   ??? constraint_1_power_limit_uw - ?   ?   ??? constraint_1_time_window_us - ?   ?   ??? device -> ../../intel-rapl:1 - ?   ?   ??? energy_uj - ?   ?   ??? max_energy_range_uj - ?   ?   ??? name - ?   ?   ??? enabled - ?   ?   ??? power - ?   ?   ?   ??? async - ?   ?   ?   [] - ?   ?   ??? subsystem -> ../../../../../../class/power_cap - ?   ?   ??? uevent - ?   ??? max_energy_range_uj - ?   ??? max_power_range_uw - ?   ??? name - ?   ??? enabled - ?   ??? power - ?   ?   ??? async - ?   ?   [] - ?   ??? subsystem -> ../../../../../class/power_cap - ?   ??? uevent - ??? power - ?   ??? async - ?   [] - ??? subsystem -> ../../../../class/power_cap - ??? enabled - ??? uevent - -The above example illustrates a case in which the Intel RAPL technology, -available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one -control type called intel-rapl which contains two power zones, intel-rapl:0 and -intel-rapl:1, representing CPU packages. Each of these power zones contains -two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the -"core" and the "uncore" parts of the given CPU package, respectively. 
All of -the zones and subzones contain energy monitoring attributes (energy_uj, -max_energy_range_uj) and constraint attributes (constraint_*) allowing controls -to be applied (the constraints in the 'package' power zones apply to the whole -CPU packages and the subzone constraints only apply to the respective parts of -the given package individually). Since Intel RAPL doesn't provide instantaneous -power value, there is no power_uw attribute. - -In addition to that, each power zone contains a name attribute, allowing the -part of the system represented by that zone to be identified. -For example: - -cat /sys/class/power_cap/intel-rapl/intel-rapl:0/name -package-0 - -The Intel RAPL technology allows two constraints, short term and long term, -with two different time windows to be applied to each power zone. Thus for -each zone there are 2 attributes representing the constraint names, 2 power -limits and 2 attributes representing the sizes of the time windows. Such that, -constraint_j_* attributes correspond to the jth constraint (j = 0,1). - -For example: - constraint_0_name - constraint_0_power_limit_uw - constraint_0_time_window_us - constraint_1_name - constraint_1_power_limit_uw - constraint_1_time_window_us - -Power Zone Attributes -================================= -Monitoring attributes ----------------------- - -energy_uj (rw): Current energy counter in micro joules. Write "0" to reset. -If the counter can not be reset, then this attribute is read only. - -max_energy_range_uj (ro): Range of the above energy counter in micro-joules. - -power_uw (ro): Current power in micro watts. - -max_power_range_uw (ro): Range of the above power value in micro-watts. - -name (ro): Name of this power zone. - -It is possible that some domains have both power ranges and energy counter ranges; -however, only one is mandatory. - -Constraints ----------------- -constraint_X_power_limit_uw (rw): Power limit in micro watts, which should be -applicable for the time window specified by "constraint_X_time_window_us". - -constraint_X_time_window_us (rw): Time window in micro seconds. - -constraint_X_name (ro): An optional name of the constraint - -constraint_X_max_power_uw(ro): Maximum allowed power in micro watts. - -constraint_X_min_power_uw(ro): Minimum allowed power in micro watts. - -constraint_X_max_time_window_us(ro): Maximum allowed time window in micro seconds. - -constraint_X_min_time_window_us(ro): Minimum allowed time window in micro seconds. - -Except power_limit_uw and time_window_us other fields are optional. - -Common zone and control type attributes ----------------------------------------- -enabled (rw): Enable/Disable controls at zone level or for all zones using -a control type. - -Power Cap Client Driver Interface -================================== -The API summary: - -Call powercap_register_control_type() to register control type object. -Call powercap_register_zone() to register a power zone (under a given -control type), either as a top-level power zone or as a subzone of another -power zone registered earlier. -The number of constraints in a power zone and the corresponding callbacks have -to be defined prior to calling powercap_register_zone() to register that zone. - -To Free a power zone call powercap_unregister_zone(). -To free a control type object call powercap_unregister_control_type(). -Detailed API can be generated using kernel-doc on include/linux/powercap.h. 
diff --git a/Documentation/power/regulator/consumer.rst b/Documentation/power/regulator/consumer.rst
new file mode 100644
index 000000000000..0cd8cc1275a7
--- /dev/null
+++ b/Documentation/power/regulator/consumer.rst
@@ -0,0 +1,229 @@
+===================================
+Regulator Consumer Driver Interface
+===================================
+
+This text describes the regulator interface for consumer device drivers.
+Please see overview.txt for a description of the terms used in this text.
+
+
+1. Consumer Regulator Access (static & dynamic drivers)
+=======================================================
+
+A consumer driver can get access to its supply regulator by calling ::
+
+    regulator = regulator_get(dev, "Vcc");
+
+The consumer passes in its struct device pointer and power supply ID. The core
+then finds the correct regulator by consulting a machine specific lookup table.
+If the lookup is successful then this call will return a pointer to the struct
+regulator that supplies this consumer.
+
+To release the regulator the consumer driver should call ::
+
+    regulator_put(regulator);
+
+Consumers can be supplied by more than one regulator, e.g. a codec consumer
+with analog and digital supplies ::
+
+    digital = regulator_get(dev, "Vcc"); /* digital core */
+    analog = regulator_get(dev, "Avdd"); /* analog */
+
+The regulator access functions regulator_get() and regulator_put() will
+usually be called in your device driver's probe() and remove() respectively.
+
+
+2. Regulator Output Enable & Disable (static & dynamic drivers)
+===============================================================
+
+
+A consumer can enable its power supply by calling::
+
+    int regulator_enable(regulator);
+
+NOTE:
+  The supply may already be enabled before regulator_enable() is called.
+  This may happen if the consumer shares the regulator or the regulator has been
+  previously enabled by bootloader or kernel board initialization code.
+
+A consumer can determine if a regulator is enabled by calling::
+
+    int regulator_is_enabled(regulator);
+
+This will return > zero when the regulator is enabled.
+
+
+A consumer can disable its supply when no longer needed by calling::
+
+    int regulator_disable(regulator);
+
+NOTE:
+  This may not disable the supply if it's shared with other consumers. The
+  regulator will only be disabled when the enabled reference count is zero.
+
+Finally, a regulator can be forcefully disabled in the case of an emergency::
+
+    int regulator_force_disable(regulator);
+
+NOTE:
+  this will immediately and forcefully shut down the regulator output. All
+  consumers will be powered off.
+
+
+3. Regulator Voltage Control & Status (dynamic drivers)
+=======================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+voltage to match system operating points. e.g. CPUfreq drivers can scale
+voltage along with frequency to save power, SD drivers may need to select the
+correct card voltage, etc.
+
+Consumers can control their supply voltage by calling::
+
+    int regulator_set_voltage(regulator, min_uV, max_uV);
+
+Where min_uV and max_uV are the minimum and maximum acceptable voltages in
+microvolts.
+
+NOTE: this can be called when the regulator is enabled or disabled. If called
+when enabled, then the voltage changes instantly, otherwise the voltage
+configuration changes and the voltage is physically set when the regulator is
+next enabled.
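+
+As a rough sketch of how these calls combine in practice (the supply name
+"Vcc" and the voltage band below are illustrative assumptions, not taken
+from any particular board, and error handling is abbreviated)::
+
+    struct regulator *supply;
+    int ret;
+
+    supply = regulator_get(dev, "Vcc");
+    if (IS_ERR(supply))
+        return PTR_ERR(supply);
+
+    /* accept any supply voltage within a 1.2V +/- 5% band */
+    ret = regulator_set_voltage(supply, 1140000, 1260000);
+    if (ret == 0)
+        ret = regulator_enable(supply);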
+
+The regulator's configured voltage output can be found by calling::
+
+    int regulator_get_voltage(regulator);
+
+NOTE:
+  get_voltage() will return the configured output voltage whether the
+  regulator is enabled or disabled and should NOT be used to determine regulator
+  output state. However, this can be used in conjunction with is_enabled() to
+  determine the regulator physical output voltage.
+
+
+4. Regulator Current Limit Control & Status (dynamic drivers)
+=============================================================
+
+Some consumer drivers need to be able to dynamically change their supply
+current limit to match system operating points. e.g. an LCD backlight driver
+can change the current limit to vary the backlight brightness, USB drivers may
+want to set the limit to 500mA when supplying power.
+
+Consumers can control their supply current limit by calling::
+
+    int regulator_set_current_limit(regulator, min_uA, max_uA);
+
+Where min_uA and max_uA are the minimum and maximum acceptable current limit in
+microamps.
+
+NOTE:
+  this can be called when the regulator is enabled or disabled. If called
+  when enabled, then the current limit changes instantly, otherwise the current
+  limit configuration changes and the current limit is physically set when the
+  regulator is next enabled.
+
+A regulator's current limit can be found by calling::
+
+    int regulator_get_current_limit(regulator);
+
+NOTE:
+  get_current_limit() will return the current limit whether the regulator
+  is enabled or disabled and should not be used to determine regulator current
+  load.
+
+
+5. Regulator Operating Mode Control & Status (dynamic drivers)
+==============================================================
+
+Some consumers can further save system power by changing the operating mode of
+their supply regulator to be more efficient when the consumer's operating state
+changes, e.g. the consumer driver is idle and subsequently draws less current.
+
+Regulator operating mode can be changed indirectly or directly.
+
+Indirect operating mode control.
+--------------------------------
+Consumer drivers can request a change in their supply regulator operating mode
+by calling::
+
+    int regulator_set_load(struct regulator *regulator, int load_uA);
+
+This will cause the core to recalculate the total load on the regulator (based
+on all its consumers) and change operating mode (if necessary and permitted)
+to best match the current operating load.
+
+The load_uA value can be determined from the consumer's datasheet. e.g. most
+datasheets have tables showing the maximum current consumed in certain
+situations.
+
+Most consumers will use indirect operating mode control since they have no
+knowledge of the regulator or whether the regulator is shared with other
+consumers.
+
+Direct operating mode control.
+------------------------------
+
+Bespoke or tightly coupled drivers may want to directly control regulator
+operating mode depending on their operating point. This can be achieved by
+calling::
+
+    int regulator_set_mode(struct regulator *regulator, unsigned int mode);
+    unsigned int regulator_get_mode(struct regulator *regulator);
+
+Direct mode will only be used by consumers that *know* about the regulator and
+are not sharing the regulator with other consumers.
+
+
+6. Regulator Events
+===================
+
+Regulators can notify consumers of external events. Events could be received by
+consumers under regulator stress or failure conditions.
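+
+As one illustrative sketch (the event checked and the message printed are
+assumptions made for the example, not behaviour required by the framework),
+a consumer might handle such events with a notifier callback like the one
+below, registered with the call shown next::
+
+    static int my_supply_event(struct notifier_block *nb,
+                               unsigned long event, void *data)
+    {
+        if (event & REGULATOR_EVENT_UNDER_VOLTAGE)
+            pr_warn("supply is dropping out of regulation\n");
+
+        return NOTIFY_OK;
+    }
+
+    static struct notifier_block my_supply_nb = {
+        .notifier_call = my_supply_event,
+    };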
+
+Consumers can register interest in regulator events by calling::
+
+    int regulator_register_notifier(struct regulator *regulator,
+                                    struct notifier_block *nb);
+
+Consumers can unregister interest by calling::
+
+    int regulator_unregister_notifier(struct regulator *regulator,
+                                      struct notifier_block *nb);
+
+Regulators use the kernel notifier framework to send events to their interested
+consumers.
+
+7. Regulator Direct Register Access
+===================================
+
+Some kinds of power management hardware or firmware are designed such that
+they need to do low-level hardware access to regulators, with no involvement
+from the kernel. Examples of such devices are:
+
+- clocksource with a voltage-controlled oscillator and control logic to change
+  the supply voltage over I2C to achieve a desired output clock rate
+- thermal management firmware that can issue an arbitrary I2C transaction to
+  perform system poweroff during overtemperature conditions
+
+To set up such a device/firmware, various parameters like the I2C address of
+the regulator, addresses of various regulator registers etc. need to be
+configured to it. The regulator framework provides the following helpers for
+querying these details.
+
+Bus-specific details, like I2C addresses or transfer rates, are handled by the
+regmap framework. To get the regulator's regmap (if supported), use::
+
+    struct regmap *regulator_get_regmap(struct regulator *regulator);
+
+To obtain the hardware register offset and bitmask for the regulator's voltage
+selector register, use::
+
+    int regulator_get_hardware_vsel_register(struct regulator *regulator,
+                                             unsigned *vsel_reg,
+                                             unsigned *vsel_mask);
+
+To convert a regulator framework voltage selector code (used by
+regulator_list_voltage) to a hardware-specific voltage selector that can be
+directly written to the voltage selector register, use::
+
+    int regulator_list_hardware_vsel(struct regulator *regulator,
+                                     unsigned selector);
diff --git a/Documentation/power/regulator/consumer.txt b/Documentation/power/regulator/consumer.txt
deleted file mode 100644
index e51564c1a140..000000000000
--- a/Documentation/power/regulator/consumer.txt
+++ /dev/null
@@ -1,218 +0,0 @@
-Regulator Consumer Driver Interface
-===================================
-
-This text describes the regulator interface for consumer device drivers.
-Please see overview.txt for a description of the terms used in this text.
-
-
-1. Consumer Regulator Access (static & dynamic drivers)
-=======================================================
-
-A consumer driver can get access to its supply regulator by calling :-
-
-regulator = regulator_get(dev, "Vcc");
-
-The consumer passes in its struct device pointer and power supply ID. The core
-then finds the correct regulator by consulting a machine specific lookup table.
-If the lookup is successful then this call will return a pointer to the struct
-regulator that supplies this consumer.
-
-To release the regulator the consumer driver should call :-
-
-regulator_put(regulator);
-
-Consumers can be supplied by more than one regulator e.g. codec consumer with
-analog and digital supplies :-
-
-digital = regulator_get(dev, "Vcc"); /* digital core */
-analog = regulator_get(dev, "Avdd"); /* analog */
-
-The regulator access functions regulator_get() and regulator_put() will
-usually be called in your device drivers probe() and remove() respectively.
-
-
-2.
Regulator Output Enable & Disable (static & dynamic drivers) -==================================================================== - -A consumer can enable its power supply by calling:- - -int regulator_enable(regulator); - -NOTE: The supply may already be enabled before regulator_enabled() is called. -This may happen if the consumer shares the regulator or the regulator has been -previously enabled by bootloader or kernel board initialization code. - -A consumer can determine if a regulator is enabled by calling :- - -int regulator_is_enabled(regulator); - -This will return > zero when the regulator is enabled. - - -A consumer can disable its supply when no longer needed by calling :- - -int regulator_disable(regulator); - -NOTE: This may not disable the supply if it's shared with other consumers. The -regulator will only be disabled when the enabled reference count is zero. - -Finally, a regulator can be forcefully disabled in the case of an emergency :- - -int regulator_force_disable(regulator); - -NOTE: this will immediately and forcefully shutdown the regulator output. All -consumers will be powered off. - - -3. Regulator Voltage Control & Status (dynamic drivers) -====================================================== - -Some consumer drivers need to be able to dynamically change their supply -voltage to match system operating points. e.g. CPUfreq drivers can scale -voltage along with frequency to save power, SD drivers may need to select the -correct card voltage, etc. - -Consumers can control their supply voltage by calling :- - -int regulator_set_voltage(regulator, min_uV, max_uV); - -Where min_uV and max_uV are the minimum and maximum acceptable voltages in -microvolts. - -NOTE: this can be called when the regulator is enabled or disabled. If called -when enabled, then the voltage changes instantly, otherwise the voltage -configuration changes and the voltage is physically set when the regulator is -next enabled. - -The regulators configured voltage output can be found by calling :- - -int regulator_get_voltage(regulator); - -NOTE: get_voltage() will return the configured output voltage whether the -regulator is enabled or disabled and should NOT be used to determine regulator -output state. However this can be used in conjunction with is_enabled() to -determine the regulator physical output voltage. - - -4. Regulator Current Limit Control & Status (dynamic drivers) -=========================================================== - -Some consumer drivers need to be able to dynamically change their supply -current limit to match system operating points. e.g. LCD backlight driver can -change the current limit to vary the backlight brightness, USB drivers may want -to set the limit to 500mA when supplying power. - -Consumers can control their supply current limit by calling :- - -int regulator_set_current_limit(regulator, min_uA, max_uA); - -Where min_uA and max_uA are the minimum and maximum acceptable current limit in -microamps. - -NOTE: this can be called when the regulator is enabled or disabled. If called -when enabled, then the current limit changes instantly, otherwise the current -limit configuration changes and the current limit is physically set when the -regulator is next enabled. - -A regulators current limit can be found by calling :- - -int regulator_get_current_limit(regulator); - -NOTE: get_current_limit() will return the current limit whether the regulator -is enabled or disabled and should not be used to determine regulator current -load. - - -5. 
Regulator Operating Mode Control & Status (dynamic drivers) -============================================================= - -Some consumers can further save system power by changing the operating mode of -their supply regulator to be more efficient when the consumers operating state -changes. e.g. consumer driver is idle and subsequently draws less current - -Regulator operating mode can be changed indirectly or directly. - -Indirect operating mode control. --------------------------------- -Consumer drivers can request a change in their supply regulator operating mode -by calling :- - -int regulator_set_load(struct regulator *regulator, int load_uA); - -This will cause the core to recalculate the total load on the regulator (based -on all its consumers) and change operating mode (if necessary and permitted) -to best match the current operating load. - -The load_uA value can be determined from the consumer's datasheet. e.g. most -datasheets have tables showing the maximum current consumed in certain -situations. - -Most consumers will use indirect operating mode control since they have no -knowledge of the regulator or whether the regulator is shared with other -consumers. - -Direct operating mode control. ------------------------------- -Bespoke or tightly coupled drivers may want to directly control regulator -operating mode depending on their operating point. This can be achieved by -calling :- - -int regulator_set_mode(struct regulator *regulator, unsigned int mode); -unsigned int regulator_get_mode(struct regulator *regulator); - -Direct mode will only be used by consumers that *know* about the regulator and -are not sharing the regulator with other consumers. - - -6. Regulator Events -=================== -Regulators can notify consumers of external events. Events could be received by -consumers under regulator stress or failure conditions. - -Consumers can register interest in regulator events by calling :- - -int regulator_register_notifier(struct regulator *regulator, - struct notifier_block *nb); - -Consumers can unregister interest by calling :- - -int regulator_unregister_notifier(struct regulator *regulator, - struct notifier_block *nb); - -Regulators use the kernel notifier framework to send event to their interested -consumers. - -7. Regulator Direct Register Access -=================================== -Some kinds of power management hardware or firmware are designed such that -they need to do low-level hardware access to regulators, with no involvement -from the kernel. Examples of such devices are: - -- clocksource with a voltage-controlled oscillator and control logic to change - the supply voltage over I2C to achieve a desired output clock rate -- thermal management firmware that can issue an arbitrary I2C transaction to - perform system poweroff during overtemperature conditions - -To set up such a device/firmware, various parameters like I2C address of the -regulator, addresses of various regulator registers etc. need to be configured -to it. The regulator framework provides the following helpers for querying -these details. - -Bus-specific details, like I2C addresses or transfer rates are handled by the -regmap framework. 
To get the regulator's regmap (if supported), use :- - -struct regmap *regulator_get_regmap(struct regulator *regulator); - -To obtain the hardware register offset and bitmask for the regulator's voltage -selector register, use :- - -int regulator_get_hardware_vsel_register(struct regulator *regulator, - unsigned *vsel_reg, - unsigned *vsel_mask); - -To convert a regulator framework voltage selector code (used by -regulator_list_voltage) to a hardware-specific voltage selector that can be -directly written to the voltage selector register, use :- - -int regulator_list_hardware_vsel(struct regulator *regulator, - unsigned selector); diff --git a/Documentation/power/regulator/design.rst b/Documentation/power/regulator/design.rst new file mode 100644 index 000000000000..3b09c6841dc4 --- /dev/null +++ b/Documentation/power/regulator/design.rst @@ -0,0 +1,38 @@ +========================== +Regulator API design notes +========================== + +This document provides a brief, partially structured, overview of some +of the design considerations which impact the regulator API design. + +Safety +------ + + - Errors in regulator configuration can have very serious consequences + for the system, potentially including lasting hardware damage. + - It is not possible to automatically determine the power configuration + of the system - software-equivalent variants of the same chip may + have different power requirements, and not all components with power + requirements are visible to software. + +.. note:: + + The API should make no changes to the hardware state unless it has + specific knowledge that these changes are safe to perform on this + particular system. + +Consumer use cases +------------------ + + - The overwhelming majority of devices in a system will have no + requirement to do any runtime configuration of their power beyond + being able to turn it on or off. + + - Many of the power supplies in the system will be shared between many + different consumers. + +.. note:: + + The consumer API should be structured so that these use cases are + very easy to handle and so that consumers will work with shared + supplies without any additional effort. diff --git a/Documentation/power/regulator/design.txt b/Documentation/power/regulator/design.txt deleted file mode 100644 index fdd919b96830..000000000000 --- a/Documentation/power/regulator/design.txt +++ /dev/null @@ -1,33 +0,0 @@ -Regulator API design notes -========================== - -This document provides a brief, partially structured, overview of some -of the design considerations which impact the regulator API design. - -Safety ------- - - - Errors in regulator configuration can have very serious consequences - for the system, potentially including lasting hardware damage. - - It is not possible to automatically determine the power configuration - of the system - software-equivalent variants of the same chip may - have different power requirements, and not all components with power - requirements are visible to software. - - => The API should make no changes to the hardware state unless it has - specific knowledge that these changes are safe to perform on this - particular system. - -Consumer use cases ------------------- - - - The overwhelming majority of devices in a system will have no - requirement to do any runtime configuration of their power beyond - being able to turn it on or off. - - - Many of the power supplies in the system will be shared between many - different consumers. 
- - => The consumer API should be structured so that these use cases are - very easy to handle and so that consumers will work with shared - supplies without any additional effort. diff --git a/Documentation/power/regulator/machine.rst b/Documentation/power/regulator/machine.rst new file mode 100644 index 000000000000..22fffefaa3ad --- /dev/null +++ b/Documentation/power/regulator/machine.rst @@ -0,0 +1,97 @@ +================================== +Regulator Machine Driver Interface +================================== + +The regulator machine driver interface is intended for board/machine specific +initialisation code to configure the regulator subsystem. + +Consider the following machine:: + + Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V] + | + +-> [Consumer B @ 3.3V] + +The drivers for consumers A & B must be mapped to the correct regulator in +order to control their power supplies. This mapping can be achieved in machine +initialisation code by creating a struct regulator_consumer_supply for +each regulator:: + + struct regulator_consumer_supply { + const char *dev_name; /* consumer dev_name() */ + const char *supply; /* consumer supply - e.g. "vcc" */ + }; + +e.g. for the machine above:: + + static struct regulator_consumer_supply regulator1_consumers[] = { + REGULATOR_SUPPLY("Vcc", "consumer B"), + }; + + static struct regulator_consumer_supply regulator2_consumers[] = { + REGULATOR_SUPPLY("Vcc", "consumer A"), + }; + +This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2 +to the 'Vcc' supply for Consumer A. + +Constraints can now be registered by defining a struct regulator_init_data +for each regulator power domain. This structure also maps the consumers +to their supply regulators:: + + static struct regulator_init_data regulator1_data = { + .constraints = { + .name = "Regulator-1", + .min_uV = 3300000, + .max_uV = 3300000, + .valid_modes_mask = REGULATOR_MODE_NORMAL, + }, + .num_consumer_supplies = ARRAY_SIZE(regulator1_consumers), + .consumer_supplies = regulator1_consumers, + }; + +The name field should be set to something that is usefully descriptive +for the board for configuration of supplies for other regulators and +for use in logging and other diagnostic output. Normally the name +used for the supply rail in the schematic is a good choice. If no +name is provided then the subsystem will choose one. + +Regulator-1 supplies power to Regulator-2. This relationship must be registered +with the core so that Regulator-1 is also enabled when Consumer A enables its +supply (Regulator-2). 
The supply regulator is set by the supply_regulator
+field below::
+
+	static struct regulator_init_data regulator2_data = {
+		.supply_regulator = "Regulator-1",
+		.constraints = {
+			.min_uV = 1800000,
+			.max_uV = 2000000,
+			.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
+			.valid_modes_mask = REGULATOR_MODE_NORMAL,
+		},
+		.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
+		.consumer_supplies = regulator2_consumers,
+	};
+
+Finally the regulator devices must be registered in the usual manner::
+
+	static struct platform_device regulator_devices[] = {
+		{
+			.name = "regulator",
+			.id = DCDC_1,
+			.dev = {
+				.platform_data = &regulator1_data,
+			},
+		},
+		{
+			.name = "regulator",
+			.id = DCDC_2,
+			.dev = {
+				.platform_data = &regulator2_data,
+			},
+		},
+	};
+	/* register regulator 1 device */
+	platform_device_register(&regulator_devices[0]);
+
+	/* register regulator 2 device */
+	platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/machine.txt b/Documentation/power/regulator/machine.txt
deleted file mode 100644
index eff4dcaaa252..000000000000
--- a/Documentation/power/regulator/machine.txt
+++ /dev/null
@@ -1,96 +0,0 @@
-Regulator Machine Driver Interface
-===================================
-
-The regulator machine driver interface is intended for board/machine specific
-initialisation code to configure the regulator subsystem.
-
-Consider the following machine :-
-
-  Regulator-1 -+-> Regulator-2 --> [Consumer A @ 1.8 - 2.0V]
-               |
-               +-> [Consumer B @ 3.3V]
-
-The drivers for consumers A & B must be mapped to the correct regulator in
-order to control their power supplies. This mapping can be achieved in machine
-initialisation code by creating a struct regulator_consumer_supply for
-each regulator.
-
-struct regulator_consumer_supply {
-	const char *dev_name;	/* consumer dev_name() */
-	const char *supply;	/* consumer supply - e.g. "vcc" */
-};
-
-e.g. for the machine above
-
-static struct regulator_consumer_supply regulator1_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer B"),
-};
-
-static struct regulator_consumer_supply regulator2_consumers[] = {
-	REGULATOR_SUPPLY("Vcc", "consumer A"),
-};
-
-This maps Regulator-1 to the 'Vcc' supply for Consumer B and maps Regulator-2
-to the 'Vcc' supply for Consumer A.
-
-Constraints can now be registered by defining a struct regulator_init_data
-for each regulator power domain. This structure also maps the consumers
-to their supply regulators :-
-
-static struct regulator_init_data regulator1_data = {
-	.constraints = {
-		.name = "Regulator-1",
-		.min_uV = 3300000,
-		.max_uV = 3300000,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator1_consumers),
-	.consumer_supplies = regulator1_consumers,
-};
-
-The name field should be set to something that is usefully descriptive
-for the board for configuration of supplies for other regulators and
-for use in logging and other diagnostic output. Normally the name
-used for the supply rail in the schematic is a good choice. If no
-name is provided then the subsystem will choose one.
-
-Regulator-1 supplies power to Regulator-2. This relationship must be registered
-with the core so that Regulator-1 is also enabled when Consumer A enables its
-supply (Regulator-2). The supply regulator is set by the supply_regulator
-field below and co:-
-
-static struct regulator_init_data regulator2_data = {
-	.supply_regulator = "Regulator-1",
-	.constraints = {
-		.min_uV = 1800000,
-		.max_uV = 2000000,
-		.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE,
-		.valid_modes_mask = REGULATOR_MODE_NORMAL,
-	},
-	.num_consumer_supplies = ARRAY_SIZE(regulator2_consumers),
-	.consumer_supplies = regulator2_consumers,
-};
-
-Finally the regulator devices must be registered in the usual manner.
-
-static struct platform_device regulator_devices[] = {
-	{
-		.name = "regulator",
-		.id = DCDC_1,
-		.dev = {
-			.platform_data = &regulator1_data,
-		},
-	},
-	{
-		.name = "regulator",
-		.id = DCDC_2,
-		.dev = {
-			.platform_data = &regulator2_data,
-		},
-	},
-};
-/* register regulator 1 device */
-platform_device_register(&regulator_devices[0]);
-
-/* register regulator 2 device */
-platform_device_register(&regulator_devices[1]);
diff --git a/Documentation/power/regulator/overview.rst b/Documentation/power/regulator/overview.rst
new file mode 100644
index 000000000000..ee494c70a7c4
--- /dev/null
+++ b/Documentation/power/regulator/overview.rst
@@ -0,0 +1,178 @@
+=============================================
+Linux voltage and current regulator framework
+=============================================
+
+About
+=====
+
+This framework is designed to provide a standard kernel interface to control
+voltage and current regulators.
+
+The intention is to allow systems to dynamically control regulator power output
+in order to save power and prolong battery life. This applies to both voltage
+regulators (where voltage output is controllable) and current sinks (where
+current limit is controllable).
+
+(C) 2008 Wolfson Microelectronics PLC.
+
+Author: Liam Girdwood
+
+
+Nomenclature
+============
+
+Some terms used in this document:
+
+  - Regulator
+     - Electronic device that supplies power to other devices.
+       Most regulators can enable and disable their output while
+       some can control their output voltage and/or current.
+
+       Input Voltage -> Regulator -> Output Voltage
+
+
+  - PMIC
+     - Power Management IC. An IC that contains numerous
+       regulators and often contains other subsystems.
+
+
+  - Consumer
+     - Electronic device that is supplied power by a regulator.
+       Consumers can be classified into two types:
+
+       Static: consumer does not change its supply voltage or
+       current limit. It only needs to enable or disable its
+       power supply. Its supply voltage is set by the hardware,
+       bootloader, firmware or kernel board initialisation code.
+
+       Dynamic: consumer needs to change its supply voltage or
+       current limit to meet operation demands.
+
+
+  - Power Domain
+     - Electronic circuit that is supplied its input power by the
+       output power of a regulator, switch or by another power
+       domain.
+
+       The supply regulator may be behind one or more switches,
+       i.e.::
+
+         Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
+                    |             |
+                    |             +-> [Consumer B], [Consumer C]
+                    |
+                    +-> [Consumer D], [Consumer E]
+
+       That is one regulator and three power domains:
+
+       - Domain 1: Switch-1, Consumers D & E.
+       - Domain 2: Switch-2, Consumers B & C.
+       - Domain 3: Consumer A.
+
+       and this represents a "supplies" relationship:
+
+       Domain-1 --> Domain-2 --> Domain-3.
+
+       A power domain may have regulators that are supplied power
+       by other regulators. i.e.::
+
+         Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
+                      |
+                      +-> [Consumer B]
+
+       This gives us two regulators and two power domains:
+
+       - Domain 1: Regulator-2, Consumer B.
+       - Domain 2: Consumer A.
+ + and a "supplies" relationship: + + Domain-1 --> Domain-2 + + + - Constraints + - Constraints are used to define power levels for performance + and hardware protection. Constraints exist at three levels: + + Regulator Level: This is defined by the regulator hardware + operating parameters and is specified in the regulator + datasheet. i.e. + + - voltage output is in the range 800mV -> 3500mV. + - regulator current output limit is 20mA @ 5V but is + 10mA @ 10V. + + Power Domain Level: This is defined in software by kernel + level board initialisation code. It is used to constrain a + power domain to a particular power range. i.e. + + - Domain-1 voltage is 3300mV + - Domain-2 voltage is 1400mV -> 1600mV + - Domain-3 current limit is 0mA -> 20mA. + + Consumer Level: This is defined by consumer drivers + dynamically setting voltage or current limit levels. + + e.g. a consumer backlight driver asks for a current increase + from 5mA to 10mA to increase LCD illumination. This passes + to through the levels as follows :- + + Consumer: need to increase LCD brightness. Lookup and + request next current mA value in brightness table (the + consumer driver could be used on several different + personalities based upon the same reference device). + + Power Domain: is the new current limit within the domain + operating limits for this domain and system state (e.g. + battery power, USB power) + + Regulator Domains: is the new current limit within the + regulator operating parameters for input/output voltage. + + If the regulator request passes all the constraint tests + then the new regulator value is applied. + + +Design +====== + +The framework is designed and targeted at SoC based devices but may also be +relevant to non SoC devices and is split into the following four interfaces:- + + + 1. Consumer driver interface. + + This uses a similar API to the kernel clock interface in that consumer + drivers can get and put a regulator (like they can with clocks atm) and + get/set voltage, current limit, mode, enable and disable. This should + allow consumers complete control over their supply voltage and current + limit. This also compiles out if not in use so drivers can be reused in + systems with no regulator based power control. + + See Documentation/power/regulator/consumer.rst + + 2. Regulator driver interface. + + This allows regulator drivers to register their regulators and provide + operations to the core. It also has a notifier call chain for propagating + regulator events to clients. + + See Documentation/power/regulator/regulator.rst + + 3. Machine interface. + + This interface is for machine specific code and allows the creation of + voltage/current domains (with constraints) for each regulator. It can + provide regulator constraints that will prevent device damage through + overvoltage or overcurrent caused by buggy client drivers. It also + allows the creation of a regulator tree whereby some regulators are + supplied by others (similar to a clock tree). + + See Documentation/power/regulator/machine.rst + + 4. Userspace ABI. + + The framework also exports a lot of useful voltage/current/opmode data to + userspace via sysfs. This could be used to help monitor device power + consumption and status. 
diff --git a/Documentation/power/regulator/overview.txt b/Documentation/power/regulator/overview.txt
deleted file mode 100644
index 721b4739ec32..000000000000
--- a/Documentation/power/regulator/overview.txt
+++ /dev/null
@@ -1,171 +0,0 @@
-Linux voltage and current regulator framework
-=============================================
-
-About
-=====
-
-This framework is designed to provide a standard kernel interface to control
-voltage and current regulators.
-
-The intention is to allow systems to dynamically control regulator power output
-in order to save power and prolong battery life. This applies to both voltage
-regulators (where voltage output is controllable) and current sinks (where
-current limit is controllable).
-
-(C) 2008 Wolfson Microelectronics PLC.
-Author: Liam Girdwood
-
-
-Nomenclature
-============
-
-Some terms used in this document:-
-
-  o Regulator - Electronic device that supplies power to other devices.
-                Most regulators can enable and disable their output while
-                some can control their output voltage and or current.
-
-                Input Voltage -> Regulator -> Output Voltage
-
-
-  o PMIC - Power Management IC. An IC that contains numerous regulators
-           and often contains other subsystems.
-
-
-  o Consumer - Electronic device that is supplied power by a regulator.
-               Consumers can be classified into two types:-
-
-               Static: consumer does not change its supply voltage or
-               current limit. It only needs to enable or disable its
-               power supply. Its supply voltage is set by the hardware,
-               bootloader, firmware or kernel board initialisation code.
-
-               Dynamic: consumer needs to change its supply voltage or
-               current limit to meet operation demands.
-
-
-  o Power Domain - Electronic circuit that is supplied its input power by the
-                   output power of a regulator, switch or by another power
-                   domain.
-
-                   The supply regulator may be behind a switch(s). i.e.
-
-                   Regulator -+-> Switch-1 -+-> Switch-2 --> [Consumer A]
-                              |             |
-                              |             +-> [Consumer B], [Consumer C]
-                              |
-                              +-> [Consumer D], [Consumer E]
-
-                   That is one regulator and three power domains:
-
-                   Domain 1: Switch-1, Consumers D & E.
-                   Domain 2: Switch-2, Consumers B & C.
-                   Domain 3: Consumer A.
-
-                   and this represents a "supplies" relationship:
-
-                   Domain-1 --> Domain-2 --> Domain-3.
-
-                   A power domain may have regulators that are supplied power
-                   by other regulators. i.e.
-
-                   Regulator-1 -+-> Regulator-2 -+-> [Consumer A]
-                                |
-                                +-> [Consumer B]
-
-                   This gives us two regulators and two power domains:
-
-                   Domain 1: Regulator-2, Consumer B.
-                   Domain 2: Consumer A.
-
-                   and a "supplies" relationship:
-
-                   Domain-1 --> Domain-2
-
-
-  o Constraints - Constraints are used to define power levels for performance
-                  and hardware protection. Constraints exist at three levels:
-
-                  Regulator Level: This is defined by the regulator hardware
-                  operating parameters and is specified in the regulator
-                  datasheet. i.e.
-
-                    - voltage output is in the range 800mV -> 3500mV.
-                    - regulator current output limit is 20mA @ 5V but is
-                      10mA @ 10V.
-
-                  Power Domain Level: This is defined in software by kernel
-                  level board initialisation code. It is used to constrain a
-                  power domain to a particular power range. i.e.
-
-                    - Domain-1 voltage is 3300mV
-                    - Domain-2 voltage is 1400mV -> 1600mV
-                    - Domain-3 current limit is 0mA -> 20mA.
-
-                  Consumer Level: This is defined by consumer drivers
-                  dynamically setting voltage or current limit levels.
-
-                  e.g.
a consumer backlight driver asks for a current increase - from 5mA to 10mA to increase LCD illumination. This passes - to through the levels as follows :- - - Consumer: need to increase LCD brightness. Lookup and - request next current mA value in brightness table (the - consumer driver could be used on several different - personalities based upon the same reference device). - - Power Domain: is the new current limit within the domain - operating limits for this domain and system state (e.g. - battery power, USB power) - - Regulator Domains: is the new current limit within the - regulator operating parameters for input/output voltage. - - If the regulator request passes all the constraint tests - then the new regulator value is applied. - - -Design -====== - -The framework is designed and targeted at SoC based devices but may also be -relevant to non SoC devices and is split into the following four interfaces:- - - - 1. Consumer driver interface. - - This uses a similar API to the kernel clock interface in that consumer - drivers can get and put a regulator (like they can with clocks atm) and - get/set voltage, current limit, mode, enable and disable. This should - allow consumers complete control over their supply voltage and current - limit. This also compiles out if not in use so drivers can be reused in - systems with no regulator based power control. - - See Documentation/power/regulator/consumer.txt - - 2. Regulator driver interface. - - This allows regulator drivers to register their regulators and provide - operations to the core. It also has a notifier call chain for propagating - regulator events to clients. - - See Documentation/power/regulator/regulator.txt - - 3. Machine interface. - - This interface is for machine specific code and allows the creation of - voltage/current domains (with constraints) for each regulator. It can - provide regulator constraints that will prevent device damage through - overvoltage or overcurrent caused by buggy client drivers. It also - allows the creation of a regulator tree whereby some regulators are - supplied by others (similar to a clock tree). - - See Documentation/power/regulator/machine.txt - - 4. Userspace ABI. - - The framework also exports a lot of useful voltage/current/opmode data to - userspace via sysfs. This could be used to help monitor device power - consumption and status. - - See Documentation/ABI/testing/sysfs-class-regulator diff --git a/Documentation/power/regulator/regulator.rst b/Documentation/power/regulator/regulator.rst new file mode 100644 index 000000000000..794b3256fbb9 --- /dev/null +++ b/Documentation/power/regulator/regulator.rst @@ -0,0 +1,32 @@ +========================== +Regulator Driver Interface +========================== + +The regulator driver interface is relatively simple and designed to allow +regulator drivers to register their services with the core framework. + + +Registration +============ + +Drivers can register a regulator by calling:: + + struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, + const struct regulator_config *config); + +This will register the regulator's capabilities and operations to the regulator +core. + +Regulators can be unregistered by calling:: + + void regulator_unregister(struct regulator_dev *rdev); + + +Regulator Events +================ + +Regulators can send events (e.g. 
overtemperature, undervoltage, etc) to +consumer drivers by calling:: + + int regulator_notifier_call_chain(struct regulator_dev *rdev, + unsigned long event, void *data); diff --git a/Documentation/power/regulator/regulator.txt b/Documentation/power/regulator/regulator.txt deleted file mode 100644 index b17e5833ce21..000000000000 --- a/Documentation/power/regulator/regulator.txt +++ /dev/null @@ -1,30 +0,0 @@ -Regulator Driver Interface -========================== - -The regulator driver interface is relatively simple and designed to allow -regulator drivers to register their services with the core framework. - - -Registration -============ - -Drivers can register a regulator by calling :- - -struct regulator_dev *regulator_register(struct regulator_desc *regulator_desc, - const struct regulator_config *config); - -This will register the regulator's capabilities and operations to the regulator -core. - -Regulators can be unregistered by calling :- - -void regulator_unregister(struct regulator_dev *rdev); - - -Regulator Events -================ -Regulators can send events (e.g. overtemperature, undervoltage, etc) to -consumer drivers by calling :- - -int regulator_notifier_call_chain(struct regulator_dev *rdev, - unsigned long event, void *data); diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst new file mode 100644 index 000000000000..2c2ec99b5088 --- /dev/null +++ b/Documentation/power/runtime_pm.rst @@ -0,0 +1,940 @@ +================================================== +Runtime Power Management Framework for I/O Devices +================================================== + +(C) 2009-2011 Rafael J. Wysocki , Novell Inc. + +(C) 2010 Alan Stern + +(C) 2014 Intel Corp., Rafael J. Wysocki + +1. Introduction +=============== + +Support for runtime power management (runtime PM) of I/O devices is provided +at the power management core (PM core) level by means of: + +* The power management workqueue pm_wq in which bus types and device drivers can + put their PM-related work items. It is strongly recommended that pm_wq be + used for queuing all work items related to runtime PM, because this allows + them to be synchronized with system-wide power transitions (suspend to RAM, + hibernation and resume from system sleep states). pm_wq is declared in + include/linux/pm_runtime.h and defined in kernel/power/main.c. + +* A number of runtime PM fields in the 'power' member of 'struct device' (which + is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can + be used for synchronizing runtime PM operations with one another. + +* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in + include/linux/pm.h). + +* A set of helper functions defined in drivers/base/power/runtime.c that can be + used for carrying out runtime PM operations in such a way that the + synchronization between them is taken care of by the PM core. Bus types and + device drivers are encouraged to use these functions. + +The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM +fields of 'struct dev_pm_info' and the core helper functions provided for +runtime PM are described below. + +2. Device Runtime PM Callbacks +============================== + +There are three device runtime PM callbacks defined in 'struct dev_pm_ops':: + + struct dev_pm_ops { + ... + int (*runtime_suspend)(struct device *dev); + int (*runtime_resume)(struct device *dev); + int (*runtime_idle)(struct device *dev); + ... 
+	};
+
+The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks
+are executed by the PM core for the device's subsystem, which may be any of
+the following:
+
+  1. PM domain of the device, if the device's PM domain object, dev->pm_domain,
+     is present.
+
+  2. Device type of the device, if both dev->type and dev->type->pm are present.
+
+  3. Device class of the device, if both dev->class and dev->class->pm are
+     present.
+
+  4. Bus type of the device, if both dev->bus and dev->bus->pm are present.
+
+If the subsystem chosen by applying the above rules doesn't provide the relevant
+callback, the PM core will invoke the corresponding driver callback stored in
+dev->driver->pm directly (if present).
+
+The PM core always checks which callback to use in the order given above, so the
+priority order of callbacks from high to low is: PM domain, device type, class
+and bus type. Moreover, the high-priority one will always take precedence over
+a low-priority one. The PM domain, bus type, device type and class callbacks
+are referred to as subsystem-level callbacks in what follows.
+
+By default, the callbacks are always invoked in process context with interrupts
+enabled. However, the pm_runtime_irq_safe() helper function can be used to tell
+the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume()
+and ->runtime_idle() callbacks for the given device in atomic context with
+interrupts disabled. This implies that the callback routines in question must
+not block or sleep, but it also means that the synchronous helper functions
+listed at the end of Section 4 may be used for that device within an interrupt
+handler or generally in an atomic context.
+
+The subsystem-level suspend callback, if present, is **entirely responsible**
+for handling the suspend of the device as appropriate, which may, but need not,
+include executing the device driver's own ->runtime_suspend() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_suspend()
+callback in a device driver as long as the subsystem-level suspend callback
+knows what to do to handle the device).
+
+  * Once the subsystem-level suspend callback (or the driver suspend callback,
+    if invoked directly) has completed successfully for the given device, the PM
+    core regards the device as suspended, which need not mean that it has been
+    put into a low power state. It is supposed to mean, however, that the
+    device will not process data and will not communicate with the CPU(s) and
+    RAM until the appropriate resume callback is executed for it. The runtime
+    PM status of a device after successful execution of the suspend callback is
+    'suspended'.
+
+  * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM
+    status remains 'active', which means that the device _must_ be fully
+    operational afterwards.
+
+  * If the suspend callback returns an error code different from -EBUSY and
+    -EAGAIN, the PM core regards this as a fatal error and will refuse to run
+    the helper functions described in Section 4 for the device until its status
+    is directly set to either 'active', or 'suspended' (the PM core provides
+    special helper functions for this purpose).
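+
+As an illustration of these return-value semantics, a driver's suspend
+callback might look like the following sketch (the foo_* names are
+hypothetical and error handling is reduced to the bare minimum)::
+
+	static int foo_runtime_suspend(struct device *dev)
+	{
+		struct foo_chip *chip = dev_get_drvdata(dev);
+
+		/* Still transferring data: refuse, status stays 'active'. */
+		if (foo_chip_busy(chip))
+			return -EBUSY;
+
+		/* Quiesce the hardware; it must not touch RAM from now on. */
+		foo_chip_power_down(chip);
+
+		return 0;
+	}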
+
+In particular, if the driver requires remote wakeup capability (i.e. hardware
+mechanism allowing the device to request a change of its power state, such as
+PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the
+device, then ->runtime_suspend() should return -EBUSY. On the other hand, if
+device_can_wakeup() returns 'true' for the device and the device is put into a
+low-power state during the execution of the suspend callback, it is expected
+that remote wakeup will be enabled for the device. Generally, remote wakeup
+should be enabled for all input devices put into low-power states at run time.
+
+The subsystem-level resume callback, if present, is **entirely responsible** for
+handling the resume of the device as appropriate, which may, but need not,
+include executing the device driver's own ->runtime_resume() callback (from the
+PM core's point of view it is not necessary to implement a ->runtime_resume()
+callback in a device driver as long as the subsystem-level resume callback knows
+what to do to handle the device).
+
+  * Once the subsystem-level resume callback (or the driver resume callback, if
+    invoked directly) has completed successfully, the PM core regards the device
+    as fully operational, which means that the device _must_ be able to complete
+    I/O operations as needed. The runtime PM status of the device is then
+    'active'.
+
+  * If the resume callback returns an error code, the PM core regards this as a
+    fatal error and will refuse to run the helper functions described in Section
+    4 for the device, until its status is directly set to either 'active', or
+    'suspended' (by means of special helper functions provided by the PM core
+    for this purpose).
+
+The idle callback (a subsystem-level one, if present, or the driver one) is
+executed by the PM core whenever the device appears to be idle, which is
+indicated to the PM core by two counters, the device's usage counter and the
+counter of 'active' children of the device.
+
+  * If either of these counters is decreased using a helper function provided by
+    the PM core and it turns out to be equal to zero, the other counter is
+    checked. If that counter also is equal to zero, the PM core executes the
+    idle callback with the device as its argument.
+
+The action performed by the idle callback is totally dependent on the subsystem
+(or driver) in question, but the expected and recommended action is to check
+if the device can be suspended (i.e. if all of the conditions necessary for
+suspending the device are satisfied) and to queue up a suspend request for the
+device in that case. If there is no idle callback, or if the callback returns
+0, then the PM core will attempt to carry out a runtime suspend of the device,
+also respecting devices configured for autosuspend. In essence this means a
+call to pm_runtime_autosuspend() (do note that drivers need to update the
+device's last busy mark, pm_runtime_mark_last_busy(), to control the delay under
+this circumstance). To prevent this (for example, if the callback routine has
+started a delayed suspend), the routine must return a non-zero value. Negative
+error return codes are ignored by the PM core.
+
+The helper functions provided by the PM core, described in Section 4, guarantee
+that the following constraints are met with respect to runtime PM callbacks for
+one device:
+
+(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute
+    ->runtime_suspend() in parallel with ->runtime_resume() or with another
+    instance of ->runtime_suspend() for the same device) with the exception that
+    ->runtime_suspend() or ->runtime_resume() can be executed in parallel with
+    ->runtime_idle() (although ->runtime_idle() will not be started while any
+    of the other callbacks is being executed for the same device).
+ +(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' + devices (i.e. the PM core will only execute ->runtime_idle() or + ->runtime_suspend() for the devices the runtime PM status of which is + 'active'). + +(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device + the usage counter of which is equal to zero _and_ either the counter of + 'active' children of which is equal to zero, or the 'power.ignore_children' + flag of which is set. + +(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the + PM core will only execute ->runtime_resume() for the devices the runtime + PM status of which is 'suspended'). + +Additionally, the helper functions provided by the PM core obey the following +rules: + + * If ->runtime_suspend() is about to be executed or there's a pending request + to execute it, ->runtime_idle() will not be executed for the same device. + + * A request to execute or to schedule the execution of ->runtime_suspend() + will cancel any pending requests to execute ->runtime_idle() for the same + device. + + * If ->runtime_resume() is about to be executed or there's a pending request + to execute it, the other callbacks will not be executed for the same device. + + * A request to execute ->runtime_resume() will cancel any pending or + scheduled requests to execute the other callbacks for the same device, + except for scheduled autosuspends. + +3. Runtime PM Device Fields +=========================== + +The following device runtime PM fields are present in 'struct dev_pm_info', as +defined in include/linux/pm.h: + + `struct timer_list suspend_timer;` + - timer used for scheduling (delayed) suspend and autosuspend requests + + `unsigned long timer_expires;` + - timer expiration time, in jiffies (if this is different from zero, the + timer is running and will expire at that time, otherwise the timer is not + running) + + `struct work_struct work;` + - work structure used for queuing up requests (i.e. work items in pm_wq) + + `wait_queue_head_t wait_queue;` + - wait queue used if any of the helper functions needs to wait for another + one to complete + + `spinlock_t lock;` + - lock used for synchronization + + `atomic_t usage_count;` + - the usage counter of the device + + `atomic_t child_count;` + - the count of 'active' children of the device + + `unsigned int ignore_children;` + - if set, the value of child_count is ignored (but still updated) + + `unsigned int disable_depth;` + - used for disabling the helper functions (they work normally if this is + equal to zero); the initial value of it is 1 (i.e. runtime PM is + initially disabled for all devices) + + `int runtime_error;` + - if set, there was a fatal error (one of the callbacks returned error code + as described in Section 2), so the helper functions will not work until + this flag is cleared; this is the error code returned by the failing + callback + + `unsigned int idle_notification;` + - if set, ->runtime_idle() is being executed + + `unsigned int request_pending;` + - if set, there's a pending request (i.e. 
a work item queued up into pm_wq)
+
+  `enum rpm_request request;`
+    - type of request that's pending (valid if request_pending is set)
+
+  `unsigned int deferred_resume;`
+    - set if ->runtime_resume() is about to be run while ->runtime_suspend()
+      is being executed for that device and it is not practical to wait for
+      the suspend to complete; means "start a resume as soon as you've
+      suspended"
+
+  `enum rpm_status runtime_status;`
+    - the runtime PM status of the device; this field's initial value is
+      RPM_SUSPENDED, which means that each device is initially regarded by the
+      PM core as 'suspended', regardless of its real hardware status
+
+  `unsigned int runtime_auto;`
+    - if set, indicates that the user space has allowed the device driver to
+      power manage the device at run time via the
+      /sys/devices/.../power/control interface; it may only be modified with
+      the help of the pm_runtime_allow() and pm_runtime_forbid() helper
+      functions
+
+  `unsigned int no_callbacks;`
+    - indicates that the device does not use the runtime PM callbacks (see
+      Section 8); it may be modified only by the pm_runtime_no_callbacks()
+      helper function
+
+  `unsigned int irq_safe;`
+    - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks
+      will be invoked with the spinlock held and interrupts disabled
+
+  `unsigned int use_autosuspend;`
+    - indicates that the device's driver supports delayed autosuspend (see
+      Section 9); it may be modified only by the
+      pm_runtime{_dont}_use_autosuspend() helper functions
+
+  `unsigned int timer_autosuspends;`
+    - indicates that the PM core should attempt to carry out an autosuspend
+      when the timer expires rather than a normal suspend
+
+  `int autosuspend_delay;`
+    - the delay time (in milliseconds) to be used for autosuspend
+
+  `unsigned long last_busy;`
+    - the time (in jiffies) when the pm_runtime_mark_last_busy() helper
+      function was last called for this device; used in calculating inactivity
+      periods for autosuspend
+
+All of the above fields are members of the 'power' member of 'struct device'.
+
+4. Runtime PM Device Helper Functions
+=====================================
+
+The following runtime PM helper functions are defined in
+drivers/base/power/runtime.c and include/linux/pm_runtime.h:
+
+  `void pm_runtime_init(struct device *dev);`
+    - initialize the device runtime PM fields in 'struct dev_pm_info'
+
+  `void pm_runtime_remove(struct device *dev);`
+    - make sure that the runtime PM of the device will be disabled after
+      removing the device from device hierarchy
+
+  `int pm_runtime_idle(struct device *dev);`
+    - execute the subsystem-level idle callback for the device; returns an
+      error code on failure, where -EINPROGRESS means that ->runtime_idle() is
+      already being executed; if there is no callback or the callback returns 0
+      then run pm_runtime_autosuspend(dev) and return its result
+
+  `int pm_runtime_suspend(struct device *dev);`
+    - execute the subsystem-level suspend callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'suspended', or
+      error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt
+      to suspend the device again in future and -EACCES means that
+      'power.disable_depth' is different from 0
+
+  `int pm_runtime_autosuspend(struct device *dev);`
+    - same as pm_runtime_suspend() except that the autosuspend delay is taken
+      into account; if pm_runtime_autosuspend_expiration() says the delay has
+      not yet expired then an autosuspend is scheduled for the appropriate time
+      and 0 is returned
+
+  `int pm_runtime_resume(struct device *dev);`
+    - execute the subsystem-level resume callback for the device; returns 0 on
+      success, 1 if the device's runtime PM status was already 'active' or
+      error code on failure, where -EAGAIN means it may be safe to attempt to
+      resume the device again in future, but 'power.runtime_error' should be
+      checked additionally, and -EACCES means that 'power.disable_depth' is
+      different from 0
+
+  `int pm_request_idle(struct device *dev);`
+    - submit a request to execute the subsystem-level idle callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success or error code if the request has not been queued up
+
+  `int pm_request_autosuspend(struct device *dev);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device when the autosuspend delay has expired; if the delay has already
+      expired then the work item is queued up immediately
+
+  `int pm_schedule_suspend(struct device *dev, unsigned int delay);`
+    - schedule the execution of the subsystem-level suspend callback for the
+      device in future, where 'delay' is the time to wait before queuing up a
+      suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work
+      item is queued up immediately); returns 0 on success, 1 if the device's PM
+      runtime status was already 'suspended', or error code if the request
+      hasn't been scheduled (or queued up if 'delay' is 0); if the execution of
+      ->runtime_suspend() is already scheduled and not yet expired, the new
+      value of 'delay' will be used as the time to wait
+
+  `int pm_request_resume(struct device *dev);`
+    - submit a request to execute the subsystem-level resume callback for the
+      device (the request is represented by a work item in pm_wq); returns 0 on
+      success, 1 if the device's runtime PM status was already 'active', or
+      error code if the request hasn't been queued up
+
+  `void pm_runtime_get_noresume(struct device *dev);`
+    - increment the device's usage counter
+
+  `int pm_runtime_get(struct device
*dev);` + - increment the device's usage counter, run pm_request_resume(dev) and + return its result + + `int pm_runtime_get_sync(struct device *dev);` + - increment the device's usage counter, run pm_runtime_resume(dev) and + return its result + + `int pm_runtime_get_if_in_use(struct device *dev);` + - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the + runtime PM status is RPM_ACTIVE and the runtime PM usage counter is + nonzero, increment the counter and return 1; otherwise return 0 without + changing the counter + + `void pm_runtime_put_noidle(struct device *dev);` + - decrement the device's usage counter + + `int pm_runtime_put(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_request_idle(dev) and return its result + + `int pm_runtime_put_autosuspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_request_autosuspend(dev) and return its result + + `int pm_runtime_put_sync(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_idle(dev) and return its result + + `int pm_runtime_put_sync_suspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_suspend(dev) and return its result + + `int pm_runtime_put_sync_autosuspend(struct device *dev);` + - decrement the device's usage counter; if the result is 0 then run + pm_runtime_autosuspend(dev) and return its result + + `void pm_runtime_enable(struct device *dev);` + - decrement the device's 'power.disable_depth' field; if that field is equal + to zero, the runtime PM helper functions can execute subsystem-level + callbacks described in Section 2 for the device + + `int pm_runtime_disable(struct device *dev);` + - increment the device's 'power.disable_depth' field (if the value of that + field was previously zero, this prevents subsystem-level runtime PM + callbacks from being run for the device), make sure that all of the + pending runtime PM operations on the device are either completed or + canceled; returns 1 if there was a resume request pending and it was + necessary to execute the subsystem-level resume callback for the device + to satisfy that request, otherwise 0 is returned + + `int pm_runtime_barrier(struct device *dev);` + - check if there's a resume request pending for the device and resume it + (synchronously) in that case, cancel any other pending runtime PM requests + regarding it and wait for all runtime PM operations on it in progress to + complete; returns 1 if there was a resume request pending and it was + necessary to execute the subsystem-level resume callback for the device to + satisfy that request, otherwise 0 is returned + + `void pm_suspend_ignore_children(struct device *dev, bool enable);` + - set/unset the power.ignore_children flag of the device + + `int pm_runtime_set_active(struct device *dev);` + - clear the device's 'power.runtime_error' flag, set the device's runtime + PM status to 'active' and update its parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero); it will fail and return error code if the device has a parent + which is not active and the 'power.ignore_children' flag of which is unset + + `void pm_runtime_set_suspended(struct device *dev);` + - clear the device's 'power.runtime_error' flag, set the device's runtime + PM status to 'suspended' and update its 
parent's counter of 'active' + children as appropriate (it is only valid to use this function if + 'power.runtime_error' is set or 'power.disable_depth' is greater than + zero) + + `bool pm_runtime_active(struct device *dev);` + - return true if the device's runtime PM status is 'active' or its + 'power.disable_depth' field is not equal to zero, or false otherwise + + `bool pm_runtime_suspended(struct device *dev);` + - return true if the device's runtime PM status is 'suspended' and its + 'power.disable_depth' field is equal to zero, or false otherwise + + `bool pm_runtime_status_suspended(struct device *dev);` + - return true if the device's runtime PM status is 'suspended' + + `void pm_runtime_allow(struct device *dev);` + - set the power.runtime_auto flag for the device and decrease its usage + counter (used by the /sys/devices/.../power/control interface to + effectively allow the device to be power managed at run time) + + `void pm_runtime_forbid(struct device *dev);` + - unset the power.runtime_auto flag for the device and increase its usage + counter (used by the /sys/devices/.../power/control interface to + effectively prevent the device from being power managed at run time) + + `void pm_runtime_no_callbacks(struct device *dev);` + - set the power.no_callbacks flag for the device and remove the runtime + PM attributes from /sys/devices/.../power (or prevent them from being + added when the device is registered) + + `void pm_runtime_irq_safe(struct device *dev);` + - set the power.irq_safe flag for the device, causing the runtime-PM + callbacks to be invoked with interrupts off + + `bool pm_runtime_is_irq_safe(struct device *dev);` + - return true if power.irq_safe flag was set for the device, causing + the runtime-PM callbacks to be invoked with interrupts off + + `void pm_runtime_mark_last_busy(struct device *dev);` + - set the power.last_busy field to the current time + + `void pm_runtime_use_autosuspend(struct device *dev);` + - set the power.use_autosuspend flag, enabling autosuspend delays; call + pm_runtime_get_sync if the flag was previously cleared and + power.autosuspend_delay is negative + + `void pm_runtime_dont_use_autosuspend(struct device *dev);` + - clear the power.use_autosuspend flag, disabling autosuspend delays; + decrement the device's usage counter if the flag was previously set and + power.autosuspend_delay is negative; call pm_runtime_idle + + `void pm_runtime_set_autosuspend_delay(struct device *dev, int delay);` + - set the power.autosuspend_delay value to 'delay' (expressed in + milliseconds); if 'delay' is negative then runtime suspends are + prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be + called or the device's usage counter may be decremented and + pm_runtime_idle called depending on if power.autosuspend_delay is + changed to or from a negative value; if power.use_autosuspend is clear, + pm_runtime_idle is called + + `unsigned long pm_runtime_autosuspend_expiration(struct device *dev);` + - calculate the time when the current autosuspend delay period will expire, + based on power.last_busy and power.autosuspend_delay; if the delay time + is 1000 ms or larger then the expiration time is rounded up to the + nearest second; returns 0 if the delay period has already expired or + power.use_autosuspend isn't set, otherwise returns the expiration time + in jiffies + +It is safe to execute the following helper functions from interrupt context: + +- pm_request_idle() +- pm_request_autosuspend() +- pm_schedule_suspend() +- 
pm_request_resume()
+- pm_runtime_get_noresume()
+- pm_runtime_get()
+- pm_runtime_put_noidle()
+- pm_runtime_put()
+- pm_runtime_put_autosuspend()
+- pm_runtime_enable()
+- pm_suspend_ignore_children()
+- pm_runtime_set_active()
+- pm_runtime_set_suspended()
+- pm_runtime_suspended()
+- pm_runtime_mark_last_busy()
+- pm_runtime_autosuspend_expiration()
+
+If pm_runtime_irq_safe() has been called for a device then the following helper
+functions may also be used in interrupt context:
+
+- pm_runtime_idle()
+- pm_runtime_suspend()
+- pm_runtime_autosuspend()
+- pm_runtime_resume()
+- pm_runtime_get_sync()
+- pm_runtime_put_sync()
+- pm_runtime_put_sync_suspend()
+- pm_runtime_put_sync_autosuspend()
+
+5. Runtime PM Initialization, Device Probing and Removal
+========================================================
+
+Initially, the runtime PM is disabled for all devices, which means that the
+majority of the runtime PM helper functions described in Section 4 will return
+-EAGAIN until pm_runtime_enable() is called for the device.
+
+In addition to that, the initial runtime PM status of all devices is
+'suspended', but it need not reflect the actual physical state of the device.
+Thus, if the device is initially active (i.e. it is able to process I/O), its
+runtime PM status must be changed to 'active', with the help of
+pm_runtime_set_active(), before pm_runtime_enable() is called for the device.
+
+However, if the device has a parent and the parent's runtime PM is enabled,
+calling pm_runtime_set_active() for the device will affect the parent, unless
+the parent's 'power.ignore_children' flag is set. Namely, in that case the
+parent won't be able to suspend at run time, using the PM core's helper
+functions, as long as the child's status is 'active', even if the child's
+runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for
+the child yet or pm_runtime_disable() has been called for it). For this reason,
+once pm_runtime_set_active() has been called for the device, pm_runtime_enable()
+should be called for it too as soon as reasonably possible or its runtime PM
+status should be changed back to 'suspended' with the help of
+pm_runtime_set_suspended().
+
+If the default initial runtime PM status of the device (i.e. 'suspended')
+reflects the actual state of the device, its bus type's or its driver's
+->probe() callback will likely need to wake it up using one of the PM core's
+helper functions described in Section 4. In that case, pm_runtime_resume()
+should be used. Of course, for this purpose the device's runtime PM has to be
+enabled earlier by calling pm_runtime_enable().
+
+Note that if the device may execute pm_runtime calls during the probe (such as
+if it registers with a subsystem that may call back in), then the
+pm_runtime_get_sync() call paired with a pm_runtime_put() call will be
+appropriate to ensure that the device is not put back to sleep during the
+probe. This can happen with systems such as the network device layer.
+
+It may be desirable to suspend the device once ->probe() has finished.
+Therefore the driver core uses the asynchronous pm_request_idle() to submit a
+request to execute the subsystem-level idle callback for the device at that
+time. A driver that makes use of the runtime autosuspend feature may want to
+update the last busy mark before returning from ->probe().
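+
+A minimal ->probe() sketch following this pattern might look like the one
+below (the foo_* names are hypothetical, error handling is omitted for
+brevity, and the pm_runtime_* helpers come from include/linux/pm_runtime.h)::
+
+	static int foo_probe(struct platform_device *pdev)
+	{
+		struct device *dev = &pdev->dev;
+
+		/* The hardware is already powered up at this point. */
+		pm_runtime_set_active(dev);
+		pm_runtime_enable(dev);
+
+		/* Keep the device awake while the rest of probe runs. */
+		pm_runtime_get_sync(dev);
+
+		/* ... register with subsystems that may call back in ... */
+
+		/* Using autosuspend: refresh the last busy mark first. */
+		pm_runtime_mark_last_busy(dev);
+		pm_runtime_put(dev);
+
+		return 0;
+	}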
+
+Moreover, the driver core prevents runtime PM callbacks from racing with the bus
+notifier callback in __device_release_driver(), which is necessary, because the
+notifier is used by some subsystems to carry out operations affecting the
+runtime PM functionality. It does so by calling pm_runtime_get_sync() before
+driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This
+resumes the device if it's in the suspended state and prevents it from
+being suspended again while those routines are being executed.
+
+To allow bus types and drivers to put devices into the suspended state by
+calling pm_runtime_suspend() from their ->remove() routines, the driver core
+executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER
+notifications in __device_release_driver(). This requires bus types and
+drivers to make their ->remove() callbacks avoid races with runtime PM directly,
+but it also allows for more flexibility in the handling of devices during the
+removal of their drivers.
+
+In their ->remove() callbacks, drivers should undo the runtime PM changes done
+in ->probe(). Usually this means calling pm_runtime_disable(),
+pm_runtime_dont_use_autosuspend() etc.
+
+User space can effectively prevent the driver of the device from power managing
+it at run time by changing the value of its /sys/devices/.../power/control
+attribute to "on", which causes pm_runtime_forbid() to be called. In principle,
+this mechanism may also be used by the driver to effectively turn off the
+runtime power management of the device until the user space turns it on.
+Namely, during the initialization the driver can make sure that the runtime PM
+status of the device is 'active' and call pm_runtime_forbid(). It should be
+noted, however, that if the user space has already intentionally changed the
+value of /sys/devices/.../power/control to "auto" to allow the driver to power
+manage the device at run time, the driver may confuse it by using
+pm_runtime_forbid() this way.
+
+6. Runtime PM and System Sleep
+==============================
+
+Runtime PM and system sleep (i.e., system suspend and hibernation, also known
+as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of
+ways. If a device is active when a system sleep starts, everything is
+straightforward. But what should happen if the device is already suspended?
+
+The device may have different wake-up settings for runtime PM and system sleep.
+For example, remote wake-up may be enabled for runtime suspend but disallowed
+for system sleep (device_may_wakeup(dev) returns 'false'). When this happens,
+the subsystem-level system suspend callback is responsible for changing the
+device's wake-up setting (it may leave that to the device driver's system
+suspend routine). It may be necessary to resume the device and suspend it again
+in order to do so. The same is true if the driver uses different power levels
+or other settings for runtime suspend and system sleep.
+
+During system resume, the simplest approach is to bring all devices back to full
+power, even if they had been suspended before the system suspend began. There
+are several reasons for this, including:
+
+  * The device might need to switch power levels, wake-up settings, etc.
+
+  * Remote wake-up events might have been lost by the firmware.
+
+  * The device's children may need the device to be at full power in order
+    to resume themselves.
+
+  * The driver's idea of the device state may not agree with the device's
This can happen during resume from hibernation. + + * The device might need to be reset. + + * Even though the device was suspended, if its usage counter was > 0 then most + likely it would need a runtime resume in the near future anyway. + +If the device had been suspended before the system suspend began and it's +brought back to full power during resume, then its runtime PM status will have +to be updated to reflect the actual post-system sleep status. The way to do +this is: + + - pm_runtime_disable(dev); + - pm_runtime_set_active(dev); + - pm_runtime_enable(dev); + +The PM core always increments the runtime usage counter before calling the +->suspend() callback and decrements it after calling the ->resume() callback. +Hence disabling runtime PM temporarily like this will not cause any runtime +suspend attempts to be permanently lost. If the usage count goes to zero +following the return of the ->resume() callback, the ->runtime_idle() callback +will be invoked as usual. + +On some systems, however, system sleep is not entered through a global firmware +or hardware operation. Instead, all hardware components are put into low-power +states directly by the kernel in a coordinated way. Then, the system sleep +state effectively follows from the states the hardware components end up in +and the system is woken up from that state by a hardware interrupt or a similar +mechanism entirely under the kernel's control. As a result, the kernel never +gives control away and the states of all devices during resume are precisely +known to it. If that is the case and none of the situations listed above takes +place (in particular, if the system is not waking up from hibernation), it may +be more efficient to leave the devices that had been suspended before the system +suspend began in the suspended state. + +To this end, the PM core provides a mechanism allowing some coordination between +different levels of device hierarchy. Namely, if a system suspend .prepare() +callback returns a positive number for a device, that indicates to the PM core +that the device appears to be runtime-suspended and its state is fine, so it +may be left in runtime suspend provided that all of its descendants are also +left in runtime suspend. If that happens, the PM core will not execute any +system suspend and resume callbacks for all of those devices, except for the +complete callback, which is then entirely responsible for handling the device +as appropriate. This only applies to system suspend transitions that are not +related to hibernation (see Documentation/driver-api/pm/devices.rst for more +information). + +The PM core does its best to reduce the probability of race conditions between +the runtime PM and system suspend/resume (and hibernation) callbacks by carrying +out the following operations: + + * During system suspend pm_runtime_get_noresume() is called for every device + right before executing the subsystem-level .prepare() callback for it and + pm_runtime_barrier() is called for every device right before executing the + subsystem-level .suspend() callback for it. In addition to that the PM core + calls __pm_runtime_disable() with 'false' as the second argument for every + device right before executing the subsystem-level .suspend_late() callback + for it. + + * During system resume pm_runtime_enable() and pm_runtime_put() are called for + every device right after executing the subsystem-level .resume_early() + callback and right after executing the subsystem-level .complete() callback + for it, respectively. 
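+
+As an illustration of the .prepare() convention just described (a sketch
+only; the foo_prepare() name is hypothetical, and a real driver would first
+verify that its runtime-suspend and system-sleep settings actually match),
+a driver could report a runtime-suspended device to the PM core like this::
+
+	static int foo_prepare(struct device *dev)
+	{
+		/*
+		 * A positive return value tells the PM core that the
+		 * device may be left in runtime suspend, provided that
+		 * all of its descendants are left in runtime suspend too.
+		 */
+		return pm_runtime_suspended(dev);
+	}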
+ +7. Generic subsystem callbacks +============================== + +Subsystems may wish to conserve code space by using the set of generic power management callbacks provided by the PM core, defined in drivers/base/power/generic_ops.c: + + `int pm_generic_runtime_suspend(struct device *dev);` + - invoke the ->runtime_suspend() callback provided by the driver of this + device and return its result, or return 0 if not defined + + `int pm_generic_runtime_resume(struct device *dev);` + - invoke the ->runtime_resume() callback provided by the driver of this + device and return its result, or return 0 if not defined + + `int pm_generic_suspend(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->suspend() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_suspend_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_resume(struct device *dev);` + - invoke the ->resume() callback provided by the driver of this device and, + if successful, change the device's runtime PM status to 'active' + + `int pm_generic_resume_noirq(struct device *dev);` + - invoke the ->resume_noirq() callback provided by the driver of this device + + `int pm_generic_freeze(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->freeze() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_freeze_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_thaw(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->thaw() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_thaw_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_poweroff(struct device *dev);` + - if the device has not been suspended at run time, invoke the ->poweroff() + callback provided by its driver and return its result, or return 0 if not + defined + + `int pm_generic_poweroff_noirq(struct device *dev);` + - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq() + callback provided by the device's driver and return its result, or return + 0 if not defined + + `int pm_generic_restore(struct device *dev);` + - invoke the ->restore() callback provided by the driver of this device and, + if successful, change the device's runtime PM status to 'active' + + `int pm_generic_restore_noirq(struct device *dev);` + - invoke the ->restore_noirq() callback provided by the device's driver + +These functions are the defaults used by the PM core if a subsystem doesn't provide its own callbacks for ->runtime_idle(), ->runtime_suspend(), ->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(), ->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(), ->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the subsystem-level dev_pm_ops structure.
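+
+For instance, a subsystem that simply wants the default behavior could fill
+its dev_pm_ops with the generic helpers explicitly (a sketch; the foo_subsys
+name is hypothetical, and since these are the PM core's defaults anyway,
+leaving the callbacks unset has the same effect)::
+
+	static const struct dev_pm_ops foo_subsys_pm_ops = {
+		.runtime_suspend = pm_generic_runtime_suspend,
+		.runtime_resume = pm_generic_runtime_resume,
+		.suspend = pm_generic_suspend,
+		.resume = pm_generic_resume,
+		.freeze = pm_generic_freeze,
+		.thaw = pm_generic_thaw,
+		.poweroff = pm_generic_poweroff,
+		.restore = pm_generic_restore,
+	};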
+ +Device drivers that wish to use the same function as a system suspend, freeze, +poweroff and runtime suspend callback, and similarly for system resume, thaw, +restore, and runtime resume, can achieve this with the help of the +UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its +last argument to NULL). + +8. "No-Callback" Devices +======================== + +Some "devices" are only logical sub-devices of their parent and cannot be +power-managed on their own. (The prototype example is a USB interface. Entire +USB devices can go into low-power mode or send wake-up requests, but neither is +possible for individual interfaces.) The drivers for these devices have no +need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend() +and ->runtime_resume() would always return 0 without doing anything else and +->runtime_idle() would always call pm_runtime_suspend(). + +Subsystems can tell the PM core about these devices by calling +pm_runtime_no_callbacks(). This should be done after the device structure is +initialized and before it is registered (although after device registration is +also okay). The routine will set the device's power.no_callbacks flag and +prevent the non-debugging runtime PM sysfs attributes from being created. + +When power.no_callbacks is set, the PM core will not invoke the +->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks. +Instead it will assume that suspends and resumes always succeed and that idle +devices should be suspended. + +As a consequence, the PM core will never directly inform the device's subsystem +or driver about runtime power changes. Instead, the driver for the device's +parent must take responsibility for telling the device's driver when the +parent's power state changes. + +9. Autosuspend, or automatically-delayed suspends +================================================= + +Changing a device's power state isn't free; it requires both time and energy. +A device should be put in a low-power state only when there's some reason to +think it will remain in that state for a substantial time. A common heuristic +says that a device which hasn't been used for a while is liable to remain +unused; following this advice, drivers should not allow devices to be suspended +at runtime until they have been inactive for some minimum period. Even when +the heuristic ends up being non-optimal, it will still prevent devices from +"bouncing" too rapidly between low-power and full-power states. + +The term "autosuspend" is an historical remnant. It doesn't mean that the +device is automatically suspended (the subsystem or driver still has to call +the appropriate PM routines); rather it means that runtime suspends will +automatically be delayed until the desired period of inactivity has elapsed. + +Inactivity is determined based on the power.last_busy field. Drivers should +call pm_runtime_mark_last_busy() to update this field after carrying out I/O, +typically just before calling pm_runtime_put_autosuspend(). The desired length +of the inactivity period is a matter of policy. Subsystems can set this length +initially by calling pm_runtime_set_autosuspend_delay(), but after device +registration the length should be controlled by user space, using the +/sys/devices/.../power/autosuspend_delay_ms attribute. 
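+
+Putting the pieces together (a sketch only; the 5 second delay is an
+arbitrary example, and pm_runtime_use_autosuspend() is described in the
+next paragraph), a driver would typically set up autosuspend at probe time
+and mark activity when I/O completes::
+
+	/* At probe time: */
+	pm_runtime_set_autosuspend_delay(dev, 5000);	/* 5 s of inactivity */
+	pm_runtime_use_autosuspend(dev);
+
+	/* After carrying out I/O: */
+	pm_runtime_mark_last_busy(dev);
+	pm_runtime_put_autosuspend(dev);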
+ +In order to use autosuspend, subsystems or drivers must call pm_runtime_use_autosuspend() (preferably before registering the device), and thereafter they should use the various `*_autosuspend()` helper functions instead of the non-autosuspend counterparts:: + + Instead of: pm_runtime_suspend use: pm_runtime_autosuspend; + Instead of: pm_schedule_suspend use: pm_request_autosuspend; + Instead of: pm_runtime_put use: pm_runtime_put_autosuspend; + Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend. + +Drivers may also continue to use the non-autosuspend helper functions; they will behave normally, which means sometimes taking the autosuspend delay into account (see pm_runtime_idle). + +Under some circumstances a driver or subsystem may want to prevent a device from autosuspending immediately, even though the usage counter is zero and the autosuspend delay time has expired. If the ->runtime_suspend() callback returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is in the future (as it normally would be if the callback invoked pm_runtime_mark_last_busy()), the PM core will automatically reschedule the autosuspend. The ->runtime_suspend() callback can't do this rescheduling itself because no suspend requests of any kind are accepted while the device is suspending (i.e., while the callback is running). + +The implementation is well suited for asynchronous use in interrupt contexts. However, such use inevitably involves races, because the PM core can't synchronize ->runtime_suspend() callbacks with the arrival of I/O requests. This synchronization must be handled by the driver, using its private lock. Here is a schematic pseudo-code example:: + + foo_read_or_write(struct foo_priv *foo, void *data) + { + lock(&foo->private_lock); + add_request_to_io_queue(foo, data); + if (foo->num_pending_requests++ == 0) + pm_runtime_get(&foo->dev); + if (!foo->is_suspended) + foo_process_next_request(foo); + unlock(&foo->private_lock); + } + + foo_io_completion(struct foo_priv *foo, void *req) + { + lock(&foo->private_lock); + if (--foo->num_pending_requests == 0) { + pm_runtime_mark_last_busy(&foo->dev); + pm_runtime_put_autosuspend(&foo->dev); + } else { + foo_process_next_request(foo); + } + unlock(&foo->private_lock); + /* Send req result back to the user ... */ + } + + int foo_runtime_suspend(struct device *dev) + { + struct foo_priv *foo = container_of(dev, ...); + int ret = 0; + + lock(&foo->private_lock); + if (foo->num_pending_requests > 0) { + ret = -EBUSY; + } else { + /* ... suspend the device ... */ + foo->is_suspended = 1; + } + unlock(&foo->private_lock); + return ret; + } + + int foo_runtime_resume(struct device *dev) + { + struct foo_priv *foo = container_of(dev, ...); + + lock(&foo->private_lock); + /* ... resume the device ... */ + foo->is_suspended = 0; + pm_runtime_mark_last_busy(&foo->dev); + if (foo->num_pending_requests > 0) + foo_process_next_request(foo); + unlock(&foo->private_lock); + return 0; + } + +The important point is that after foo_io_completion() asks for an autosuspend, the foo_runtime_suspend() callback may race with foo_read_or_write(). Therefore foo_runtime_suspend() has to check whether there are any pending I/O requests (while holding the private lock) before allowing the suspend to proceed. + +In addition, the power.autosuspend_delay field can be changed by user space at any time.
If a driver cares about this, it can call +pm_runtime_autosuspend_expiration() from within the ->runtime_suspend() +callback while holding its private lock. If the function returns a nonzero +value then the delay has not yet expired and the callback should return +-EAGAIN. diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt deleted file mode 100644 index 937e33c46211..000000000000 --- a/Documentation/power/runtime_pm.txt +++ /dev/null @@ -1,928 +0,0 @@ -Runtime Power Management Framework for I/O Devices - -(C) 2009-2011 Rafael J. Wysocki , Novell Inc. -(C) 2010 Alan Stern -(C) 2014 Intel Corp., Rafael J. Wysocki - -1. Introduction - -Support for runtime power management (runtime PM) of I/O devices is provided -at the power management core (PM core) level by means of: - -* The power management workqueue pm_wq in which bus types and device drivers can - put their PM-related work items. It is strongly recommended that pm_wq be - used for queuing all work items related to runtime PM, because this allows - them to be synchronized with system-wide power transitions (suspend to RAM, - hibernation and resume from system sleep states). pm_wq is declared in - include/linux/pm_runtime.h and defined in kernel/power/main.c. - -* A number of runtime PM fields in the 'power' member of 'struct device' (which - is of the type 'struct dev_pm_info', defined in include/linux/pm.h) that can - be used for synchronizing runtime PM operations with one another. - -* Three device runtime PM callbacks in 'struct dev_pm_ops' (defined in - include/linux/pm.h). - -* A set of helper functions defined in drivers/base/power/runtime.c that can be - used for carrying out runtime PM operations in such a way that the - synchronization between them is taken care of by the PM core. Bus types and - device drivers are encouraged to use these functions. - -The runtime PM callbacks present in 'struct dev_pm_ops', the device runtime PM -fields of 'struct dev_pm_info' and the core helper functions provided for -runtime PM are described below. - -2. Device Runtime PM Callbacks - -There are three device runtime PM callbacks defined in 'struct dev_pm_ops': - -struct dev_pm_ops { - ... - int (*runtime_suspend)(struct device *dev); - int (*runtime_resume)(struct device *dev); - int (*runtime_idle)(struct device *dev); - ... -}; - -The ->runtime_suspend(), ->runtime_resume() and ->runtime_idle() callbacks -are executed by the PM core for the device's subsystem that may be either of -the following: - - 1. PM domain of the device, if the device's PM domain object, dev->pm_domain, - is present. - - 2. Device type of the device, if both dev->type and dev->type->pm are present. - - 3. Device class of the device, if both dev->class and dev->class->pm are - present. - - 4. Bus type of the device, if both dev->bus and dev->bus->pm are present. - -If the subsystem chosen by applying the above rules doesn't provide the relevant -callback, the PM core will invoke the corresponding driver callback stored in -dev->driver->pm directly (if present). - -The PM core always checks which callback to use in the order given above, so the -priority order of callbacks from high to low is: PM domain, device type, class -and bus type. Moreover, the high-priority one will always take precedence over -a low-priority one. The PM domain, bus type, device type and class callbacks -are referred to as subsystem-level callbacks in what follows. - -By default, the callbacks are always invoked in process context with interrupts -enabled. 
However, the pm_runtime_irq_safe() helper function can be used to tell -the PM core that it is safe to run the ->runtime_suspend(), ->runtime_resume() -and ->runtime_idle() callbacks for the given device in atomic context with -interrupts disabled. This implies that the callback routines in question must -not block or sleep, but it also means that the synchronous helper functions -listed at the end of Section 4 may be used for that device within an interrupt -handler or generally in an atomic context. - -The subsystem-level suspend callback, if present, is _entirely_ _responsible_ -for handling the suspend of the device as appropriate, which may, but need not -include executing the device driver's own ->runtime_suspend() callback (from the -PM core's point of view it is not necessary to implement a ->runtime_suspend() -callback in a device driver as long as the subsystem-level suspend callback -knows what to do to handle the device). - - * Once the subsystem-level suspend callback (or the driver suspend callback, - if invoked directly) has completed successfully for the given device, the PM - core regards the device as suspended, which need not mean that it has been - put into a low power state. It is supposed to mean, however, that the - device will not process data and will not communicate with the CPU(s) and - RAM until the appropriate resume callback is executed for it. The runtime - PM status of a device after successful execution of the suspend callback is - 'suspended'. - - * If the suspend callback returns -EBUSY or -EAGAIN, the device's runtime PM - status remains 'active', which means that the device _must_ be fully - operational afterwards. - - * If the suspend callback returns an error code different from -EBUSY and - -EAGAIN, the PM core regards this as a fatal error and will refuse to run - the helper functions described in Section 4 for the device until its status - is directly set to either 'active', or 'suspended' (the PM core provides - special helper functions for this purpose). - -In particular, if the driver requires remote wakeup capability (i.e. hardware -mechanism allowing the device to request a change of its power state, such as -PCI PME) for proper functioning and device_can_wakeup() returns 'false' for the -device, then ->runtime_suspend() should return -EBUSY. On the other hand, if -device_can_wakeup() returns 'true' for the device and the device is put into a -low-power state during the execution of the suspend callback, it is expected -that remote wakeup will be enabled for the device. Generally, remote wakeup -should be enabled for all input devices put into low-power states at run time. - -The subsystem-level resume callback, if present, is _entirely_ _responsible_ for -handling the resume of the device as appropriate, which may, but need not -include executing the device driver's own ->runtime_resume() callback (from the -PM core's point of view it is not necessary to implement a ->runtime_resume() -callback in a device driver as long as the subsystem-level resume callback knows -what to do to handle the device). - - * Once the subsystem-level resume callback (or the driver resume callback, if - invoked directly) has completed successfully, the PM core regards the device - as fully operational, which means that the device _must_ be able to complete - I/O operations as needed. The runtime PM status of the device is then - 'active'. 
- - * If the resume callback returns an error code, the PM core regards this as a - fatal error and will refuse to run the helper functions described in Section - 4 for the device, until its status is directly set to either 'active', or - 'suspended' (by means of special helper functions provided by the PM core - for this purpose). - -The idle callback (a subsystem-level one, if present, or the driver one) is -executed by the PM core whenever the device appears to be idle, which is -indicated to the PM core by two counters, the device's usage counter and the -counter of 'active' children of the device. - - * If any of these counters is decreased using a helper function provided by - the PM core and it turns out to be equal to zero, the other counter is - checked. If that counter also is equal to zero, the PM core executes the - idle callback with the device as its argument. - -The action performed by the idle callback is totally dependent on the subsystem -(or driver) in question, but the expected and recommended action is to check -if the device can be suspended (i.e. if all of the conditions necessary for -suspending the device are satisfied) and to queue up a suspend request for the -device in that case. If there is no idle callback, or if the callback returns -0, then the PM core will attempt to carry out a runtime suspend of the device, -also respecting devices configured for autosuspend. In essence this means a -call to pm_runtime_autosuspend() (do note that drivers needs to update the -device last busy mark, pm_runtime_mark_last_busy(), to control the delay under -this circumstance). To prevent this (for example, if the callback routine has -started a delayed suspend), the routine must return a non-zero value. Negative -error return codes are ignored by the PM core. - -The helper functions provided by the PM core, described in Section 4, guarantee -that the following constraints are met with respect to runtime PM callbacks for -one device: - -(1) The callbacks are mutually exclusive (e.g. it is forbidden to execute - ->runtime_suspend() in parallel with ->runtime_resume() or with another - instance of ->runtime_suspend() for the same device) with the exception that - ->runtime_suspend() or ->runtime_resume() can be executed in parallel with - ->runtime_idle() (although ->runtime_idle() will not be started while any - of the other callbacks is being executed for the same device). - -(2) ->runtime_idle() and ->runtime_suspend() can only be executed for 'active' - devices (i.e. the PM core will only execute ->runtime_idle() or - ->runtime_suspend() for the devices the runtime PM status of which is - 'active'). - -(3) ->runtime_idle() and ->runtime_suspend() can only be executed for a device - the usage counter of which is equal to zero _and_ either the counter of - 'active' children of which is equal to zero, or the 'power.ignore_children' - flag of which is set. - -(4) ->runtime_resume() can only be executed for 'suspended' devices (i.e. the - PM core will only execute ->runtime_resume() for the devices the runtime - PM status of which is 'suspended'). - -Additionally, the helper functions provided by the PM core obey the following -rules: - - * If ->runtime_suspend() is about to be executed or there's a pending request - to execute it, ->runtime_idle() will not be executed for the same device. - - * A request to execute or to schedule the execution of ->runtime_suspend() - will cancel any pending requests to execute ->runtime_idle() for the same - device. 
- - * If ->runtime_resume() is about to be executed or there's a pending request - to execute it, the other callbacks will not be executed for the same device. - - * A request to execute ->runtime_resume() will cancel any pending or - scheduled requests to execute the other callbacks for the same device, - except for scheduled autosuspends. - -3. Runtime PM Device Fields - -The following device runtime PM fields are present in 'struct dev_pm_info', as -defined in include/linux/pm.h: - - struct timer_list suspend_timer; - - timer used for scheduling (delayed) suspend and autosuspend requests - - unsigned long timer_expires; - - timer expiration time, in jiffies (if this is different from zero, the - timer is running and will expire at that time, otherwise the timer is not - running) - - struct work_struct work; - - work structure used for queuing up requests (i.e. work items in pm_wq) - - wait_queue_head_t wait_queue; - - wait queue used if any of the helper functions needs to wait for another - one to complete - - spinlock_t lock; - - lock used for synchronization - - atomic_t usage_count; - - the usage counter of the device - - atomic_t child_count; - - the count of 'active' children of the device - - unsigned int ignore_children; - - if set, the value of child_count is ignored (but still updated) - - unsigned int disable_depth; - - used for disabling the helper functions (they work normally if this is - equal to zero); the initial value of it is 1 (i.e. runtime PM is - initially disabled for all devices) - - int runtime_error; - - if set, there was a fatal error (one of the callbacks returned error code - as described in Section 2), so the helper functions will not work until - this flag is cleared; this is the error code returned by the failing - callback - - unsigned int idle_notification; - - if set, ->runtime_idle() is being executed - - unsigned int request_pending; - - if set, there's a pending request (i.e. 
a work item queued up into pm_wq) - - enum rpm_request request; - - type of request that's pending (valid if request_pending is set) - - unsigned int deferred_resume; - - set if ->runtime_resume() is about to be run while ->runtime_suspend() is - being executed for that device and it is not practical to wait for the - suspend to complete; means "start a resume as soon as you've suspended" - - enum rpm_status runtime_status; - - the runtime PM status of the device; this field's initial value is - RPM_SUSPENDED, which means that each device is initially regarded by the - PM core as 'suspended', regardless of its real hardware status - - unsigned int runtime_auto; - - if set, indicates that the user space has allowed the device driver to - power manage the device at run time via the /sys/devices/.../power/control - interface; it may only be modified with the help of the pm_runtime_allow() - and pm_runtime_forbid() helper functions - - unsigned int no_callbacks; - - indicates that the device does not use the runtime PM callbacks (see - Section 8); it may be modified only by the pm_runtime_no_callbacks() - helper function - - unsigned int irq_safe; - - indicates that the ->runtime_suspend() and ->runtime_resume() callbacks - will be invoked with the spinlock held and interrupts disabled - - unsigned int use_autosuspend; - - indicates that the device's driver supports delayed autosuspend (see - Section 9); it may be modified only by the - pm_runtime{_dont}_use_autosuspend() helper functions - - unsigned int timer_autosuspends; - - indicates that the PM core should attempt to carry out an autosuspend - when the timer expires rather than a normal suspend - - int autosuspend_delay; - - the delay time (in milliseconds) to be used for autosuspend - - unsigned long last_busy; - - the time (in jiffies) when the pm_runtime_mark_last_busy() helper - function was last called for this device; used in calculating inactivity - periods for autosuspend - -All of the above fields are members of the 'power' member of 'struct device'. - -4. 
Runtime PM Device Helper Functions - -The following runtime PM helper functions are defined in -drivers/base/power/runtime.c and include/linux/pm_runtime.h: - - void pm_runtime_init(struct device *dev); - - initialize the device runtime PM fields in 'struct dev_pm_info' - - void pm_runtime_remove(struct device *dev); - - make sure that the runtime PM of the device will be disabled after - removing the device from device hierarchy - - int pm_runtime_idle(struct device *dev); - - execute the subsystem-level idle callback for the device; returns an - error code on failure, where -EINPROGRESS means that ->runtime_idle() is - already being executed; if there is no callback or the callback returns 0 - then run pm_runtime_autosuspend(dev) and return its result - - int pm_runtime_suspend(struct device *dev); - - execute the subsystem-level suspend callback for the device; returns 0 on - success, 1 if the device's runtime PM status was already 'suspended', or - error code on failure, where -EAGAIN or -EBUSY means it is safe to attempt - to suspend the device again in future and -EACCES means that - 'power.disable_depth' is different from 0 - - int pm_runtime_autosuspend(struct device *dev); - - same as pm_runtime_suspend() except that the autosuspend delay is taken - into account; if pm_runtime_autosuspend_expiration() says the delay has - not yet expired then an autosuspend is scheduled for the appropriate time - and 0 is returned - - int pm_runtime_resume(struct device *dev); - - execute the subsystem-level resume callback for the device; returns 0 on - success, 1 if the device's runtime PM status was already 'active' or - error code on failure, where -EAGAIN means it may be safe to attempt to - resume the device again in future, but 'power.runtime_error' should be - checked additionally, and -EACCES means that 'power.disable_depth' is - different from 0 - - int pm_request_idle(struct device *dev); - - submit a request to execute the subsystem-level idle callback for the - device (the request is represented by a work item in pm_wq); returns 0 on - success or error code if the request has not been queued up - - int pm_request_autosuspend(struct device *dev); - - schedule the execution of the subsystem-level suspend callback for the - device when the autosuspend delay has expired; if the delay has already - expired then the work item is queued up immediately - - int pm_schedule_suspend(struct device *dev, unsigned int delay); - - schedule the execution of the subsystem-level suspend callback for the - device in future, where 'delay' is the time to wait before queuing up a - suspend work item in pm_wq, in milliseconds (if 'delay' is zero, the work - item is queued up immediately); returns 0 on success, 1 if the device's PM - runtime status was already 'suspended', or error code if the request - hasn't been scheduled (or queued up if 'delay' is 0); if the execution of - ->runtime_suspend() is already scheduled and not yet expired, the new - value of 'delay' will be used as the time to wait - - int pm_request_resume(struct device *dev); - - submit a request to execute the subsystem-level resume callback for the - device (the request is represented by a work item in pm_wq); returns 0 on - success, 1 if the device's runtime PM status was already 'active', or - error code if the request hasn't been queued up - - void pm_runtime_get_noresume(struct device *dev); - - increment the device's usage counter - - int pm_runtime_get(struct device *dev); - - increment the device's usage counter, run 
pm_request_resume(dev) and - return its result - - int pm_runtime_get_sync(struct device *dev); - - increment the device's usage counter, run pm_runtime_resume(dev) and - return its result - - int pm_runtime_get_if_in_use(struct device *dev); - - return -EINVAL if 'power.disable_depth' is nonzero; otherwise, if the - runtime PM status is RPM_ACTIVE and the runtime PM usage counter is - nonzero, increment the counter and return 1; otherwise return 0 without - changing the counter - - void pm_runtime_put_noidle(struct device *dev); - - decrement the device's usage counter - - int pm_runtime_put(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_request_idle(dev) and return its result - - int pm_runtime_put_autosuspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_request_autosuspend(dev) and return its result - - int pm_runtime_put_sync(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_idle(dev) and return its result - - int pm_runtime_put_sync_suspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_suspend(dev) and return its result - - int pm_runtime_put_sync_autosuspend(struct device *dev); - - decrement the device's usage counter; if the result is 0 then run - pm_runtime_autosuspend(dev) and return its result - - void pm_runtime_enable(struct device *dev); - - decrement the device's 'power.disable_depth' field; if that field is equal - to zero, the runtime PM helper functions can execute subsystem-level - callbacks described in Section 2 for the device - - int pm_runtime_disable(struct device *dev); - - increment the device's 'power.disable_depth' field (if the value of that - field was previously zero, this prevents subsystem-level runtime PM - callbacks from being run for the device), make sure that all of the - pending runtime PM operations on the device are either completed or - canceled; returns 1 if there was a resume request pending and it was - necessary to execute the subsystem-level resume callback for the device - to satisfy that request, otherwise 0 is returned - - int pm_runtime_barrier(struct device *dev); - - check if there's a resume request pending for the device and resume it - (synchronously) in that case, cancel any other pending runtime PM requests - regarding it and wait for all runtime PM operations on it in progress to - complete; returns 1 if there was a resume request pending and it was - necessary to execute the subsystem-level resume callback for the device to - satisfy that request, otherwise 0 is returned - - void pm_suspend_ignore_children(struct device *dev, bool enable); - - set/unset the power.ignore_children flag of the device - - int pm_runtime_set_active(struct device *dev); - - clear the device's 'power.runtime_error' flag, set the device's runtime - PM status to 'active' and update its parent's counter of 'active' - children as appropriate (it is only valid to use this function if - 'power.runtime_error' is set or 'power.disable_depth' is greater than - zero); it will fail and return error code if the device has a parent - which is not active and the 'power.ignore_children' flag of which is unset - - void pm_runtime_set_suspended(struct device *dev); - - clear the device's 'power.runtime_error' flag, set the device's runtime - PM status to 'suspended' and update its parent's counter of 'active' - children as appropriate (it is only valid to use 
this function if - 'power.runtime_error' is set or 'power.disable_depth' is greater than - zero) - - bool pm_runtime_active(struct device *dev); - - return true if the device's runtime PM status is 'active' or its - 'power.disable_depth' field is not equal to zero, or false otherwise - - bool pm_runtime_suspended(struct device *dev); - - return true if the device's runtime PM status is 'suspended' and its - 'power.disable_depth' field is equal to zero, or false otherwise - - bool pm_runtime_status_suspended(struct device *dev); - - return true if the device's runtime PM status is 'suspended' - - void pm_runtime_allow(struct device *dev); - - set the power.runtime_auto flag for the device and decrease its usage - counter (used by the /sys/devices/.../power/control interface to - effectively allow the device to be power managed at run time) - - void pm_runtime_forbid(struct device *dev); - - unset the power.runtime_auto flag for the device and increase its usage - counter (used by the /sys/devices/.../power/control interface to - effectively prevent the device from being power managed at run time) - - void pm_runtime_no_callbacks(struct device *dev); - - set the power.no_callbacks flag for the device and remove the runtime - PM attributes from /sys/devices/.../power (or prevent them from being - added when the device is registered) - - void pm_runtime_irq_safe(struct device *dev); - - set the power.irq_safe flag for the device, causing the runtime-PM - callbacks to be invoked with interrupts off - - bool pm_runtime_is_irq_safe(struct device *dev); - - return true if power.irq_safe flag was set for the device, causing - the runtime-PM callbacks to be invoked with interrupts off - - void pm_runtime_mark_last_busy(struct device *dev); - - set the power.last_busy field to the current time - - void pm_runtime_use_autosuspend(struct device *dev); - - set the power.use_autosuspend flag, enabling autosuspend delays; call - pm_runtime_get_sync if the flag was previously cleared and - power.autosuspend_delay is negative - - void pm_runtime_dont_use_autosuspend(struct device *dev); - - clear the power.use_autosuspend flag, disabling autosuspend delays; - decrement the device's usage counter if the flag was previously set and - power.autosuspend_delay is negative; call pm_runtime_idle - - void pm_runtime_set_autosuspend_delay(struct device *dev, int delay); - - set the power.autosuspend_delay value to 'delay' (expressed in - milliseconds); if 'delay' is negative then runtime suspends are - prevented; if power.use_autosuspend is set, pm_runtime_get_sync may be - called or the device's usage counter may be decremented and - pm_runtime_idle called depending on if power.autosuspend_delay is - changed to or from a negative value; if power.use_autosuspend is clear, - pm_runtime_idle is called - - unsigned long pm_runtime_autosuspend_expiration(struct device *dev); - - calculate the time when the current autosuspend delay period will expire, - based on power.last_busy and power.autosuspend_delay; if the delay time - is 1000 ms or larger then the expiration time is rounded up to the - nearest second; returns 0 if the delay period has already expired or - power.use_autosuspend isn't set, otherwise returns the expiration time - in jiffies - -It is safe to execute the following helper functions from interrupt context: - -pm_request_idle() -pm_request_autosuspend() -pm_schedule_suspend() -pm_request_resume() -pm_runtime_get_noresume() -pm_runtime_get() -pm_runtime_put_noidle() -pm_runtime_put() 
-pm_runtime_put_autosuspend() -pm_runtime_enable() -pm_suspend_ignore_children() -pm_runtime_set_active() -pm_runtime_set_suspended() -pm_runtime_suspended() -pm_runtime_mark_last_busy() -pm_runtime_autosuspend_expiration() - -If pm_runtime_irq_safe() has been called for a device then the following helper -functions may also be used in interrupt context: - -pm_runtime_idle() -pm_runtime_suspend() -pm_runtime_autosuspend() -pm_runtime_resume() -pm_runtime_get_sync() -pm_runtime_put_sync() -pm_runtime_put_sync_suspend() -pm_runtime_put_sync_autosuspend() - -5. Runtime PM Initialization, Device Probing and Removal - -Initially, the runtime PM is disabled for all devices, which means that the -majority of the runtime PM helper functions described in Section 4 will return --EAGAIN until pm_runtime_enable() is called for the device. - -In addition to that, the initial runtime PM status of all devices is -'suspended', but it need not reflect the actual physical state of the device. -Thus, if the device is initially active (i.e. it is able to process I/O), its -runtime PM status must be changed to 'active', with the help of -pm_runtime_set_active(), before pm_runtime_enable() is called for the device. - -However, if the device has a parent and the parent's runtime PM is enabled, -calling pm_runtime_set_active() for the device will affect the parent, unless -the parent's 'power.ignore_children' flag is set. Namely, in that case the -parent won't be able to suspend at run time, using the PM core's helper -functions, as long as the child's status is 'active', even if the child's -runtime PM is still disabled (i.e. pm_runtime_enable() hasn't been called for -the child yet or pm_runtime_disable() has been called for it). For this reason, -once pm_runtime_set_active() has been called for the device, pm_runtime_enable() -should be called for it too as soon as reasonably possible or its runtime PM -status should be changed back to 'suspended' with the help of -pm_runtime_set_suspended(). - -If the default initial runtime PM status of the device (i.e. 'suspended') -reflects the actual state of the device, its bus type's or its driver's -->probe() callback will likely need to wake it up using one of the PM core's -helper functions described in Section 4. In that case, pm_runtime_resume() -should be used. Of course, for this purpose the device's runtime PM has to be -enabled earlier by calling pm_runtime_enable(). - -Note, if the device may execute pm_runtime calls during the probe (such as -if it is registers with a subsystem that may call back in) then the -pm_runtime_get_sync() call paired with a pm_runtime_put() call will be -appropriate to ensure that the device is not put back to sleep during the -probe. This can happen with systems such as the network device layer. - -It may be desirable to suspend the device once ->probe() has finished. -Therefore the driver core uses the asynchronous pm_request_idle() to submit a -request to execute the subsystem-level idle callback for the device at that -time. A driver that makes use of the runtime autosuspend feature, may want to -update the last busy mark before returning from ->probe(). - -Moreover, the driver core prevents runtime PM callbacks from racing with the bus -notifier callback in __device_release_driver(), which is necessary, because the -notifier is used by some subsystems to carry out operations affecting the -runtime PM functionality. 
It does so by calling pm_runtime_get_sync() before -driver_sysfs_remove() and the BUS_NOTIFY_UNBIND_DRIVER notifications. This -resumes the device if it's in the suspended state and prevents it from -being suspended again while those routines are being executed. - -To allow bus types and drivers to put devices into the suspended state by -calling pm_runtime_suspend() from their ->remove() routines, the driver core -executes pm_runtime_put_sync() after running the BUS_NOTIFY_UNBIND_DRIVER -notifications in __device_release_driver(). This requires bus types and -drivers to make their ->remove() callbacks avoid races with runtime PM directly, -but also it allows of more flexibility in the handling of devices during the -removal of their drivers. - -Drivers in ->remove() callback should undo the runtime PM changes done -in ->probe(). Usually this means calling pm_runtime_disable(), -pm_runtime_dont_use_autosuspend() etc. - -The user space can effectively disallow the driver of the device to power manage -it at run time by changing the value of its /sys/devices/.../power/control -attribute to "on", which causes pm_runtime_forbid() to be called. In principle, -this mechanism may also be used by the driver to effectively turn off the -runtime power management of the device until the user space turns it on. -Namely, during the initialization the driver can make sure that the runtime PM -status of the device is 'active' and call pm_runtime_forbid(). It should be -noted, however, that if the user space has already intentionally changed the -value of /sys/devices/.../power/control to "auto" to allow the driver to power -manage the device at run time, the driver may confuse it by using -pm_runtime_forbid() this way. - -6. Runtime PM and System Sleep - -Runtime PM and system sleep (i.e., system suspend and hibernation, also known -as suspend-to-RAM and suspend-to-disk) interact with each other in a couple of -ways. If a device is active when a system sleep starts, everything is -straightforward. But what should happen if the device is already suspended? - -The device may have different wake-up settings for runtime PM and system sleep. -For example, remote wake-up may be enabled for runtime suspend but disallowed -for system sleep (device_may_wakeup(dev) returns 'false'). When this happens, -the subsystem-level system suspend callback is responsible for changing the -device's wake-up setting (it may leave that to the device driver's system -suspend routine). It may be necessary to resume the device and suspend it again -in order to do so. The same is true if the driver uses different power levels -or other settings for runtime suspend and system sleep. - -During system resume, the simplest approach is to bring all devices back to full -power, even if they had been suspended before the system suspend began. There -are several reasons for this, including: - - * The device might need to switch power levels, wake-up settings, etc. - - * Remote wake-up events might have been lost by the firmware. - - * The device's children may need the device to be at full power in order - to resume themselves. - - * The driver's idea of the device state may not agree with the device's - physical state. This can happen during resume from hibernation. - - * The device might need to be reset. - - * Even though the device was suspended, if its usage counter was > 0 then most - likely it would need a runtime resume in the near future anyway. 
- -If the device had been suspended before the system suspend began and it's -brought back to full power during resume, then its runtime PM status will have -to be updated to reflect the actual post-system sleep status. The way to do -this is: - - pm_runtime_disable(dev); - pm_runtime_set_active(dev); - pm_runtime_enable(dev); - -The PM core always increments the runtime usage counter before calling the -->suspend() callback and decrements it after calling the ->resume() callback. -Hence disabling runtime PM temporarily like this will not cause any runtime -suspend attempts to be permanently lost. If the usage count goes to zero -following the return of the ->resume() callback, the ->runtime_idle() callback -will be invoked as usual. - -On some systems, however, system sleep is not entered through a global firmware -or hardware operation. Instead, all hardware components are put into low-power -states directly by the kernel in a coordinated way. Then, the system sleep -state effectively follows from the states the hardware components end up in -and the system is woken up from that state by a hardware interrupt or a similar -mechanism entirely under the kernel's control. As a result, the kernel never -gives control away and the states of all devices during resume are precisely -known to it. If that is the case and none of the situations listed above takes -place (in particular, if the system is not waking up from hibernation), it may -be more efficient to leave the devices that had been suspended before the system -suspend began in the suspended state. - -To this end, the PM core provides a mechanism allowing some coordination between -different levels of device hierarchy. Namely, if a system suspend .prepare() -callback returns a positive number for a device, that indicates to the PM core -that the device appears to be runtime-suspended and its state is fine, so it -may be left in runtime suspend provided that all of its descendants are also -left in runtime suspend. If that happens, the PM core will not execute any -system suspend and resume callbacks for all of those devices, except for the -complete callback, which is then entirely responsible for handling the device -as appropriate. This only applies to system suspend transitions that are not -related to hibernation (see Documentation/driver-api/pm/devices.rst for more -information). - -The PM core does its best to reduce the probability of race conditions between -the runtime PM and system suspend/resume (and hibernation) callbacks by carrying -out the following operations: - - * During system suspend pm_runtime_get_noresume() is called for every device - right before executing the subsystem-level .prepare() callback for it and - pm_runtime_barrier() is called for every device right before executing the - subsystem-level .suspend() callback for it. In addition to that the PM core - calls __pm_runtime_disable() with 'false' as the second argument for every - device right before executing the subsystem-level .suspend_late() callback - for it. - - * During system resume pm_runtime_enable() and pm_runtime_put() are called for - every device right after executing the subsystem-level .resume_early() - callback and right after executing the subsystem-level .complete() callback - for it, respectively. - -7. 
Generic subsystem callbacks - -Subsystems may wish to conserve code space by using the set of generic power -management callbacks provided by the PM core, defined in -driver/base/power/generic_ops.c: - - int pm_generic_runtime_suspend(struct device *dev); - - invoke the ->runtime_suspend() callback provided by the driver of this - device and return its result, or return 0 if not defined - - int pm_generic_runtime_resume(struct device *dev); - - invoke the ->runtime_resume() callback provided by the driver of this - device and return its result, or return 0 if not defined - - int pm_generic_suspend(struct device *dev); - - if the device has not been suspended at run time, invoke the ->suspend() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_suspend_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->suspend_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_resume(struct device *dev); - - invoke the ->resume() callback provided by the driver of this device and, - if successful, change the device's runtime PM status to 'active' - - int pm_generic_resume_noirq(struct device *dev); - - invoke the ->resume_noirq() callback provided by the driver of this device - - int pm_generic_freeze(struct device *dev); - - if the device has not been suspended at run time, invoke the ->freeze() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_freeze_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->freeze_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_thaw(struct device *dev); - - if the device has not been suspended at run time, invoke the ->thaw() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_thaw_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", invoke the ->thaw_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_poweroff(struct device *dev); - - if the device has not been suspended at run time, invoke the ->poweroff() - callback provided by its driver and return its result, or return 0 if not - defined - - int pm_generic_poweroff_noirq(struct device *dev); - - if pm_runtime_suspended(dev) returns "false", run the ->poweroff_noirq() - callback provided by the device's driver and return its result, or return - 0 if not defined - - int pm_generic_restore(struct device *dev); - - invoke the ->restore() callback provided by the driver of this device and, - if successful, change the device's runtime PM status to 'active' - - int pm_generic_restore_noirq(struct device *dev); - - invoke the ->restore_noirq() callback provided by the device's driver - -These functions are the defaults used by the PM core, if a subsystem doesn't -provide its own callbacks for ->runtime_idle(), ->runtime_suspend(), -->runtime_resume(), ->suspend(), ->suspend_noirq(), ->resume(), -->resume_noirq(), ->freeze(), ->freeze_noirq(), ->thaw(), ->thaw_noirq(), -->poweroff(), ->poweroff_noirq(), ->restore(), ->restore_noirq() in the -subsystem-level dev_pm_ops structure. 
- -Device drivers that wish to use the same function as a system suspend, freeze, -poweroff and runtime suspend callback, and similarly for system resume, thaw, -restore, and runtime resume, can achieve this with the help of the -UNIVERSAL_DEV_PM_OPS macro defined in include/linux/pm.h (possibly setting its -last argument to NULL). - -8. "No-Callback" Devices - -Some "devices" are only logical sub-devices of their parent and cannot be -power-managed on their own. (The prototype example is a USB interface. Entire -USB devices can go into low-power mode or send wake-up requests, but neither is -possible for individual interfaces.) The drivers for these devices have no -need of runtime PM callbacks; if the callbacks did exist, ->runtime_suspend() -and ->runtime_resume() would always return 0 without doing anything else and -->runtime_idle() would always call pm_runtime_suspend(). - -Subsystems can tell the PM core about these devices by calling -pm_runtime_no_callbacks(). This should be done after the device structure is -initialized and before it is registered (although after device registration is -also okay). The routine will set the device's power.no_callbacks flag and -prevent the non-debugging runtime PM sysfs attributes from being created. - -When power.no_callbacks is set, the PM core will not invoke the -->runtime_idle(), ->runtime_suspend(), or ->runtime_resume() callbacks. -Instead it will assume that suspends and resumes always succeed and that idle -devices should be suspended. - -As a consequence, the PM core will never directly inform the device's subsystem -or driver about runtime power changes. Instead, the driver for the device's -parent must take responsibility for telling the device's driver when the -parent's power state changes. - -9. Autosuspend, or automatically-delayed suspends - -Changing a device's power state isn't free; it requires both time and energy. -A device should be put in a low-power state only when there's some reason to -think it will remain in that state for a substantial time. A common heuristic -says that a device which hasn't been used for a while is liable to remain -unused; following this advice, drivers should not allow devices to be suspended -at runtime until they have been inactive for some minimum period. Even when -the heuristic ends up being non-optimal, it will still prevent devices from -"bouncing" too rapidly between low-power and full-power states. - -The term "autosuspend" is an historical remnant. It doesn't mean that the -device is automatically suspended (the subsystem or driver still has to call -the appropriate PM routines); rather it means that runtime suspends will -automatically be delayed until the desired period of inactivity has elapsed. - -Inactivity is determined based on the power.last_busy field. Drivers should -call pm_runtime_mark_last_busy() to update this field after carrying out I/O, -typically just before calling pm_runtime_put_autosuspend(). The desired length -of the inactivity period is a matter of policy. Subsystems can set this length -initially by calling pm_runtime_set_autosuspend_delay(), but after device -registration the length should be controlled by user space, using the -/sys/devices/.../power/autosuspend_delay_ms attribute. 
- -In order to use autosuspend, subsystems or drivers must call -pm_runtime_use_autosuspend() (preferably before registering the device), and -thereafter they should use the various *_autosuspend() helper functions instead -of the non-autosuspend counterparts: - - Instead of: pm_runtime_suspend use: pm_runtime_autosuspend; - Instead of: pm_schedule_suspend use: pm_request_autosuspend; - Instead of: pm_runtime_put use: pm_runtime_put_autosuspend; - Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend. - -Drivers may also continue to use the non-autosuspend helper functions; they -will behave normally, which means sometimes taking the autosuspend delay into -account (see pm_runtime_idle). - -Under some circumstances a driver or subsystem may want to prevent a device -from autosuspending immediately, even though the usage counter is zero and the -autosuspend delay time has expired. If the ->runtime_suspend() callback -returns -EAGAIN or -EBUSY, and if the next autosuspend delay expiration time is -in the future (as it normally would be if the callback invoked -pm_runtime_mark_last_busy()), the PM core will automatically reschedule the -autosuspend. The ->runtime_suspend() callback can't do this rescheduling -itself because no suspend requests of any kind are accepted while the device is -suspending (i.e., while the callback is running). - -The implementation is well suited for asynchronous use in interrupt contexts. -However such use inevitably involves races, because the PM core can't -synchronize ->runtime_suspend() callbacks with the arrival of I/O requests. -This synchronization must be handled by the driver, using its private lock. -Here is a schematic pseudo-code example: - - foo_read_or_write(struct foo_priv *foo, void *data) - { - lock(&foo->private_lock); - add_request_to_io_queue(foo, data); - if (foo->num_pending_requests++ == 0) - pm_runtime_get(&foo->dev); - if (!foo->is_suspended) - foo_process_next_request(foo); - unlock(&foo->private_lock); - } - - foo_io_completion(struct foo_priv *foo, void *req) - { - lock(&foo->private_lock); - if (--foo->num_pending_requests == 0) { - pm_runtime_mark_last_busy(&foo->dev); - pm_runtime_put_autosuspend(&foo->dev); - } else { - foo_process_next_request(foo); - } - unlock(&foo->private_lock); - /* Send req result back to the user ... */ - } - - int foo_runtime_suspend(struct device *dev) - { - struct foo_priv foo = container_of(dev, ...); - int ret = 0; - - lock(&foo->private_lock); - if (foo->num_pending_requests > 0) { - ret = -EBUSY; - } else { - /* ... suspend the device ... */ - foo->is_suspended = 1; - } - unlock(&foo->private_lock); - return ret; - } - - int foo_runtime_resume(struct device *dev) - { - struct foo_priv foo = container_of(dev, ...); - - lock(&foo->private_lock); - /* ... resume the device ... */ - foo->is_suspended = 0; - pm_runtime_mark_last_busy(&foo->dev); - if (foo->num_pending_requests > 0) - foo_process_next_request(foo); - unlock(&foo->private_lock); - return 0; - } - -The important point is that after foo_io_completion() asks for an autosuspend, -the foo_runtime_suspend() callback may race with foo_read_or_write(). -Therefore foo_runtime_suspend() has to check whether there are any pending I/O -requests (while holding the private lock) before allowing the suspend to -proceed. - -In addition, the power.autosuspend_delay field can be changed by user space at -any time. 
If a driver cares about this, it can call
-pm_runtime_autosuspend_expiration() from within the ->runtime_suspend()
-callback while holding its private lock. If the function returns a nonzero
-value then the delay has not yet expired and the callback should return
--EAGAIN.
diff --git a/Documentation/power/s2ram.rst b/Documentation/power/s2ram.rst
new file mode 100644
index 000000000000..d739aa7c742c
--- /dev/null
+++ b/Documentation/power/s2ram.rst
@@ -0,0 +1,87 @@
+========================
+How to get s2ram working
+========================
+
+2006 Linus Torvalds
+2006 Pavel Machek
+
+1) Check suspend.sf.net; the s2ram program there has a long whitelist of
+   "known ok" machines, along with tricks to use on each one.
+
+2) If that does not help, try reading tricks.txt and
+   video.txt. Perhaps the problem is as simple as a broken module, and
+   a simple module unload can fix it.
+
+3) You can use Linus' TRACE_RESUME infrastructure, described below.
+
+Using TRACE_RESUME
+~~~~~~~~~~~~~~~~~~
+
+I've been working on making the machines I have able to STR, and almost
+always it's a driver that is buggy. Thank God for the suspend/resume
+debugging - the thing that Chuck tried to disable. That's often the _only_
+way to debug these things, and it's actually pretty powerful (but
+time-consuming - having to insert TRACE_RESUME() markers into the device
+driver that doesn't resume and recompile and reboot).
+
+Anyway, the way to debug this for people who are interested (have a
+machine that doesn't boot) is:
+
+ - enable PM_DEBUG, and PM_TRACE
+
+ - use a script like this::
+
+	#!/bin/sh
+	sync
+	echo 1 > /sys/power/pm_trace
+	echo mem > /sys/power/state
+
+   to suspend
+
+ - if it doesn't come back up (which is usually the problem), reboot by
+   holding the power button down, and look at the dmesg output for things
+   like::
+
+	Magic number: 4:156:725
+	hash matches drivers/base/power/resume.c:28
+	hash matches device 0000:01:00.0
+
+   which means that the last trace event was just before trying to resume
+   device 0000:01:00.0. Then figure out what driver is controlling that
+   device (lspci and /sys/devices/pci* are your friends), and see if you can
+   fix it, disable it, or trace into its resume function.
+
+   If no device matches the hash (or any matches appear to be false positives),
+   the culprit may be a device from a loadable kernel module that is not loaded
+   until after the hash is checked. You can check the hash against the current
+   devices again after more modules are loaded using sysfs::
+
+	cat /sys/power/pm_trace_dev_match
+
+For example, the above happens to be the VGA device on my EVO, which I
+used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out
+that "radeonfb" simply cannot resume that device - it tries to set the
+PLL's, and it just _hangs_. Using the regular VGA console and letting X
+resume it instead works fine.
+
+NOTE
+====
+pm_trace uses the system's Real Time Clock (RTC) to save the magic number.
+The reason for this is that the RTC is the only reliably available piece of
+hardware during resume operations where a value can be set that will
+survive a reboot.
+
+pm_trace is not compatible with asynchronous suspend, so it turns
+asynchronous suspend off (which may work around timing or
+ordering-sensitive bugs).
+
+The consequence is that after a resume (even if it is successful) your system
+clock will have a value corresponding to the magic number instead of the
+correct date/time!
It is therefore advisable to use a program like ntp-date +or rdate to reset the correct date/time from an external time source when +using this trace option. + +As the clock keeps ticking it is also essential that the reboot is done +quickly after the resume failure. The trace option does not use the seconds +or the low order bits of the minutes of the RTC, but a too long delay will +corrupt the magic value. diff --git a/Documentation/power/s2ram.txt b/Documentation/power/s2ram.txt deleted file mode 100644 index 4685aee197fd..000000000000 --- a/Documentation/power/s2ram.txt +++ /dev/null @@ -1,85 +0,0 @@ - How to get s2ram working - ~~~~~~~~~~~~~~~~~~~~~~~~ - 2006 Linus Torvalds - 2006 Pavel Machek - -1) Check suspend.sf.net, program s2ram there has long whitelist of - "known ok" machines, along with tricks to use on each one. - -2) If that does not help, try reading tricks.txt and - video.txt. Perhaps problem is as simple as broken module, and - simple module unload can fix it. - -3) You can use Linus' TRACE_RESUME infrastructure, described below. - - Using TRACE_RESUME - ~~~~~~~~~~~~~~~~~~ - -I've been working at making the machines I have able to STR, and almost -always it's a driver that is buggy. Thank God for the suspend/resume -debugging - the thing that Chuck tried to disable. That's often the _only_ -way to debug these things, and it's actually pretty powerful (but -time-consuming - having to insert TRACE_RESUME() markers into the device -driver that doesn't resume and recompile and reboot). - -Anyway, the way to debug this for people who are interested (have a -machine that doesn't boot) is: - - - enable PM_DEBUG, and PM_TRACE - - - use a script like this: - - #!/bin/sh - sync - echo 1 > /sys/power/pm_trace - echo mem > /sys/power/state - - to suspend - - - if it doesn't come back up (which is usually the problem), reboot by - holding the power button down, and look at the dmesg output for things - like - - Magic number: 4:156:725 - hash matches drivers/base/power/resume.c:28 - hash matches device 0000:01:00.0 - - which means that the last trace event was just before trying to resume - device 0000:01:00.0. Then figure out what driver is controlling that - device (lspci and /sys/devices/pci* is your friend), and see if you can - fix it, disable it, or trace into its resume function. - - If no device matches the hash (or any matches appear to be false positives), - the culprit may be a device from a loadable kernel module that is not loaded - until after the hash is checked. You can check the hash against the current - devices again after more modules are loaded using sysfs: - - cat /sys/power/pm_trace_dev_match - -For example, the above happens to be the VGA device on my EVO, which I -used to run with "radeonfb" (it's an ATI Radeon mobility). It turns out -that "radeonfb" simply cannot resume that device - it tries to set the -PLL's, and it just _hangs_. Using the regular VGA console and letting X -resume it instead works fine. - -NOTE -==== -pm_trace uses the system's Real Time Clock (RTC) to save the magic number. -Reason for this is that the RTC is the only reliably available piece of -hardware during resume operations where a value can be set that will -survive a reboot. - -pm_trace is not compatible with asynchronous suspend, so it turns -asynchronous suspend off (which may work around timing or -ordering-sensitive bugs). 
- -Consequence is that after a resume (even if it is successful) your system -clock will have a value corresponding to the magic number instead of the -correct date/time! It is therefore advisable to use a program like ntp-date -or rdate to reset the correct date/time from an external time source when -using this trace option. - -As the clock keeps ticking it is also essential that the reboot is done -quickly after the resume failure. The trace option does not use the seconds -or the low order bits of the minutes of the RTC, but a too long delay will -corrupt the magic value. diff --git a/Documentation/power/suspend-and-cpuhotplug.rst b/Documentation/power/suspend-and-cpuhotplug.rst new file mode 100644 index 000000000000..7ac8e1f549f4 --- /dev/null +++ b/Documentation/power/suspend-and-cpuhotplug.rst @@ -0,0 +1,286 @@ +==================================================================== +Interaction of Suspend code (S3) with the CPU hotplug infrastructure +==================================================================== + +(C) 2011 - 2014 Srivatsa S. Bhat + + +I. Differences between CPU hotplug and Suspend-to-RAM +====================================================== + +How does the regular CPU hotplug code differ from how the Suspend-to-RAM +infrastructure uses it internally? And where do they share common code? + +Well, a picture is worth a thousand words... So ASCII art follows :-) + +[This depicts the current design in the kernel, and focusses only on the +interactions involving the freezer and CPU hotplug and also tries to explain +the locking involved. It outlines the notifications involved as well. +But please note that here, only the call paths are illustrated, with the aim +of describing where they take different paths and where they share code. +What happens when regular CPU hotplug and Suspend-to-RAM race with each other +is not depicted here.] + +On a high level, the suspend-resume cycle goes like this:: + + |Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw | + |tasks | | cpus | | | | cpus | |tasks| + + +More details follow:: + + Suspend call path + ----------------- + + Write 'mem' to + /sys/power/state + sysfs file + | + v + Acquire system_transition_mutex lock + | + v + Send PM_SUSPEND_PREPARE + notifications + | + v + Freeze tasks + | + | + v + disable_nonboot_cpus() + /* start */ + | + v + Acquire cpu_add_remove_lock + | + v + Iterate over CURRENTLY + online CPUs + | + | + | ---------- + v | L + ======> _cpu_down() | + | [This takes cpuhotplug.lock | + Common | before taking down the CPU | + code | and releases it when done] | O + | While it is at it, notifications | + | are sent when notable events occur, | + ======> by running all registered callbacks. | + | | O + | | + | | + v | + Note down these cpus in | P + frozen_cpus mask ---------- + | + v + Disable regular cpu hotplug + by increasing cpu_hotplug_disabled + | + v + Release cpu_add_remove_lock + | + v + /* disable_nonboot_cpus() complete */ + | + v + Do suspend + + + +Resuming back is likewise, with the counterparts being (in the order of +execution during resume): + +* enable_nonboot_cpus() which involves:: + + | Acquire cpu_add_remove_lock + | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug + | Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop] + | Release cpu_add_remove_lock + v + +* thaw tasks +* send PM_POST_SUSPEND notifications +* Release system_transition_mutex lock. 
+
+
+It is to be noted here that the system_transition_mutex lock is acquired at the very
+beginning, when we are just starting out to suspend, and then released only
+after the entire cycle is complete (i.e., suspend + resume).
+
+::
+
+
+
+          Regular CPU hotplug call path
+          -----------------------------
+
+                                Write 0 (or 1) to
+                       /sys/devices/system/cpu/cpu*/online
+                                   sysfs file
+                                        |
+                                        |
+                                        v
+                                    cpu_down()
+                                        |
+                                        v
+                           Acquire cpu_add_remove_lock
+                                        |
+                                        v
+                          If cpu_hotplug_disabled > 0
+                                return gracefully
+                                        |
+                                        |
+                                        v
+                             ======>   _cpu_down()
+                            |          [This takes cpuhotplug.lock
+                 Common     |           before taking down the CPU
+                  code      |           and releases it when done]
+                            |          While it is at it, notifications
+                            |          are sent when notable events occur,
+                             ======>   by running all registered callbacks.
+                                        |
+                                        |
+                                        v
+                           Release cpu_add_remove_lock
+                                [That's it!, for
+                               regular CPU hotplug]
+
+
+
+So, as can be seen from the two diagrams (the parts marked as "Common code"),
+regular CPU hotplug and the suspend code path converge at the _cpu_down() and
+_cpu_up() functions. They differ in the arguments passed to these functions,
+in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen'
+argument. But during suspend, since the tasks are already frozen by the time
+the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called
+with the 'tasks_frozen' argument set to 1.
+[See below for some known issues regarding this.]
+
+
+Important files and functions/entry points:
+-------------------------------------------
+
+- kernel/power/process.c : freeze_processes(), thaw_processes()
+- kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish()
+- kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus()
+
+
+
+II. What are the issues involved in CPU hotplug?
+------------------------------------------------
+
+There are some interesting situations involving CPU hotplug and microcode
+update on the CPUs, as discussed below:
+
+[Please bear in mind that the kernel requests the microcode images from
+userspace, using the request_firmware() function defined in
+drivers/base/firmware_loader/main.c]
+
+
+a. When all the CPUs are identical:
+
+   This is the most common situation and it is quite straightforward: we want
+   to apply the same microcode revision to each of the CPUs.
+   To give an example of x86, the collect_cpu_info() function defined in
+   arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU
+   and thereby in applying the correct microcode revision to it.
+   But note that the kernel does not maintain a common microcode image for
+   all CPUs, in order to handle case 'b' described below.
+
+
+b. When some of the CPUs are different from the rest:
+
+   In this case since we probably need to apply different microcode revisions
+   to different CPUs, the kernel maintains a copy of the correct microcode
+   image for each CPU (after appropriate CPU type/model discovery using
+   functions such as collect_cpu_info()).
+
+
+c. When a CPU is physically hot-unplugged and a new (and possibly different
+   type of) CPU is hot-plugged into the system:
+
+   In the current design of the kernel, whenever a CPU is taken offline during
+   a regular CPU hotplug operation, upon receiving the CPU_DEAD notification
+   (which is sent by the CPU hotplug code), the microcode update driver's
+   callback for that event reacts by freeing the kernel's copy of the
+   microcode image for that CPU.
+
+   Hence, when a new CPU is brought online, since the kernel finds that it
+   doesn't have the microcode image, it does the CPU type/model discovery
+   afresh and then requests userspace for the appropriate microcode image
+   for that CPU, which is subsequently applied.
+
+   For example, in x86, the mc_cpu_callback() function (which is the microcode
+   update driver's callback registered for CPU hotplug events) calls
+   microcode_update_cpu() which would call microcode_init_cpu() in this case,
+   instead of microcode_resume_cpu() when it finds that the kernel doesn't
+   have a valid microcode image. This ensures that the CPU type/model
+   discovery is performed and the right microcode is applied to the CPU after
+   getting it from userspace.
+
+
+d. Handling microcode update during suspend/hibernate:
+
+   Strictly speaking, during a CPU hotplug operation which does not involve
+   physically removing or inserting CPUs, the CPUs are not actually powered
+   off during a CPU offline. They are just put to the lowest C-states possible.
+   Hence, in such a case, it is not really necessary to re-apply microcode
+   when the CPUs are brought back online, since they wouldn't have lost the
+   image during the CPU offline operation.
+
+   This is the usual scenario encountered during a resume after a suspend.
+   However, in the case of hibernation, since all the CPUs are completely
+   powered off, during restore it becomes necessary to apply the microcode
+   images to all the CPUs.
+
+   [Note that we don't expect someone to physically pull out nodes and insert
+   nodes with a different type of CPUs in-between a suspend-resume or a
+   hibernate/restore cycle.]
+
+   In the current design of the kernel however, during a CPU offline operation
+   as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set),
+   the existing copy of the microcode image in the kernel is not freed up.
+   And during the CPU online operations (during resume/restore), since the
+   kernel finds that it already has copies of the microcode images for all the
+   CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU
+   type/model and the need for validating whether the microcode revisions are
+   right for the CPUs or not (due to the above assumption that physical CPU
+   hotplug will not be done in-between suspend/resume or hibernate/restore
+   cycles).
+
+
+III. Known problems
+===================
+
+Are there any known problems when regular CPU hotplug and suspend race
+with each other?
+
+Yes, they are listed below:
+
+1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to
+   the _cpu_down() and _cpu_up() functions is *always* 0.
+   This might not reflect the true current state of the system, since the
+   tasks could have been frozen by an out-of-band event such as a suspend
+   operation in progress. Hence, the cpuhp_tasks_frozen variable will not
+   reflect the frozen state and the CPU hotplug callbacks which evaluate
+   that variable might execute the wrong code path.
+
+2. If a regular CPU hotplug stress test happens to race with the freezer due
+   to a suspend operation in progress at the same time, then we could hit the
+   situation described below:
+
+    * A regular cpu online operation continues its journey from userspace
+      into the kernel, since the freezing has not yet begun.
+    * Then the freezer gets to work and freezes userspace.
+ * If cpu online has not yet completed the microcode update stuff by now, + it will now start waiting on the frozen userspace in the + TASK_UNINTERRUPTIBLE state, in order to get the microcode image. + * Now the freezer continues and tries to freeze the remaining tasks. But + due to this wait mentioned above, the freezer won't be able to freeze + the cpu online hotplug task and hence freezing of tasks fails. + + As a result of this task freezing failure, the suspend operation gets + aborted. diff --git a/Documentation/power/suspend-and-cpuhotplug.txt b/Documentation/power/suspend-and-cpuhotplug.txt deleted file mode 100644 index a8751b8df10e..000000000000 --- a/Documentation/power/suspend-and-cpuhotplug.txt +++ /dev/null @@ -1,274 +0,0 @@ -Interaction of Suspend code (S3) with the CPU hotplug infrastructure - - (C) 2011 - 2014 Srivatsa S. Bhat - - -I. How does the regular CPU hotplug code differ from how the Suspend-to-RAM - infrastructure uses it internally? And where do they share common code? - -Well, a picture is worth a thousand words... So ASCII art follows :-) - -[This depicts the current design in the kernel, and focusses only on the -interactions involving the freezer and CPU hotplug and also tries to explain -the locking involved. It outlines the notifications involved as well. -But please note that here, only the call paths are illustrated, with the aim -of describing where they take different paths and where they share code. -What happens when regular CPU hotplug and Suspend-to-RAM race with each other -is not depicted here.] - -On a high level, the suspend-resume cycle goes like this: - -|Freeze| -> |Disable nonboot| -> |Do suspend| -> |Enable nonboot| -> |Thaw | -|tasks | | cpus | | | | cpus | |tasks| - - -More details follow: - - Suspend call path - ----------------- - - Write 'mem' to - /sys/power/state - sysfs file - | - v - Acquire system_transition_mutex lock - | - v - Send PM_SUSPEND_PREPARE - notifications - | - v - Freeze tasks - | - | - v - disable_nonboot_cpus() - /* start */ - | - v - Acquire cpu_add_remove_lock - | - v - Iterate over CURRENTLY - online CPUs - | - | - | ---------- - v | L - ======> _cpu_down() | - | [This takes cpuhotplug.lock | - Common | before taking down the CPU | - code | and releases it when done] | O - | While it is at it, notifications | - | are sent when notable events occur, | - ======> by running all registered callbacks. | - | | O - | | - | | - v | - Note down these cpus in | P - frozen_cpus mask ---------- - | - v - Disable regular cpu hotplug - by increasing cpu_hotplug_disabled - | - v - Release cpu_add_remove_lock - | - v - /* disable_nonboot_cpus() complete */ - | - v - Do suspend - - - -Resuming back is likewise, with the counterparts being (in the order of -execution during resume): -* enable_nonboot_cpus() which involves: - | Acquire cpu_add_remove_lock - | Decrease cpu_hotplug_disabled, thereby enabling regular cpu hotplug - | Call _cpu_up() [for all those cpus in the frozen_cpus mask, in a loop] - | Release cpu_add_remove_lock - v - -* thaw tasks -* send PM_POST_SUSPEND notifications -* Release system_transition_mutex lock. - - -It is to be noted here that the system_transition_mutex lock is acquired at the very -beginning, when we are just starting out to suspend, and then released only -after the entire cycle is complete (i.e., suspend + resume). 
- - - - Regular CPU hotplug call path - ----------------------------- - - Write 0 (or 1) to - /sys/devices/system/cpu/cpu*/online - sysfs file - | - | - v - cpu_down() - | - v - Acquire cpu_add_remove_lock - | - v - If cpu_hotplug_disabled > 0 - return gracefully - | - | - v - ======> _cpu_down() - | [This takes cpuhotplug.lock - Common | before taking down the CPU - code | and releases it when done] - | While it is at it, notifications - | are sent when notable events occur, - ======> by running all registered callbacks. - | - | - v - Release cpu_add_remove_lock - [That's it!, for - regular CPU hotplug] - - - -So, as can be seen from the two diagrams (the parts marked as "Common code"), -regular CPU hotplug and the suspend code path converge at the _cpu_down() and -_cpu_up() functions. They differ in the arguments passed to these functions, -in that during regular CPU hotplug, 0 is passed for the 'tasks_frozen' -argument. But during suspend, since the tasks are already frozen by the time -the non-boot CPUs are offlined or onlined, the _cpu_*() functions are called -with the 'tasks_frozen' argument set to 1. -[See below for some known issues regarding this.] - - -Important files and functions/entry points: ------------------------------------------- - -kernel/power/process.c : freeze_processes(), thaw_processes() -kernel/power/suspend.c : suspend_prepare(), suspend_enter(), suspend_finish() -kernel/cpu.c: cpu_[up|down](), _cpu_[up|down](), [disable|enable]_nonboot_cpus() - - - -II. What are the issues involved in CPU hotplug? - ------------------------------------------- - -There are some interesting situations involving CPU hotplug and microcode -update on the CPUs, as discussed below: - -[Please bear in mind that the kernel requests the microcode images from -userspace, using the request_firmware() function defined in -drivers/base/firmware_loader/main.c] - - -a. When all the CPUs are identical: - - This is the most common situation and it is quite straightforward: we want - to apply the same microcode revision to each of the CPUs. - To give an example of x86, the collect_cpu_info() function defined in - arch/x86/kernel/microcode_core.c helps in discovering the type of the CPU - and thereby in applying the correct microcode revision to it. - But note that the kernel does not maintain a common microcode image for the - all CPUs, in order to handle case 'b' described below. - - -b. When some of the CPUs are different than the rest: - - In this case since we probably need to apply different microcode revisions - to different CPUs, the kernel maintains a copy of the correct microcode - image for each CPU (after appropriate CPU type/model discovery using - functions such as collect_cpu_info()). - - -c. When a CPU is physically hot-unplugged and a new (and possibly different - type of) CPU is hot-plugged into the system: - - In the current design of the kernel, whenever a CPU is taken offline during - a regular CPU hotplug operation, upon receiving the CPU_DEAD notification - (which is sent by the CPU hotplug code), the microcode update driver's - callback for that event reacts by freeing the kernel's copy of the - microcode image for that CPU. - - Hence, when a new CPU is brought online, since the kernel finds that it - doesn't have the microcode image, it does the CPU type/model discovery - afresh and then requests the userspace for the appropriate microcode image - for that CPU, which is subsequently applied. 
- - For example, in x86, the mc_cpu_callback() function (which is the microcode - update driver's callback registered for CPU hotplug events) calls - microcode_update_cpu() which would call microcode_init_cpu() in this case, - instead of microcode_resume_cpu() when it finds that the kernel doesn't - have a valid microcode image. This ensures that the CPU type/model - discovery is performed and the right microcode is applied to the CPU after - getting it from userspace. - - -d. Handling microcode update during suspend/hibernate: - - Strictly speaking, during a CPU hotplug operation which does not involve - physically removing or inserting CPUs, the CPUs are not actually powered - off during a CPU offline. They are just put to the lowest C-states possible. - Hence, in such a case, it is not really necessary to re-apply microcode - when the CPUs are brought back online, since they wouldn't have lost the - image during the CPU offline operation. - - This is the usual scenario encountered during a resume after a suspend. - However, in the case of hibernation, since all the CPUs are completely - powered off, during restore it becomes necessary to apply the microcode - images to all the CPUs. - - [Note that we don't expect someone to physically pull out nodes and insert - nodes with a different type of CPUs in-between a suspend-resume or a - hibernate/restore cycle.] - - In the current design of the kernel however, during a CPU offline operation - as part of the suspend/hibernate cycle (cpuhp_tasks_frozen is set), - the existing copy of microcode image in the kernel is not freed up. - And during the CPU online operations (during resume/restore), since the - kernel finds that it already has copies of the microcode images for all the - CPUs, it just applies them to the CPUs, avoiding any re-discovery of CPU - type/model and the need for validating whether the microcode revisions are - right for the CPUs or not (due to the above assumption that physical CPU - hotplug will not be done in-between suspend/resume or hibernate/restore - cycles). - - -III. Are there any known problems when regular CPU hotplug and suspend race - with each other? - -Yes, they are listed below: - -1. When invoking regular CPU hotplug, the 'tasks_frozen' argument passed to - the _cpu_down() and _cpu_up() functions is *always* 0. - This might not reflect the true current state of the system, since the - tasks could have been frozen by an out-of-band event such as a suspend - operation in progress. Hence, the cpuhp_tasks_frozen variable will not - reflect the frozen state and the CPU hotplug callbacks which evaluate - that variable might execute the wrong code path. - -2. If a regular CPU hotplug stress test happens to race with the freezer due - to a suspend operation in progress at the same time, then we could hit the - situation described below: - - * A regular cpu online operation continues its journey from userspace - into the kernel, since the freezing has not yet begun. - * Then freezer gets to work and freezes userspace. - * If cpu online has not yet completed the microcode update stuff by now, - it will now start waiting on the frozen userspace in the - TASK_UNINTERRUPTIBLE state, in order to get the microcode image. - * Now the freezer continues and tries to freeze the remaining tasks. But - due to this wait mentioned above, the freezer won't be able to freeze - the cpu online hotplug task and hence freezing of tasks fails. - - As a result of this task freezing failure, the suspend operation gets - aborted. 
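The regular CPU hotplug call path described in the document above is driven
from user space entirely through sysfs. A minimal C sketch of that trigger
(illustrative only; CPU 1 is an arbitrary choice, and CPU 0 is often not
removable):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Write "0" (offline) or "1" (online) to a CPU's sysfs control
	 * file, taking it through cpu_down()/cpu_up(); requires root. */
	static int set_cpu_online(int cpu, int online)
	{
		char path[64];
		int fd;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/online", cpu);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		if (write(fd, online ? "1" : "0", 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		if (set_cpu_online(1, 0) || set_cpu_online(1, 1)) {
			perror("cpu1 hotplug");
			return 1;
		}
		return 0;
	}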
diff --git a/Documentation/power/suspend-and-interrupts.rst b/Documentation/power/suspend-and-interrupts.rst
new file mode 100644
index 000000000000..4cda6617709a
--- /dev/null
+++ b/Documentation/power/suspend-and-interrupts.rst
@@ -0,0 +1,137 @@
+====================================
+System Suspend and Device Interrupts
+====================================
+
+Copyright (C) 2014 Intel Corp.
+Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+
+Suspending and Resuming Device IRQs
+-----------------------------------
+
+Device interrupt request lines (IRQs) are generally disabled during system
+suspend after the "late" phase of suspending devices (that is, after all of the
+->prepare, ->suspend and ->suspend_late callbacks have been executed for all
+devices). That is done by suspend_device_irqs().
+
+The rationale for doing so is that after the "late" phase of device suspend
+there is no legitimate reason why any interrupts from suspended devices should
+trigger, and if any devices have not been suspended properly yet, it is better
+to block interrupts from them anyway. Also, in the past we had problems with
+interrupt handlers for shared IRQs whose device drivers were not prepared for
+interrupts triggering after their devices had been suspended. In some cases
+they would attempt to access, for example, memory address spaces of suspended
+devices and cause unpredictable behavior to ensue as a result. Unfortunately,
+such problems are very difficult to debug and the introduction of
+suspend_device_irqs(), along with the "noirq" phase of device suspend and
+resume, was the only practical way to mitigate them.
+
+Device IRQs are re-enabled during system resume, right before the "early" phase
+of resuming devices (that is, before starting to execute ->resume_early
+callbacks for devices). The function doing that is resume_device_irqs().
+
+
+The IRQF_NO_SUSPEND Flag
+------------------------
+
+There are interrupts that can legitimately trigger during the entire system
+suspend-resume cycle, including the "noirq" phases of suspending and resuming
+devices as well as during the time when nonboot CPUs are taken offline and
+brought back online. That applies to timer interrupts in the first place,
+but also to IPIs and to some other special-purpose interrupts.
+
+The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when
+requesting a special-purpose interrupt. It causes suspend_device_irqs() to
+leave the corresponding IRQ enabled so as to allow the interrupt to work as
+expected during the suspend-resume cycle, but does not guarantee that the
+interrupt will wake the system from a suspended state -- for such cases it is
+necessary to use enable_irq_wake().
+
+Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one
+user of it. Thus, if the IRQ is shared, all of the interrupt handlers installed
+for it will be executed as usual after suspend_device_irqs(), even if the
+IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of
+the IRQ's users. For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the
+same time should be avoided.
+
+
+System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake()
+------------------------------------------------------------------
+
+System wakeup interrupts generally need to be configured to wake up the system
+from sleep states, especially if they are used for different purposes (e.g. as
+I/O interrupts) in the working state.
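+
+As a hedged sketch (a hypothetical foo_* driver, not an excerpt from the
+kernel sources), the two mechanisms covered in this document look roughly
+as follows -- note that they are applied to two *different* IRQs::
+
+	#include <linux/device.h>
+	#include <linux/interrupt.h>
+	#include <linux/pm_wakeup.h>
+
+	struct foo_priv {
+		int wake_irq;		/* line used to wake up the system */
+	};
+
+	/* Special-purpose interrupt that must stay enabled across
+	 * suspend_device_irqs(); do not combine with IRQF_SHARED. */
+	static irqreturn_t foo_special_handler(int irq, void *data)
+	{
+		return IRQ_HANDLED;
+	}
+
+	static int foo_request_special(int irq, void *data)
+	{
+		return request_irq(irq, foo_special_handler,
+				   IRQF_NO_SUSPEND, "foo-special", data);
+	}
+
+	/* System suspend callbacks arming a separate IRQ as a wakeup
+	 * source with the helpers described in this section. */
+	static int foo_suspend(struct device *dev)
+	{
+		struct foo_priv *foo = dev_get_drvdata(dev);
+
+		if (device_may_wakeup(dev))
+			enable_irq_wake(foo->wake_irq);
+		return 0;
+	}
+
+	static int foo_resume(struct device *dev)
+	{
+		struct foo_priv *foo = dev_get_drvdata(dev);
+
+		if (device_may_wakeup(dev))
+			disable_irq_wake(foo->wake_irq);
+		return 0;
+	}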
+
+Configuring an interrupt for system wakeup may involve turning on special
+signal handling logic within the platform (such as an SoC) so that signals
+from a given line are routed in a different way during system sleep so as to
+trigger a system wakeup when needed. For example, the platform may include a
+dedicated interrupt controller used specifically for handling system wakeup
+events. Then, if a given interrupt line is supposed to wake up the system
+from sleep states, the corresponding input of that interrupt controller needs
+to be enabled to receive signals from the line in question. After wakeup, it
+generally is better to disable that input to prevent the dedicated controller
+from triggering interrupts unnecessarily.
+
+The IRQ subsystem provides two helper functions to be used by device drivers for
+those purposes. Namely, enable_irq_wake() turns on the platform's logic for
+handling the given IRQ as a system wakeup interrupt line and disable_irq_wake()
+turns that logic off.
+
+Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ
+in a special way. Namely, the IRQ remains enabled, but on the first interrupt
+it will be disabled, marked as pending and "suspended" so that it will be
+re-enabled by resume_device_irqs() during the subsequent system resume. Also
+the PM core is notified about the event which causes the system suspend in
+progress to be aborted (that doesn't have to happen immediately, but at one
+of the points where the suspend thread looks for pending wakeup events).
+
+This way every interrupt from a wakeup interrupt source will either cause the
+system suspend currently in progress to be aborted or wake up the system if
+already suspended. However, after suspend_device_irqs() interrupt handlers are
+not executed for system wakeup IRQs. They are only executed for IRQF_NO_SUSPEND
+IRQs at that time, but those IRQs should not be configured for system wakeup
+using enable_irq_wake().
+
+
+Interrupts and Suspend-to-Idle
+------------------------------
+
+Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new
+system sleep state that works by idling all of the processors and waiting for
+interrupts right after the "noirq" phase of suspending devices.
+
+Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag
+set will bring CPUs out of idle while in that state, but they will not cause the
+IRQ subsystem to trigger a system wakeup.
+
+System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in
+analogy with what they do in the full system suspend case. The only difference
+is that the wakeup from suspend-to-idle is signaled using the usual working
+state interrupt delivery mechanisms and doesn't require the platform to use
+any special interrupt handling logic for it to work.
+
+
+IRQF_NO_SUSPEND and enable_irq_wake()
+-------------------------------------
+
+There are very few valid reasons to use both enable_irq_wake() and the
+IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
+same device.
+
+First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
+interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
+directly at odds with the rules for handling system wakeup interrupts (interrupt
+handlers are not invoked after suspend_device_irqs()).
+ +Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not +to individual interrupt handlers, so sharing an IRQ between a system wakeup +interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally +make sense. + +In rare cases an IRQ can be shared between a wakeup device driver and an +IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver +must be able to discern spurious IRQs from genuine wakeup events (signalling +the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to +ensure that the IRQ will function as a wakeup source, and must request the IRQ +with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If +these requirements are not met, it is not valid to use IRQF_COND_SUSPEND. diff --git a/Documentation/power/suspend-and-interrupts.txt b/Documentation/power/suspend-and-interrupts.txt deleted file mode 100644 index 8afb29a8604a..000000000000 --- a/Documentation/power/suspend-and-interrupts.txt +++ /dev/null @@ -1,135 +0,0 @@ -System Suspend and Device Interrupts - -Copyright (C) 2014 Intel Corp. -Author: Rafael J. Wysocki - - -Suspending and Resuming Device IRQs ------------------------------------ - -Device interrupt request lines (IRQs) are generally disabled during system -suspend after the "late" phase of suspending devices (that is, after all of the -->prepare, ->suspend and ->suspend_late callbacks have been executed for all -devices). That is done by suspend_device_irqs(). - -The rationale for doing so is that after the "late" phase of device suspend -there is no legitimate reason why any interrupts from suspended devices should -trigger and if any devices have not been suspended properly yet, it is better to -block interrupts from them anyway. Also, in the past we had problems with -interrupt handlers for shared IRQs that device drivers implementing them were -not prepared for interrupts triggering after their devices had been suspended. -In some cases they would attempt to access, for example, memory address spaces -of suspended devices and cause unpredictable behavior to ensue as a result. -Unfortunately, such problems are very difficult to debug and the introduction -of suspend_device_irqs(), along with the "noirq" phase of device suspend and -resume, was the only practical way to mitigate them. - -Device IRQs are re-enabled during system resume, right before the "early" phase -of resuming devices (that is, before starting to execute ->resume_early -callbacks for devices). The function doing that is resume_device_irqs(). - - -The IRQF_NO_SUSPEND Flag ------------------------- - -There are interrupts that can legitimately trigger during the entire system -suspend-resume cycle, including the "noirq" phases of suspending and resuming -devices as well as during the time when nonboot CPUs are taken offline and -brought back online. That applies to timer interrupts in the first place, -but also to IPIs and to some other special-purpose interrupts. - -The IRQF_NO_SUSPEND flag is used to indicate that to the IRQ subsystem when -requesting a special-purpose interrupt. It causes suspend_device_irqs() to -leave the corresponding IRQ enabled so as to allow the interrupt to work as -expected during the suspend-resume cycle, but does not guarantee that the -interrupt will wake the system from a suspended state -- for such cases it is -necessary to use enable_irq_wake(). - -Note that the IRQF_NO_SUSPEND flag affects the entire IRQ and not just one -user of it. 
Thus, if the IRQ is shared, all of the interrupt handlers installed -for it will be executed as usual after suspend_device_irqs(), even if the -IRQF_NO_SUSPEND flag was not passed to request_irq() (or equivalent) by some of -the IRQ's users. For this reason, using IRQF_NO_SUSPEND and IRQF_SHARED at the -same time should be avoided. - - -System Wakeup Interrupts, enable_irq_wake() and disable_irq_wake() ------------------------------------------------------------------- - -System wakeup interrupts generally need to be configured to wake up the system -from sleep states, especially if they are used for different purposes (e.g. as -I/O interrupts) in the working state. - -That may involve turning on a special signal handling logic within the platform -(such as an SoC) so that signals from a given line are routed in a different way -during system sleep so as to trigger a system wakeup when needed. For example, -the platform may include a dedicated interrupt controller used specifically for -handling system wakeup events. Then, if a given interrupt line is supposed to -wake up the system from sleep sates, the corresponding input of that interrupt -controller needs to be enabled to receive signals from the line in question. -After wakeup, it generally is better to disable that input to prevent the -dedicated controller from triggering interrupts unnecessarily. - -The IRQ subsystem provides two helper functions to be used by device drivers for -those purposes. Namely, enable_irq_wake() turns on the platform's logic for -handling the given IRQ as a system wakeup interrupt line and disable_irq_wake() -turns that logic off. - -Calling enable_irq_wake() causes suspend_device_irqs() to treat the given IRQ -in a special way. Namely, the IRQ remains enabled, by on the first interrupt -it will be disabled, marked as pending and "suspended" so that it will be -re-enabled by resume_device_irqs() during the subsequent system resume. Also -the PM core is notified about the event which causes the system suspend in -progress to be aborted (that doesn't have to happen immediately, but at one -of the points where the suspend thread looks for pending wakeup events). - -This way every interrupt from a wakeup interrupt source will either cause the -system suspend currently in progress to be aborted or wake up the system if -already suspended. However, after suspend_device_irqs() interrupt handlers are -not executed for system wakeup IRQs. They are only executed for IRQF_NO_SUSPEND -IRQs at that time, but those IRQs should not be configured for system wakeup -using enable_irq_wake(). - - -Interrupts and Suspend-to-Idle ------------------------------- - -Suspend-to-idle (also known as the "freeze" sleep state) is a relatively new -system sleep state that works by idling all of the processors and waiting for -interrupts right after the "noirq" phase of suspending devices. - -Of course, this means that all of the interrupts with the IRQF_NO_SUSPEND flag -set will bring CPUs out of idle while in that state, but they will not cause the -IRQ subsystem to trigger a system wakeup. - -System wakeup interrupts, in turn, will trigger wakeup from suspend-to-idle in -analogy with what they do in the full system suspend case. The only difference -is that the wakeup from suspend-to-idle is signaled using the usual working -state interrupt delivery mechanisms and doesn't require the platform to use -any special interrupt handling logic for it to work. 
-
-
-IRQF_NO_SUSPEND and enable_irq_wake()
--------------------------------------
-
-There are very few valid reasons to use both enable_irq_wake() and the
-IRQF_NO_SUSPEND flag on the same IRQ, and it is never valid to use both for the
-same device.
-
-First of all, if the IRQ is not shared, the rules for handling IRQF_NO_SUSPEND
-interrupts (interrupt handlers are invoked after suspend_device_irqs()) are
-directly at odds with the rules for handling system wakeup interrupts (interrupt
-handlers are not invoked after suspend_device_irqs()).
-
-Second, both enable_irq_wake() and IRQF_NO_SUSPEND apply to entire IRQs and not
-to individual interrupt handlers, so sharing an IRQ between a system wakeup
-interrupt source and an IRQF_NO_SUSPEND interrupt source does not generally
-make sense.
-
-In rare cases an IRQ can be shared between a wakeup device driver and an
-IRQF_NO_SUSPEND user. In order for this to be safe, the wakeup device driver
-must be able to discern spurious IRQs from genuine wakeup events (signalling
-the latter to the core with pm_system_wakeup()), must use enable_irq_wake() to
-ensure that the IRQ will function as a wakeup source, and must request the IRQ
-with IRQF_COND_SUSPEND to tell the core that it meets these requirements. If
-these requirements are not met, it is not valid to use IRQF_COND_SUSPEND.
diff --git a/Documentation/power/swsusp-and-swap-files.rst b/Documentation/power/swsusp-and-swap-files.rst
new file mode 100644
index 000000000000..a33a2919dbe4
--- /dev/null
+++ b/Documentation/power/swsusp-and-swap-files.rst
@@ -0,0 +1,63 @@
+===============================================
+Using swap files with software suspend (swsusp)
+===============================================
+
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+The Linux kernel handles swap files almost in the same way as it handles swap
+partitions and there are only two differences between these two types of swap
+areas:
+(1) swap files need not be contiguous,
+(2) the header of a swap file is not in the first block of the partition that
+holds it. From the swsusp's point of view (1) is not a problem, because it is
+already taken care of by the swap-handling code, but (2) has to be taken into
+consideration.
+
+In principle the location of a swap file's header may be determined with the
+help of an appropriate filesystem driver. Unfortunately, however, it requires
+the filesystem holding the swap file to be mounted, and if this filesystem is
+journaled, it cannot be mounted during resume from disk. For this reason, to
+identify a swap file, swsusp uses the name of the partition that holds the file
+and the offset from the beginning of the partition at which the swap file's
+header is located. For convenience, this offset is expressed in <PAGE_SIZE>
+units.
+
+In order to use a swap file with swsusp, you need to:
+
+1) Create the swap file and make it active, e.g.::
+
+	# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
+	# mkswap <swap_file_path>
+	# swapon <swap_file_path>
+
+2) Use an application that will bmap the swap file with the help of the
+FIBMAP ioctl and determine the location of the file's swap header, as the
+offset, in <PAGE_SIZE> units, from the beginning of the partition which
+holds the swap file.
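+
+   A minimal sketch of such an application (hypothetical and simplified;
+   real tools such as swap-offset from the uswsusp package do more
+   validation, and FIBMAP usually requires root)::
+
+	#include <fcntl.h>
+	#include <linux/fs.h>	/* FIBMAP, FIGETBSZ */
+	#include <stdio.h>
+	#include <sys/ioctl.h>
+	#include <unistd.h>
+
+	int main(int argc, char *argv[])
+	{
+		int fd, blksize, block = 0;
+		long pagesize = sysconf(_SC_PAGESIZE);
+
+		if (argc != 2) {
+			fprintf(stderr, "usage: %s <swap_file_path>\n", argv[0]);
+			return 1;
+		}
+		fd = open(argv[1], O_RDONLY);
+		if (fd < 0) {
+			perror("open");
+			return 1;
+		}
+		/* Filesystem block size, then the physical number of the
+		 * file's first block (the block holding the swap header). */
+		if (ioctl(fd, FIGETBSZ, &blksize) < 0 ||
+		    ioctl(fd, FIBMAP, &block) < 0) {
+			perror("ioctl");
+			close(fd);
+			return 1;
+		}
+		/* Convert from filesystem blocks to <PAGE_SIZE> units. */
+		printf("resume_offset=%ld\n",
+		       (long)block * blksize / pagesize);
+		close(fd);
+		return 0;
+	}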
+
+3) Add the following parameters to the kernel command line::
+
+	resume=<swap_file_partition> resume_offset=<swap_file_offset>
+
+where <swap_file_partition> is the partition on which the swap file is located
+and <swap_file_offset> is the offset of the swap header determined by the
+application in 2) (of course, this step may be carried out automatically
+by the same application that determines the swap file's header offset using the
+FIBMAP ioctl)
+
+OR
+
+Use a userland suspend application that will set the partition and offset
+with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
+Documentation/power/userland-swsusp.rst (this is the only method to suspend
+to a swap file allowing the resume to be initiated from an initrd or initramfs
+image).
+
+Now, swsusp will use the swap file in the same way in which it would use a swap
+partition. In particular, the swap file has to be active (i.e., be present in
+/proc/swaps) so that it can be used for suspending.
+
+Note that if the swap file used for suspending is deleted and recreated,
+the location of its header need not be the same as before. Thus every time
+this happens the value of the "resume_offset=" kernel command line parameter
+has to be updated.
diff --git a/Documentation/power/swsusp-and-swap-files.txt b/Documentation/power/swsusp-and-swap-files.txt
deleted file mode 100644
index f281886de490..000000000000
--- a/Documentation/power/swsusp-and-swap-files.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-Using swap files with software suspend (swsusp)
-	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
-
-The Linux kernel handles swap files almost in the same way as it handles swap
-partitions and there are only two differences between these two types of swap
-areas:
-(1) swap files need not be contiguous,
-(2) the header of a swap file is not in the first block of the partition that
-holds it. From the swsusp's point of view (1) is not a problem, because it is
-already taken care of by the swap-handling code, but (2) has to be taken into
-consideration.
-
-In principle the location of a swap file's header may be determined with the
-help of appropriate filesystem driver. Unfortunately, however, it requires the
-filesystem holding the swap file to be mounted, and if this filesystem is
-journaled, it cannot be mounted during resume from disk. For this reason to
-identify a swap file swsusp uses the name of the partition that holds the file
-and the offset from the beginning of the partition at which the swap file's
-header is located. For convenience, this offset is expressed in <PAGE_SIZE>
-units.
-
-In order to use a swap file with swsusp, you need to:
-
-1) Create the swap file and make it active, eg.
-
-# dd if=/dev/zero of=<swap_file_path> bs=1024 count=<swap_file_size_in_k>
-# mkswap <swap_file_path>
-# swapon <swap_file_path>
-
-2) Use an application that will bmap the swap file with the help of the
-FIBMAP ioctl and determine the location of the file's swap header, as the
-offset, in <PAGE_SIZE> units, from the beginning of the partition which
-holds the swap file.
-
-3) Add the following parameters to the kernel command line:
-
-resume=<swap_file_partition> resume_offset=<swap_file_offset>
-
-where <swap_file_partition> is the partition on which the swap file is located
-and <swap_file_offset> is the offset of the swap header determined by the
-application in 2) (of course, this step may be carried out automatically
-by the same application that determines the swap file's header offset using the
-FIBMAP ioctl)
-
-OR
-
-Use a userland suspend application that will set the partition and offset
-with the help of the SNAPSHOT_SET_SWAP_AREA ioctl described in
-Documentation/power/userland-swsusp.txt (this is the only method to suspend
-to a swap file allowing the resume to be initiated from an initrd or initramfs
-image).
-
-Now, swsusp will use the swap file in the same way in which it would use a swap
-partition. In particular, the swap file has to be active (ie. be present in
-/proc/swaps) so that it can be used for suspending.
-
-Note that if the swap file used for suspending is deleted and recreated,
-the location of its header need not be the same as before. Thus every time
-this happens the value of the "resume_offset=" kernel command line parameter
-has to be updated.
diff --git a/Documentation/power/swsusp-dmcrypt.rst b/Documentation/power/swsusp-dmcrypt.rst
new file mode 100644
index 000000000000..426df59172cd
--- /dev/null
+++ b/Documentation/power/swsusp-dmcrypt.rst
@@ -0,0 +1,140 @@
+=======================================
+How to use dm-crypt and swsusp together
+=======================================
+
+Author: Andreas Steinmetz <ast@domdv.de>
+
+
+
+Some prerequisites:
+You know how dm-crypt works. If not, visit the following web page:
+http://www.saout.de/misc/dm-crypt/
+You have read Documentation/power/swsusp.rst and understand it.
+You did read Documentation/admin-guide/initrd.rst and know how an initrd works.
+You know how to create or how to modify an initrd.
+
+Now your system is properly set up, your disk is encrypted except for
+the swap device(s) and the boot partition which may contain a mini
+system for crypto setup and/or rescue purposes. You may even have
+an initrd that does your current crypto setup already.
+
+At this point you want to encrypt your swap, too. Still you want to
+be able to suspend using swsusp. This, however, means that you
+have to be able to either enter a passphrase or that you read
+the key(s) from an external device like a pcmcia flash disk
+or a USB stick prior to resume. So you need an initrd that sets
+up dm-crypt and then asks swsusp to resume from the encrypted
+swap device.
+
+The most important thing is that you set up dm-crypt in such
+a way that the swap device you suspend to/resume from has
+always the same major/minor within the initrd as well as
+within your running system. The easiest way to achieve this is
+to always set up this swap device first with dmsetup, so that
+it will always look like the following::
+
+  brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0
+
+Now set up your kernel to use /dev/mapper/swap0 as the default
+resume partition, so your kernel .config contains::
+
+  CONFIG_PM_STD_PARTITION="/dev/mapper/swap0"
+
+Prepare your boot loader to use the initrd you will create or
+modify. For lilo the simplest setup looks like the following
+lines::
+
+  image=/boot/vmlinuz
+  initrd=/boot/initrd.gz
+  label=linux
+  append="root=/dev/ram0 init=/linuxrc rw"
+
+Finally you need to create or modify your initrd. Let's assume
+you create an initrd that reads the required dm-crypt setup
+from a pcmcia flash disk card.
The card is formatted with an ext2 +fs which resides on /dev/hde1 when the card is inserted. The +card contains at least the encrypted swap setup in a file +named "swapkey". /etc/fstab of your initrd contains something +like the following:: + + /dev/hda1 /mnt ext3 ro 0 0 + none /proc proc defaults,noatime,nodiratime 0 0 + none /sys sysfs defaults,noatime,nodiratime 0 0 + +/dev/hda1 contains an unencrypted mini system that sets up all +of your crypto devices, again by reading the setup from the +pcmcia flash disk. What follows now is a /linuxrc for your +initrd that allows you to resume from encrypted swap and that +continues boot with your mini system on /dev/hda1 if resume +does not happen:: + + #!/bin/sh + PATH=/sbin:/bin:/usr/sbin:/usr/bin + mount /proc + mount /sys + mapped=0 + noresume=`grep -c noresume /proc/cmdline` + if [ "$*" != "" ] + then + noresume=1 + fi + dmesg -n 1 + /sbin/cardmgr -q + for i in 1 2 3 4 5 6 7 8 9 0 + do + if [ -f /proc/ide/hde/media ] + then + usleep 500000 + mount -t ext2 -o ro /dev/hde1 /mnt + if [ -f /mnt/swapkey ] + then + dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1 + fi + umount /mnt + break + fi + usleep 500000 + done + killproc /sbin/cardmgr + dmesg -n 6 + if [ $mapped = 1 ] + then + if [ $noresume != 0 ] + then + mkswap /dev/mapper/swap0 > /dev/null 2>&1 + fi + echo 254:0 > /sys/power/resume + dmsetup remove swap0 + fi + umount /sys + mount /mnt + umount /proc + cd /mnt + pivot_root . mnt + mount /proc + umount -l /mnt + umount /proc + exec chroot . /sbin/init $* < dev/console > dev/console 2>&1 + +Please don't mind the weird loop above, busybox's msh doesn't know +the let statement. Now, what is happening in the script? +First we have to decide if we want to try to resume, or not. +We will not resume if booting with "noresume" or any parameters +for init like "single" or "emergency" as boot parameters. + +Then we need to set up dmcrypt with the setup data from the +pcmcia flash disk. If this succeeds we need to reset the swap +device if we don't want to resume. The line "echo 254:0 > /sys/power/resume" +then attempts to resume from the first device mapper device. +Note that it is important to set the device in /sys/power/resume, +regardless if resuming or not, otherwise later suspend will fail. +If resume starts, script execution terminates here. + +Otherwise we just remove the encrypted swap device and leave it to the +mini system on /dev/hda1 to set the whole crypto up (it is up to +you to modify this to your taste). + +What then follows is the well known process to change the root +file system and continue booting from there. I prefer to unmount +the initrd prior to continue booting but it is up to you to modify +this. diff --git a/Documentation/power/swsusp-dmcrypt.txt b/Documentation/power/swsusp-dmcrypt.txt deleted file mode 100644 index b802fbfd95ef..000000000000 --- a/Documentation/power/swsusp-dmcrypt.txt +++ /dev/null @@ -1,138 +0,0 @@ -Author: Andreas Steinmetz - - -How to use dm-crypt and swsusp together: -======================================== - -Some prerequisites: -You know how dm-crypt works. If not, visit the following web page: -http://www.saout.de/misc/dm-crypt/ -You have read Documentation/power/swsusp.txt and understand it. -You did read Documentation/admin-guide/initrd.rst and know how an initrd works. -You know how to create or how to modify an initrd. 
- -Now your system is properly set up, your disk is encrypted except for -the swap device(s) and the boot partition which may contain a mini -system for crypto setup and/or rescue purposes. You may even have -an initrd that does your current crypto setup already. - -At this point you want to encrypt your swap, too. Still you want to -be able to suspend using swsusp. This, however, means that you -have to be able to either enter a passphrase or that you read -the key(s) from an external device like a pcmcia flash disk -or an usb stick prior to resume. So you need an initrd, that sets -up dm-crypt and then asks swsusp to resume from the encrypted -swap device. - -The most important thing is that you set up dm-crypt in such -a way that the swap device you suspend to/resume from has -always the same major/minor within the initrd as well as -within your running system. The easiest way to achieve this is -to always set up this swap device first with dmsetup, so that -it will always look like the following: - -brw------- 1 root root 254, 0 Jul 28 13:37 /dev/mapper/swap0 - -Now set up your kernel to use /dev/mapper/swap0 as the default -resume partition, so your kernel .config contains: - -CONFIG_PM_STD_PARTITION="/dev/mapper/swap0" - -Prepare your boot loader to use the initrd you will create or -modify. For lilo the simplest setup looks like the following -lines: - -image=/boot/vmlinuz -initrd=/boot/initrd.gz -label=linux -append="root=/dev/ram0 init=/linuxrc rw" - -Finally you need to create or modify your initrd. Lets assume -you create an initrd that reads the required dm-crypt setup -from a pcmcia flash disk card. The card is formatted with an ext2 -fs which resides on /dev/hde1 when the card is inserted. The -card contains at least the encrypted swap setup in a file -named "swapkey". /etc/fstab of your initrd contains something -like the following: - -/dev/hda1 /mnt ext3 ro 0 0 -none /proc proc defaults,noatime,nodiratime 0 0 -none /sys sysfs defaults,noatime,nodiratime 0 0 - -/dev/hda1 contains an unencrypted mini system that sets up all -of your crypto devices, again by reading the setup from the -pcmcia flash disk. What follows now is a /linuxrc for your -initrd that allows you to resume from encrypted swap and that -continues boot with your mini system on /dev/hda1 if resume -does not happen: - -#!/bin/sh -PATH=/sbin:/bin:/usr/sbin:/usr/bin -mount /proc -mount /sys -mapped=0 -noresume=`grep -c noresume /proc/cmdline` -if [ "$*" != "" ] -then - noresume=1 -fi -dmesg -n 1 -/sbin/cardmgr -q -for i in 1 2 3 4 5 6 7 8 9 0 -do - if [ -f /proc/ide/hde/media ] - then - usleep 500000 - mount -t ext2 -o ro /dev/hde1 /mnt - if [ -f /mnt/swapkey ] - then - dmsetup create swap0 /mnt/swapkey > /dev/null 2>&1 && mapped=1 - fi - umount /mnt - break - fi - usleep 500000 -done -killproc /sbin/cardmgr -dmesg -n 6 -if [ $mapped = 1 ] -then - if [ $noresume != 0 ] - then - mkswap /dev/mapper/swap0 > /dev/null 2>&1 - fi - echo 254:0 > /sys/power/resume - dmsetup remove swap0 -fi -umount /sys -mount /mnt -umount /proc -cd /mnt -pivot_root . mnt -mount /proc -umount -l /mnt -umount /proc -exec chroot . /sbin/init $* < dev/console > dev/console 2>&1 - -Please don't mind the weird loop above, busybox's msh doesn't know -the let statement. Now, what is happening in the script? -First we have to decide if we want to try to resume, or not. -We will not resume if booting with "noresume" or any parameters -for init like "single" or "emergency" as boot parameters. 
-
-Then we need to set up dmcrypt with the setup data from the
-pcmcia flash disk. If this succeeds we need to reset the swap
-device if we don't want to resume. The line "echo 254:0 > /sys/power/resume"
-then attempts to resume from the first device mapper device.
-Note that it is important to set the device in /sys/power/resume,
-regardless if resuming or not, otherwise later suspend will fail.
-If resume starts, script execution terminates here.
-
-Otherwise we just remove the encrypted swap device and leave it to the
-mini system on /dev/hda1 to set the whole crypto up (it is up to
-you to modify this to your taste).
-
-What then follows is the well known process to change the root
-file system and continue booting from there. I prefer to unmount
-the initrd prior to continue booting but it is up to you to modify
-this.
diff --git a/Documentation/power/swsusp.rst b/Documentation/power/swsusp.rst
new file mode 100644
index 000000000000..d000312f6965
--- /dev/null
+++ b/Documentation/power/swsusp.rst
@@ -0,0 +1,501 @@
+============
+Swap suspend
+============
+
+Some warnings, first.
+
+.. warning::
+
+   **BIG FAT WARNING**
+
+   If you touch anything on disk between suspend and resume...
+   ...kiss your data goodbye.
+
+   If you do resume from initrd after your filesystems are mounted...
+   ...bye bye root partition.
+
+   [this is actually the same case as above]
+
+   If you have unsupported (*) devices using DMA, you may have some
+   problems. If your disk driver does not support suspend... (IDE does),
+   it may cause some problems, too. If you change the kernel command line
+   between suspend and resume, it may do something wrong. If you change
+   your hardware while the system is suspended... well, it was not a good
+   idea; but it will probably only crash.
+
+   (*) suspend/resume support is needed to make it safe.
+
+   If you have any filesystems on USB devices mounted before software suspend,
+   they won't be accessible after resume and you may lose data, as though
+   you have unplugged the USB devices with mounted filesystems on them;
+   see the FAQ below for details. (This is not true for more traditional
+   power states like "standby", which normally don't turn USB off.)
+
+Swap partition:
+  You need to append resume=/dev/your_swap_partition to the kernel command
+  line or specify it using /sys/power/resume.
+
+Swap file:
+  If using a swapfile you can also specify a resume offset using
+  resume_offset=<number> on the kernel command line or specify it
+  in /sys/power/resume_offset.
+
+After preparing, you then suspend by::
+
+	echo shutdown > /sys/power/disk; echo disk > /sys/power/state
+
+- If you feel ACPI works pretty well on your system, you might try::
+
+	echo platform > /sys/power/disk; echo disk > /sys/power/state
+
+- If you would like to write the hibernation image to swap and then suspend
+  to RAM (provided your platform supports it), you can try::
+
+	echo suspend > /sys/power/disk; echo disk > /sys/power/state
+
+- If you have SATA disks, you'll need recent kernels with SATA suspend
+  support. For suspend and resume to work, make sure your disk drivers
+  are built into the kernel -- not modules. [There's a way to make
+  suspend/resume work with modular disk drivers, see FAQ, but you probably
+  should not do that.]
+
+If you want to limit the suspend image size to N bytes, do::
+
+	echo N > /sys/power/image_size
+
+before suspend (it is limited to around 2/5 of available RAM by default).
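+
+For a swap file, the resume offset can be derived from the file's first
+physical extent as reported by filefrag. The following is only a sketch:
+the swap file path /swapfile, the device /dev/sda1 and its major:minor
+numbers 8:1 are illustrative assumptions, and the filefrag output format
+may vary between versions::
+
+	mkswap /swapfile && swapon /swapfile
+	# the "physical_offset" of extent 0 is the resume offset in
+	# PAGE_SIZE units (when the filesystem block size equals PAGE_SIZE)
+	filefrag -v /swapfile | awk '$1 == "0:" { print $4 }' | tr -d .
+	# suppose it printed 34816:
+	echo 34816 > /sys/power/resume_offset
+	echo 8:1 > /sys/power/resume	# major:minor of /dev/sda1
+	echo disk > /sys/power/state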
+
+- The resume process checks for the presence of the resume device;
+  if found, it then checks the contents for the hibernation image signature.
+  If both are found, it resumes the hibernation image.
+
+- The resume process may be triggered in two ways:
+
+  1) During lateinit: If resume=/dev/your_swap_partition is specified on
+     the kernel command line, lateinit runs the resume process. If the
+     resume device has not been probed yet, the resume process fails and
+     bootup continues.
+  2) Manually from an initrd or initramfs: May be run from
+     the init script by using the /sys/power/resume file. It is vital
+     that this be done prior to remounting any filesystems (even as
+     read-only), otherwise data may be corrupted.
+
+Article about goals and implementation of Software Suspend for Linux
+====================================================================
+
+Author: Gábor Kuti
+Last revised: 2003-10-20 by Pavel Machek
+
+Idea and goals to achieve
+-------------------------
+
+Nowadays it is common for laptops to have a suspend button. It
+saves the state of the machine to a filesystem or to a partition and switches
+to standby mode. When the machine is later resumed, the saved state is loaded
+back into RAM and the machine can continue its work. This has two real
+benefits. First, we save ourselves the time the machine takes to go down and
+later boot up; energy costs are really high when running from batteries. The
+other gain is that we don't have to interrupt our programs, so processes that
+are calculating something for a long time don't need to be written to be
+interruptible.
+
+swsusp saves the state of the machine into active swaps and then reboots or
+powers down. You must explicitly specify the swap partition to resume from
+with the `resume=` kernel option. If a signature is found, it loads and
+restores the saved state. If the option `noresume` is specified as a boot
+parameter, it skips the resuming. If the option `hibernate=nocompress` is
+specified as a boot parameter, it saves the hibernation image without
+compression.
+
+While the system is suspended, you should not add/remove any
+hardware, write to the filesystems, etc.
+
+Sleep states summary
+====================
+
+There are three different interfaces you can use; /proc/acpi should
+work like this:
+
+In a really perfect world::
+
+	echo 1 > /proc/acpi/sleep # for standby
+	echo 2 > /proc/acpi/sleep # for suspend to ram
+	echo 3 > /proc/acpi/sleep # for suspend to ram, but more power conservative
+	echo 4 > /proc/acpi/sleep # for suspend to disk
+	echo 5 > /proc/acpi/sleep # for unfriendly shutdown of the system
+
+and perhaps::
+
+	echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios
+
+Frequently Asked Questions
+==========================
+
+Q:
+  well, suspending a server is IMHO a really stupid thing,
+  but... (Diego Zuccato):
+
+A:
+  You bought a new UPS for your server. How do you install it without
+  bringing the machine down? Suspend to disk, rearrange power cables,
+  resume.
+
+  You have your server on UPS. Power died, and the UPS is indicating 30
+  seconds to failure. What do you do? Suspend to disk.
+
+
+Q:
+  Maybe I'm missing something, but why don't the regular I/O paths work?
+
+A:
+  We do use the regular I/O paths. However, we cannot restore the data
+  to its original location as we load it. That would create an
+  inconsistent kernel state which would certainly result in an oops.
+  Instead, we load the image into unused memory and then atomically copy
+  it back to its original location.
This implies, of course, a maximum
+  image size of half the amount of memory.
+
+  There are two solutions to this:
+
+  * require half of memory to be free during suspend. That way you can
+    read "new" data onto free spots, then cli and copy
+
+  * assume we had a special "polling" ide driver that only uses memory
+    between 0-640KB. That way, I'd have to make sure that 0-640KB is free
+    during suspending, but otherwise it would work...
+
+  suspend2 shares this fundamental limitation, but does not include user
+  data and disk caches into "used memory" by saving them in
+  advance. That means that the limitation goes away in practice.
+
+Q:
+  Does Linux support ACPI S4?
+
+A:
+  Yes. That's what echo platform > /sys/power/disk does.
+
+Q:
+  What is 'suspend2'?
+
+A:
+  suspend2 is 'Software Suspend 2', a forked implementation of
+  suspend-to-disk which is available as separate patches for 2.4 and 2.6
+  kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB
+  highmem and preemption. It also has an extensible architecture that
+  allows for arbitrary transformations on the image (compression,
+  encryption) and arbitrary backends for writing the image (eg. to swap
+  or an NFS share [Work In Progress]). Questions regarding suspend2
+  should be sent to the mailing list available through the suspend2
+  website, and not to the Linux Kernel Mailing List. We are working
+  toward merging suspend2 into the mainline kernel.
+
+Q:
+  What is the freezing of tasks and why are we using it?
+
+A:
+  The freezing of tasks is a mechanism by which user space processes and some
+  kernel threads are controlled during hibernation or system-wide suspend (on some
+  architectures). See freezing-of-tasks.rst for details.
+
+Q:
+  What is the difference between "platform" and "shutdown"?
+
+A:
+  shutdown:
+	save state in linux, then tell bios to powerdown
+
+  platform:
+	save state in linux, then tell bios to powerdown and blink
+	"suspended led"
+
+  "platform" is actually the right thing to do where supported, but
+  "shutdown" is the most reliable (except on ACPI systems).
+
+Q:
+  I do not understand why you have such strong objections to the idea of
+  selective suspend.
+
+A:
+  Do selective suspend during runtime power management, that's okay. But
+  it's useless for suspend-to-disk. (And I do not see how you could use
+  it for suspend-to-ram, I hope you do not want that).
+
+  Let's see, so you suggest to:
+
+  * SUSPEND all but swap device and parents
+  * Snapshot
+  * Write image to disk
+  * SUSPEND swap device and parents
+  * Powerdown
+
+  Oh no, that does not work: if the swap device or its parents use DMA,
+  you've corrupted data. You'd have to do:
+
+  * SUSPEND all but swap device and parents
+  * FREEZE swap device and parents
+  * Snapshot
+  * UNFREEZE swap device and parents
+  * Write
+  * SUSPEND swap device and parents
+
+  Which means that you still need that FREEZE state, and you get more
+  complicated code. (And I have not yet introduced details like system
+  devices.)
+
+Q:
+  There don't seem to be any generally useful behavioral
+  distinctions between SUSPEND and FREEZE.
+
+A:
+  Doing SUSPEND when you are asked to do FREEZE is always correct,
+  but it may be unnecessarily slow. If you want your driver to stay simple,
+  slowness may not matter to you. It can always be fixed later.
+
+  For devices like disk it does matter, you do not want to spindown for
+  FREEZE.
+
+Q:
+  After resuming, the system is paging heavily, leading to very bad interactivity.
+
+A:
+  Try running::
+
+	cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file
+	do
+		test -f "$file" && cat "$file" > /dev/null
+	done
+
+  after resume. swapoff -a; swapon -a may also be useful.
+
+Q:
+  What happens to devices during swsusp? They seem to be resumed
+  during system suspend?
+
+A:
+  That's correct. We need to resume them if we want to write the image to
+  disk. The whole sequence goes like this:
+
+  **Suspend part**
+
+  running system, user asks for suspend-to-disk
+
+  user processes are stopped
+
+  suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+  with state snapshot
+
+  state snapshot: copy of whole used memory is taken with interrupts disabled
+
+  resume(): devices are woken up so that we can write image to swap
+
+  write image to swap
+
+  suspend(PMSG_SUSPEND): suspend devices so that we can power off
+
+  turn the power off
+
+  **Resume part**
+
+  (is actually pretty similar)
+
+  running system, user asks for suspend-to-disk
+
+  user processes are stopped (in the common case there are none,
+  but with resume-from-initrd, no one knows)
+
+  read image from disk
+
+  suspend(PMSG_FREEZE): devices are frozen so that they don't interfere
+  with image restoration
+
+  image restoration: rewrite memory with image
+
+  resume(): devices are woken up so that the system can continue
+
+  thaw all user processes
+
+Q:
+  What is this 'Encrypt suspend image' for?
+
+A:
+  First of all: it is not a replacement for dm-crypt encrypted swap.
+  It cannot protect your computer while it is suspended. Instead it
+  protects against leaking sensitive data after resume from suspend.
+
+  Think of the following: you suspend while an application is running
+  that keeps sensitive data in memory. The application itself prevents
+  the data from being swapped out. Suspend, however, must write these
+  data to swap to be able to resume later on. Without suspend encryption
+  your sensitive data are then stored in plaintext on disk. This means
+  that after resume your sensitive data are accessible to all
+  applications having direct access to the swap device which was used
+  for suspend. If you don't need swap after resume these data can remain
+  on disk virtually forever. Thus it can happen that your system gets
+  broken into weeks later and sensitive data which you thought were
+  encrypted and protected are retrieved and stolen from the swap device.
+  To prevent this situation you should use 'Encrypt suspend image'.
+
+  During suspend a temporary key is created and this key is used to
+  encrypt the data written to disk. When, during resume, the data has been
+  read back into memory, the temporary key is destroyed, which simply
+  means that all data written to disk during suspend are then
+  inaccessible so they can't be stolen later on. The only thing that
+  you must then take care of is that you call 'mkswap' for the swap
+  partition used for suspend as early as possible during regular
+  boot. This ensures that any temporary key from an oopsed suspend or
+  from a failed or aborted resume is erased from the swap device.
+
+  As a rule of thumb, use encrypted swap to protect your data while your
+  system is shut down or suspended. Additionally use the encrypted
+  suspend image to prevent sensitive data from being stolen after
+  resume.
+
+Q:
+  Can I suspend to a swap file?
+
+A:
+  Generally, yes, you can. However, it requires you to use the "resume=" and
+  "resume_offset=" kernel command line parameters, so the resume from a swap file
+  cannot be initiated from an initrd or initramfs image.
See
+  swsusp-and-swap-files.rst for details.
+
+Q:
+  Is there a maximum system RAM size that is supported by swsusp?
+
+A:
+  It should work okay with highmem.
+
+Q:
+  Does swsusp (to disk) use only one swap partition or can it use
+  multiple swap partitions (aggregate them into one logical space)?
+
+A:
+  Only one swap partition, sorry.
+
+Q:
+  If my application(s) causes lots of memory & swap space to be used
+  (over half of the total system RAM), is it correct that it is likely
+  to be useless to try to suspend to disk while that app is running?
+
+A:
+  No, it should work okay, as long as your app does not mlock()
+  it. Just prepare a big enough swap partition.
+
+Q:
+  What information is useful for debugging suspend-to-disk problems?
+
+A:
+  Well, the last messages on the screen are always useful. If something
+  is broken, it is usually some kernel driver, therefore trying with as
+  few modules loaded as possible helps a lot. I also prefer people to
+  suspend from the console, preferably without X running. Booting with
+  init=/bin/bash, then swapon and starting the suspend sequence manually
+  usually does the trick. Then it is a good idea to try with the latest
+  vanilla kernel.
+
+Q:
+  How can distributions ship a swsusp-supporting kernel with modular
+  disk drivers (especially SATA)?
+
+A:
+  Well, it can be done: load the drivers, then echo the resume device
+  into the /sys/power/resume file from the initrd. Be sure not to mount
+  anything, not even read-only, or you are going to lose your
+  data.
+
+Q:
+  How do I make suspend more verbose?
+
+A:
+  If you want to see any non-error kernel messages on the virtual
+  terminal the kernel switches to during suspend, you have to set the
+  kernel console loglevel to at least 4 (KERN_WARNING), for example by
+  doing::
+
+	# save the old loglevel
+	read LOGLEVEL DUMMY < /proc/sys/kernel/printk
+	# set the loglevel so we see the progress bar.
+	# if the level is higher than needed, we leave it alone.
+	if [ $LOGLEVEL -lt 5 ]; then
+		echo 5 > /proc/sys/kernel/printk
+	fi
+
+	IMG_SZ=0
+	read IMG_SZ < /sys/power/image_size
+	echo -n disk > /sys/power/state
+	RET=$?
+	#
+	# the logic here is:
+	# if image_size > 0 (without kernel support, IMG_SZ will be zero),
+	# then try again with image_size set to zero.
+	if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size
+		echo 0 > /sys/power/image_size
+		echo -n disk > /sys/power/state
+		RET=$?
+	fi
+
+	# restore previous loglevel
+	echo $LOGLEVEL > /proc/sys/kernel/printk
+	exit $RET
+
+Q:
+  Is it true that if I have a mounted filesystem on a USB device and
+  I suspend to disk, I can lose data unless the filesystem has been mounted
+  with "sync"?
+
+A:
+  That's right ... if you disconnect that device, you may lose data.
+  In fact, even with "-o sync" you can lose data if your programs have
+  information in buffers they haven't written out to a disk you disconnect,
+  or if you disconnect before the device finished saving data you wrote.
+
+  Software suspend normally powers down USB controllers, which is equivalent
+  to disconnecting all USB devices attached to your system.
+
+  Your system might well support low-power modes for its USB controllers
+  while the system is asleep, maintaining the connection, using true sleep
+  modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the
+  /sys/power/state file; write "standby" or "mem".)
We've not seen any
+  hardware that can use these modes through software suspend, although in
+  theory some systems might support "platform" modes that won't break the
+  USB connections.
+
+  Remember that it's always a bad idea to unplug a disk drive containing a
+  mounted filesystem. That's true even when your system is asleep! The
+  safest thing is to unmount all filesystems on removable media (such as USB,
+  Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays)
+  before suspending; then remount them after resuming.
+
+  There is a work-around for this problem. For more information, see
+  Documentation/driver-api/usb/persist.rst.
+
+Q:
+  Can I suspend-to-disk using a swap partition under LVM?
+
+A:
+  Yes and No. You can suspend successfully, but the kernel will not be able
+  to resume on its own. You need an initramfs that can recognize the resume
+  situation, activate the logical volume containing the swap volume (but not
+  touch any filesystems!), and eventually call::
+
+	echo -n "$major:$minor" > /sys/power/resume
+
+  where $major and $minor are the respective major and minor device numbers of
+  the swap volume.
+
+  uswsusp works with LVM, too. See http://suspend.sourceforge.net/
+
+Q:
+  I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were
+  compiled with similar configuration files. Anyway I found that
+  suspend to disk (and resume) is much slower on 2.6.16 compared to
+  2.6.15. Any idea why that might happen, or how I can speed it up?
+
+A:
+  This is because the size of the suspend image is now greater than
+  for 2.6.15 (by saving more data we can get a more responsive system
+  after resume).
+
+  There's the /sys/power/image_size knob that controls the size of the
+  image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as
+  root), the 2.6.15 behavior should be restored. If it is still too
+  slow, take a look at suspend.sf.net -- userland suspend is faster and
+  supports LZF compression to speed it up further.
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
deleted file mode 100644
index 236d1fb13640..000000000000
--- a/Documentation/power/swsusp.txt
+++ /dev/null
@@ -1,446 +0,0 @@
-Some warnings, first.
-
- * BIG FAT WARNING *********************************************************
- *
- * If you touch anything on disk between suspend and resume...
- * ...kiss your data goodbye.
- *
- * If you do resume from initrd after your filesystems are mounted...
- * ...bye bye root partition.
- * [this is actually same case as above]
- *
- * If you have unsupported (*) devices using DMA, you may have some
- * problems. If your disk driver does not support suspend... (IDE does),
- * it may cause some problems, too. If you change kernel command line
- * between suspend and resume, it may do something wrong. If you change
- * your hardware while system is suspended... well, it was not good idea;
- * but it will probably only crash.
- *
- * (*) suspend/resume support is needed to make it safe.
- *
- * If you have any filesystems on USB devices mounted before software suspend,
- * they won't be accessible after resume and you may lose data, as though
- * you have unplugged the USB devices with mounted filesystems on them;
- * see the FAQ below for details. (This is not true for more traditional
- * power states like "standby", which normally don't turn USB off.)
-
-Swap partition:
-You need to append resume=/dev/your_swap_partition to kernel command
-line or specify it using /sys/power/resume.
- -Swap file: -If using a swapfile you can also specify a resume offset using -resume_offset= on the kernel command line or specify it -in /sys/power/resume_offset. - -After preparing then you suspend by - -echo shutdown > /sys/power/disk; echo disk > /sys/power/state - -. If you feel ACPI works pretty well on your system, you might try - -echo platform > /sys/power/disk; echo disk > /sys/power/state - -. If you would like to write hibernation image to swap and then suspend -to RAM (provided your platform supports it), you can try - -echo suspend > /sys/power/disk; echo disk > /sys/power/state - -. If you have SATA disks, you'll need recent kernels with SATA suspend -support. For suspend and resume to work, make sure your disk drivers -are built into kernel -- not modules. [There's way to make -suspend/resume with modular disk drivers, see FAQ, but you probably -should not do that.] - -If you want to limit the suspend image size to N bytes, do - -echo N > /sys/power/image_size - -before suspend (it is limited to around 2/5 of available RAM by default). - -. The resume process checks for the presence of the resume device, -if found, it then checks the contents for the hibernation image signature. -If both are found, it resumes the hibernation image. - -. The resume process may be triggered in two ways: - 1) During lateinit: If resume=/dev/your_swap_partition is specified on - the kernel command line, lateinit runs the resume process. If the - resume device has not been probed yet, the resume process fails and - bootup continues. - 2) Manually from an initrd or initramfs: May be run from - the init script by using the /sys/power/resume file. It is vital - that this be done prior to remounting any filesystems (even as - read-only) otherwise data may be corrupted. - -Article about goals and implementation of Software Suspend for Linux -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Author: Gábor Kuti -Last revised: 2003-10-20 by Pavel Machek - -Idea and goals to achieve - -Nowadays it is common in several laptops that they have a suspend button. It -saves the state of the machine to a filesystem or to a partition and switches -to standby mode. Later resuming the machine the saved state is loaded back to -ram and the machine can continue its work. It has two real benefits. First we -save ourselves the time machine goes down and later boots up, energy costs -are real high when running from batteries. The other gain is that we don't have to -interrupt our programs so processes that are calculating something for a long -time shouldn't need to be written interruptible. - -swsusp saves the state of the machine into active swaps and then reboots or -powerdowns. You must explicitly specify the swap partition to resume from with -``resume='' kernel option. If signature is found it loads and restores saved -state. If the option ``noresume'' is specified as a boot parameter, it skips -the resuming. If the option ``hibernate=nocompress'' is specified as a boot -parameter, it saves hibernation image without compression. - -In the meantime while the system is suspended you should not add/remove any -of the hardware, write to the filesystems, etc. 
- -Sleep states summary -==================== - -There are three different interfaces you can use, /proc/acpi should -work like this: - -In a really perfect world: -echo 1 > /proc/acpi/sleep # for standby -echo 2 > /proc/acpi/sleep # for suspend to ram -echo 3 > /proc/acpi/sleep # for suspend to ram, but with more power conservative -echo 4 > /proc/acpi/sleep # for suspend to disk -echo 5 > /proc/acpi/sleep # for shutdown unfriendly the system - -and perhaps -echo 4b > /proc/acpi/sleep # for suspend to disk via s4bios - -Frequently Asked Questions -========================== - -Q: well, suspending a server is IMHO a really stupid thing, -but... (Diego Zuccato): - -A: You bought new UPS for your server. How do you install it without -bringing machine down? Suspend to disk, rearrange power cables, -resume. - -You have your server on UPS. Power died, and UPS is indicating 30 -seconds to failure. What do you do? Suspend to disk. - - -Q: Maybe I'm missing something, but why don't the regular I/O paths work? - -A: We do use the regular I/O paths. However we cannot restore the data -to its original location as we load it. That would create an -inconsistent kernel state which would certainly result in an oops. -Instead, we load the image into unused memory and then atomically copy -it back to it original location. This implies, of course, a maximum -image size of half the amount of memory. - -There are two solutions to this: - -* require half of memory to be free during suspend. That way you can -read "new" data onto free spots, then cli and copy - -* assume we had special "polling" ide driver that only uses memory -between 0-640KB. That way, I'd have to make sure that 0-640KB is free -during suspending, but otherwise it would work... - -suspend2 shares this fundamental limitation, but does not include user -data and disk caches into "used memory" by saving them in -advance. That means that the limitation goes away in practice. - -Q: Does linux support ACPI S4? - -A: Yes. That's what echo platform > /sys/power/disk does. - -Q: What is 'suspend2'? - -A: suspend2 is 'Software Suspend 2', a forked implementation of -suspend-to-disk which is available as separate patches for 2.4 and 2.6 -kernels from swsusp.sourceforge.net. It includes support for SMP, 4GB -highmem and preemption. It also has a extensible architecture that -allows for arbitrary transformations on the image (compression, -encryption) and arbitrary backends for writing the image (eg to swap -or an NFS share[Work In Progress]). Questions regarding suspend2 -should be sent to the mailing list available through the suspend2 -website, and not to the Linux Kernel Mailing List. We are working -toward merging suspend2 into the mainline kernel. - -Q: What is the freezing of tasks and why are we using it? - -A: The freezing of tasks is a mechanism by which user space processes and some -kernel threads are controlled during hibernation or system-wide suspend (on some -architectures). See freezing-of-tasks.txt for details. - -Q: What is the difference between "platform" and "shutdown"? - -A: - -shutdown: save state in linux, then tell bios to powerdown - -platform: save state in linux, then tell bios to powerdown and blink - "suspended led" - -"platform" is actually right thing to do where supported, but -"shutdown" is most reliable (except on ACPI systems). - -Q: I do not understand why you have such strong objections to idea of -selective suspend. - -A: Do selective suspend during runtime power management, that's okay. 
But -it's useless for suspend-to-disk. (And I do not see how you could use -it for suspend-to-ram, I hope you do not want that). - -Lets see, so you suggest to - -* SUSPEND all but swap device and parents -* Snapshot -* Write image to disk -* SUSPEND swap device and parents -* Powerdown - -Oh no, that does not work, if swap device or its parents uses DMA, -you've corrupted data. You'd have to do - -* SUSPEND all but swap device and parents -* FREEZE swap device and parents -* Snapshot -* UNFREEZE swap device and parents -* Write -* SUSPEND swap device and parents - -Which means that you still need that FREEZE state, and you get more -complicated code. (And I have not yet introduce details like system -devices). - -Q: There don't seem to be any generally useful behavioral -distinctions between SUSPEND and FREEZE. - -A: Doing SUSPEND when you are asked to do FREEZE is always correct, -but it may be unnecessarily slow. If you want your driver to stay simple, -slowness may not matter to you. It can always be fixed later. - -For devices like disk it does matter, you do not want to spindown for -FREEZE. - -Q: After resuming, system is paging heavily, leading to very bad interactivity. - -A: Try running - -cat /proc/[0-9]*/maps | grep / | sed 's:.* /:/:' | sort -u | while read file -do - test -f "$file" && cat "$file" > /dev/null -done - -after resume. swapoff -a; swapon -a may also be useful. - -Q: What happens to devices during swsusp? They seem to be resumed -during system suspend? - -A: That's correct. We need to resume them if we want to write image to -disk. Whole sequence goes like - - Suspend part - ~~~~~~~~~~~~ - running system, user asks for suspend-to-disk - - user processes are stopped - - suspend(PMSG_FREEZE): devices are frozen so that they don't interfere - with state snapshot - - state snapshot: copy of whole used memory is taken with interrupts disabled - - resume(): devices are woken up so that we can write image to swap - - write image to swap - - suspend(PMSG_SUSPEND): suspend devices so that we can power off - - turn the power off - - Resume part - ~~~~~~~~~~~ - (is actually pretty similar) - - running system, user asks for suspend-to-disk - - user processes are stopped (in common case there are none, but with resume-from-initrd, no one knows) - - read image from disk - - suspend(PMSG_FREEZE): devices are frozen so that they don't interfere - with image restoration - - image restoration: rewrite memory with image - - resume(): devices are woken up so that system can continue - - thaw all user processes - -Q: What is this 'Encrypt suspend image' for? - -A: First of all: it is not a replacement for dm-crypt encrypted swap. -It cannot protect your computer while it is suspended. Instead it does -protect from leaking sensitive data after resume from suspend. - -Think of the following: you suspend while an application is running -that keeps sensitive data in memory. The application itself prevents -the data from being swapped out. Suspend, however, must write these -data to swap to be able to resume later on. Without suspend encryption -your sensitive data are then stored in plaintext on disk. This means -that after resume your sensitive data are accessible to all -applications having direct access to the swap device which was used -for suspend. If you don't need swap after resume these data can remain -on disk virtually forever. 
Thus it can happen that your system gets -broken in weeks later and sensitive data which you thought were -encrypted and protected are retrieved and stolen from the swap device. -To prevent this situation you should use 'Encrypt suspend image'. - -During suspend a temporary key is created and this key is used to -encrypt the data written to disk. When, during resume, the data was -read back into memory the temporary key is destroyed which simply -means that all data written to disk during suspend are then -inaccessible so they can't be stolen later on. The only thing that -you must then take care of is that you call 'mkswap' for the swap -partition used for suspend as early as possible during regular -boot. This asserts that any temporary key from an oopsed suspend or -from a failed or aborted resume is erased from the swap device. - -As a rule of thumb use encrypted swap to protect your data while your -system is shut down or suspended. Additionally use the encrypted -suspend image to prevent sensitive data from being stolen after -resume. - -Q: Can I suspend to a swap file? - -A: Generally, yes, you can. However, it requires you to use the "resume=" and -"resume_offset=" kernel command line parameters, so the resume from a swap file -cannot be initiated from an initrd or initramfs image. See -swsusp-and-swap-files.txt for details. - -Q: Is there a maximum system RAM size that is supported by swsusp? - -A: It should work okay with highmem. - -Q: Does swsusp (to disk) use only one swap partition or can it use -multiple swap partitions (aggregate them into one logical space)? - -A: Only one swap partition, sorry. - -Q: If my application(s) causes lots of memory & swap space to be used -(over half of the total system RAM), is it correct that it is likely -to be useless to try to suspend to disk while that app is running? - -A: No, it should work okay, as long as your app does not mlock() -it. Just prepare big enough swap partition. - -Q: What information is useful for debugging suspend-to-disk problems? - -A: Well, last messages on the screen are always useful. If something -is broken, it is usually some kernel driver, therefore trying with as -little as possible modules loaded helps a lot. I also prefer people to -suspend from console, preferably without X running. Booting with -init=/bin/bash, then swapon and starting suspend sequence manually -usually does the trick. Then it is good idea to try with latest -vanilla kernel. - -Q: How can distributions ship a swsusp-supporting kernel with modular -disk drivers (especially SATA)? - -A: Well, it can be done, load the drivers, then do echo into -/sys/power/resume file from initrd. Be sure not to mount -anything, not even read-only mount, or you are going to lose your -data. - -Q: How do I make suspend more verbose? - -A: If you want to see any non-error kernel messages on the virtual -terminal the kernel switches to during suspend, you have to set the -kernel console loglevel to at least 4 (KERN_WARNING), for example by -doing - - # save the old loglevel - read LOGLEVEL DUMMY < /proc/sys/kernel/printk - # set the loglevel so we see the progress bar. - # if the level is higher than needed, we leave it alone. - if [ $LOGLEVEL -lt 5 ]; then - echo 5 > /proc/sys/kernel/printk - fi - - IMG_SZ=0 - read IMG_SZ < /sys/power/image_size - echo -n disk > /sys/power/state - RET=$? - # - # the logic here is: - # if image_size > 0 (without kernel support, IMG_SZ will be zero), - # then try again with image_size set to zero. 
- if [ $RET -ne 0 -a $IMG_SZ -ne 0 ]; then # try again with minimal image size - echo 0 > /sys/power/image_size - echo -n disk > /sys/power/state - RET=$? - fi - - # restore previous loglevel - echo $LOGLEVEL > /proc/sys/kernel/printk - exit $RET - -Q: Is this true that if I have a mounted filesystem on a USB device and -I suspend to disk, I can lose data unless the filesystem has been mounted -with "sync"? - -A: That's right ... if you disconnect that device, you may lose data. -In fact, even with "-o sync" you can lose data if your programs have -information in buffers they haven't written out to a disk you disconnect, -or if you disconnect before the device finished saving data you wrote. - -Software suspend normally powers down USB controllers, which is equivalent -to disconnecting all USB devices attached to your system. - -Your system might well support low-power modes for its USB controllers -while the system is asleep, maintaining the connection, using true sleep -modes like "suspend-to-RAM" or "standby". (Don't write "disk" to the -/sys/power/state file; write "standby" or "mem".) We've not seen any -hardware that can use these modes through software suspend, although in -theory some systems might support "platform" modes that won't break the -USB connections. - -Remember that it's always a bad idea to unplug a disk drive containing a -mounted filesystem. That's true even when your system is asleep! The -safest thing is to unmount all filesystems on removable media (such USB, -Firewire, CompactFlash, MMC, external SATA, or even IDE hotplug bays) -before suspending; then remount them after resuming. - -There is a work-around for this problem. For more information, see -Documentation/driver-api/usb/persist.rst. - -Q: Can I suspend-to-disk using a swap partition under LVM? - -A: Yes and No. You can suspend successfully, but the kernel will not be able -to resume on its own. You need an initramfs that can recognize the resume -situation, activate the logical volume containing the swap volume (but not -touch any filesystems!), and eventually call - -echo -n "$major:$minor" > /sys/power/resume - -where $major and $minor are the respective major and minor device numbers of -the swap volume. - -uswsusp works with LVM, too. See http://suspend.sourceforge.net/ - -Q: I upgraded the kernel from 2.6.15 to 2.6.16. Both kernels were -compiled with the similar configuration files. Anyway I found that -suspend to disk (and resume) is much slower on 2.6.16 compared to -2.6.15. Any idea for why that might happen or how can I speed it up? - -A: This is because the size of the suspend image is now greater than -for 2.6.15 (by saving more data we can get more responsive system -after resume). - -There's the /sys/power/image_size knob that controls the size of the -image. If you set it to 0 (eg. by echo 0 > /sys/power/image_size as -root), the 2.6.15 behavior should be restored. If it is still too -slow, take a look at suspend.sf.net -- userland suspend is faster and -supports LZF compression to speed it up further. diff --git a/Documentation/power/tricks.rst b/Documentation/power/tricks.rst new file mode 100644 index 000000000000..ca787f142c3f --- /dev/null +++ b/Documentation/power/tricks.rst @@ -0,0 +1,29 @@ +================ +swsusp/S3 tricks +================ + +Pavel Machek + +If you want to trick swsusp/S3 into working, you might want to try: + +* go with minimal config, turn off drivers like USB, AGP you don't + really need + +* turn off APIC and preempt + +* use ext2. 
At least it has working fsck. [If something seems to go
+  wrong, force fsck when you have a chance]
+
+* turn off modules
+
+* use vga text console, shut down X. [If you really want X, you might
+  want to try vesafb later]
+
+* try running as few processes as possible, preferably go to single
+  user mode.
+
+* due to video issues, swsusp should be easier to get working than
+  S3. Try that first.
+
+When you make it work, try to find out what exactly it was that broke
+suspend, and preferably fix that.
diff --git a/Documentation/power/tricks.txt b/Documentation/power/tricks.txt
deleted file mode 100644
index a1b8f7249f4c..000000000000
--- a/Documentation/power/tricks.txt
+++ /dev/null
@@ -1,27 +0,0 @@
-	swsusp/S3 tricks
-	~~~~~~~~~~~~~~~~
-Pavel Machek
-
-If you want to trick swsusp/S3 into working, you might want to try:
-
-* go with minimal config, turn off drivers like USB, AGP you don't
-  really need
-
-* turn off APIC and preempt
-
-* use ext2. At least it has working fsck. [If something seems to go
-  wrong, force fsck when you have a chance]
-
-* turn off modules
-
-* use vga text console, shut down X. [If you really want X, you might
-  want to try vesafb later]
-
-* try running as few processes as possible, preferably go to single
-  user mode.
-
-* due to video issues, swsusp should be easier to get working than
-  S3. Try that first.
-
-When you make it work, try to find out what exactly was it that broke
-suspend, and preferably fix that.
diff --git a/Documentation/power/userland-swsusp.rst b/Documentation/power/userland-swsusp.rst
new file mode 100644
index 000000000000..a0fa51bb1a4d
--- /dev/null
+++ b/Documentation/power/userland-swsusp.rst
@@ -0,0 +1,191 @@
+=====================================================
+Documentation for userland software suspend interface
+=====================================================
+
+	(C) 2006 Rafael J. Wysocki
+
+First, the warnings at the beginning of swsusp.rst still apply.
+
+Second, you should read the FAQ in swsusp.rst _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel. Such utilities are available, for example, from
+<http://suspend.sourceforge.net>. You may want to have a look at them if you
+are going to develop your own suspend/resume utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in include/linux/suspend_ioctls.h. The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing. If open for
+reading, it is considered to be in the suspend mode. Otherwise it is
+assumed to be in the resume mode. The device cannot be open for simultaneous
+reading and writing. It is also impossible to have the device open more than
+once at a time.
+
+Even opening the device has side effects. Data structures are
+allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are
+called.
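+
+As an illustration only, the device node can be created from the numbers
+exported in sysfs (a sketch; the /dev/snapshot path is the conventional
+name used by existing suspend utilities, not one mandated by the kernel)::
+
+	# read "10:231" from sysfs and create a matching character device
+	IFS=: read MAJOR MINOR < /sys/class/misc/snapshot/dev
+	mknod /dev/snapshot c $MAJOR $MINOR
+	# open the node read-only for suspend mode, write-only for resume mode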
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE
+	freeze user space processes (the current process is
+	not frozen); this is required for SNAPSHOT_CREATE_IMAGE
+	and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE
+	thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_CREATE_IMAGE
+	create a snapshot of the system memory; the
+	last argument of ioctl() should be a pointer to an int variable,
+	the value of which will indicate whether the call returned after
+	creating the snapshot (1) or after restoring the system memory state
+	from it (0) (after resume the system finds itself finishing the
+	SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot
+	has been created the read() operation can be used to transfer
+	it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE
+	restore the system memory state from the
+	uploaded snapshot image; before calling it you should transfer
+	the system memory snapshot back to the kernel using the write()
+	operation; this call will not succeed if the snapshot
+	image is not available to the kernel
+
+SNAPSHOT_FREE
+	free memory allocated for the snapshot image
+
+SNAPSHOT_PREF_IMAGE_SIZE
+	set the preferred maximum size of the image
+	(the kernel will do its best to ensure the image size will not exceed
+	this number, but if it turns out to be impossible, the kernel will
+	create the smallest image possible)
+
+SNAPSHOT_GET_IMAGE_SIZE
+	return the actual size of the hibernation image
+
+SNAPSHOT_AVAIL_SWAP_SIZE
+	return the amount of available swap in bytes (the
+	last argument should be a pointer to an unsigned int variable that will
+	contain the result if the call is successful).
+
+SNAPSHOT_ALLOC_SWAP_PAGE
+	allocate a swap page from the resume partition
+	(the last argument should be a pointer to a loff_t variable that
+	will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES
+	free all swap pages allocated by
+	SNAPSHOT_ALLOC_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_AREA
+	set the resume partition and the offset (in <PAGE_SIZE>
+	units) from the beginning of the partition at which the swap header is
+	located (the last ioctl() argument should point to a struct
+	resume_swap_area, as defined in kernel/power/suspend_ioctls.h,
+	containing the resume device specification and the offset); for swap
+	partitions the offset is always 0, but it is different from zero for
+	swap files (see Documentation/power/swsusp-and-swap-files.rst for
+	details).
+
+SNAPSHOT_PLATFORM_SUPPORT
+	enable/disable the hibernation platform support,
+	depending on the argument value (enable, if the argument is nonzero)
+
+SNAPSHOT_POWER_OFF
+	make the kernel transition the system to the hibernation
+	state (eg. ACPI S4) using the platform (eg. ACPI) driver
+
+SNAPSHOT_S2RAM
+	suspend to RAM; using this call causes the kernel to
+	immediately enter the suspend-to-RAM state, so this call must always
+	be preceded by the SNAPSHOT_FREEZE call and it is also necessary
+	to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call
+	is needed to implement the suspend-to-both mechanism in which the
+	suspend image is first created, as though the system had been suspended
+	to disk, and then the system is suspended to RAM (this makes it possible
+	to resume the system from RAM if there's enough battery power or restore
+	its state on the basis of the saved suspend image otherwise)
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel.
It has the following limitations:
+
+- you cannot read() more than one virtual memory page at a time
+- read()s across page boundaries are impossible (ie. if you read() 1/2 of
+  a page in the previous call, you will only be able to read()
+  **at most** 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel. It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_FREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap partition, called the resume
+partition, or a swap file as storage space (if a swap file is used, the resume
+partition is the partition that holds this file). However, this is not really
+required, as they can use, for example, a special (blank) suspend partition or
+a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and
+mounted afterwards.
+
+These utilities MUST NOT make any assumptions regarding the ordering of
+data within the snapshot image. The contents of the image are entirely owned
+by the kernel and its structure may be changed in future kernel releases.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read). Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header. If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+
+1. If the value is 1 (ie. the system memory snapshot has just been
+   created and the system is ready for saving it):
+
+   (a) The suspending utility MUST NOT close the snapshot device
+       _unless_ the whole suspend procedure is to be cancelled, in
+       which case, if the snapshot image has already been saved, the
+       suspending utility SHOULD destroy it, preferably by zapping
+       its header. If the suspend is not to be cancelled, the
+       system MUST be powered off or rebooted after the snapshot
+       image has been saved.
+   (b) The suspending utility SHOULD NOT attempt to perform any
+       file system operations (including reads) on the file systems
+       that were mounted before SNAPSHOT_CREATE_IMAGE has been
+       called. However, it MAY mount a file system that was not
+       mounted at that time and perform some operations on it (eg.
+       use it for saving the image).
+
+2. If the value is 0 (ie.
the system state has just been restored from + the snapshot image), the suspending utility MUST close the snapshot + device. Afterwards it will be treated as a regular userland process, + so it need not exit. + +The resuming utility SHOULD NOT attempt to mount any file systems that could +be mounted before suspend and SHOULD NOT attempt to perform any operations +involving such file systems. + +For details, please refer to the source code. diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt deleted file mode 100644 index bbfcd1bbedc5..000000000000 --- a/Documentation/power/userland-swsusp.txt +++ /dev/null @@ -1,170 +0,0 @@ -Documentation for userland software suspend interface - (C) 2006 Rafael J. Wysocki - -First, the warnings at the beginning of swsusp.txt still apply. - -Second, you should read the FAQ in swsusp.txt _now_ if you have not -done it already. - -Now, to use the userland interface for software suspend you need special -utilities that will read/write the system memory snapshot from/to the -kernel. Such utilities are available, for example, from -. You may want to have a look at them if you -are going to develop your own suspend/resume utilities. - -The interface consists of a character device providing the open(), -release(), read(), and write() operations as well as several ioctl() -commands defined in include/linux/suspend_ioctls.h . The major and minor -numbers of the device are, respectively, 10 and 231, and they can -be read from /sys/class/misc/snapshot/dev. - -The device can be open either for reading or for writing. If open for -reading, it is considered to be in the suspend mode. Otherwise it is -assumed to be in the resume mode. The device cannot be open for simultaneous -reading and writing. It is also impossible to have the device open more than -once at a time. - -Even opening the device has side effects. Data structures are -allocated, and PM_HIBERNATION_PREPARE / PM_RESTORE_PREPARE chains are -called. 
- -The ioctl() commands recognized by the device are: - -SNAPSHOT_FREEZE - freeze user space processes (the current process is - not frozen); this is required for SNAPSHOT_CREATE_IMAGE - and SNAPSHOT_ATOMIC_RESTORE to succeed - -SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE - -SNAPSHOT_CREATE_IMAGE - create a snapshot of the system memory; the - last argument of ioctl() should be a pointer to an int variable, - the value of which will indicate whether the call returned after - creating the snapshot (1) or after restoring the system memory state - from it (0) (after resume the system finds itself finishing the - SNAPSHOT_CREATE_IMAGE ioctl() again); after the snapshot - has been created the read() operation can be used to transfer - it out of the kernel - -SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the - uploaded snapshot image; before calling it you should transfer - the system memory snapshot back to the kernel using the write() - operation; this call will not succeed if the snapshot - image is not available to the kernel - -SNAPSHOT_FREE - free memory allocated for the snapshot image - -SNAPSHOT_PREF_IMAGE_SIZE - set the preferred maximum size of the image - (the kernel will do its best to ensure the image size will not exceed - this number, but if it turns out to be impossible, the kernel will - create the smallest image possible) - -SNAPSHOT_GET_IMAGE_SIZE - return the actual size of the hibernation image - -SNAPSHOT_AVAIL_SWAP_SIZE - return the amount of available swap in bytes (the - last argument should be a pointer to an unsigned int variable that will - contain the result if the call is successful). - -SNAPSHOT_ALLOC_SWAP_PAGE - allocate a swap page from the resume partition - (the last argument should be a pointer to a loff_t variable that - will contain the swap page offset if the call is successful) - -SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated by - SNAPSHOT_ALLOC_SWAP_PAGE - -SNAPSHOT_SET_SWAP_AREA - set the resume partition and the offset (in - units) from the beginning of the partition at which the swap header is - located (the last ioctl() argument should point to a struct - resume_swap_area, as defined in kernel/power/suspend_ioctls.h, - containing the resume device specification and the offset); for swap - partitions the offset is always 0, but it is different from zero for - swap files (see Documentation/power/swsusp-and-swap-files.txt for - details). - -SNAPSHOT_PLATFORM_SUPPORT - enable/disable the hibernation platform support, - depending on the argument value (enable, if the argument is nonzero) - -SNAPSHOT_POWER_OFF - make the kernel transition the system to the hibernation - state (eg. ACPI S4) using the platform (eg. ACPI) driver - -SNAPSHOT_S2RAM - suspend to RAM; using this call causes the kernel to - immediately enter the suspend-to-RAM state, so this call must always - be preceded by the SNAPSHOT_FREEZE call and it is also necessary - to use the SNAPSHOT_UNFREEZE call after the system wakes up. This call - is needed to implement the suspend-to-both mechanism in which the - suspend image is first created, as though the system had been suspended - to disk, and then the system is suspended to RAM (this makes it possible - to resume the system from RAM if there's enough battery power or restore - its state on the basis of the saved suspend image otherwise) - -The device's read() operation can be used to transfer the snapshot image from -the kernel. 
It has the following limitations: -- you cannot read() more than one virtual memory page at a time -- read()s across page boundaries are impossible (ie. if you read() 1/2 of - a page in the previous call, you will only be able to read() - _at_ _most_ 1/2 of the page in the next call) - -The device's write() operation is used for uploading the system memory snapshot -into the kernel. It has the same limitations as the read() operation. - -The release() operation frees all memory allocated for the snapshot image -and all swap pages allocated with SNAPSHOT_ALLOC_SWAP_PAGE (if any). -Thus it is not necessary to use either SNAPSHOT_FREE or -SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also -unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are -still frozen when the device is being closed). - -Currently it is assumed that the userland utilities reading/writing the -snapshot image from/to the kernel will use a swap partition, called the resume -partition, or a swap file as storage space (if a swap file is used, the resume -partition is the partition that holds this file). However, this is not really -required, as they can use, for example, a special (blank) suspend partition or -a file on a partition that is unmounted before SNAPSHOT_CREATE_IMAGE and -mounted afterwards. - -These utilities MUST NOT make any assumptions regarding the ordering of -data within the snapshot image. The contents of the image are entirely owned -by the kernel and its structure may be changed in future kernel releases. - -The snapshot image MUST be written to the kernel unaltered (ie. all of the image -data, metadata and header MUST be written in _exactly_ the same amount, form -and order in which they have been read). Otherwise, the behavior of the -resumed system may be totally unpredictable. - -While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the -structure of the snapshot image is consistent with the information stored -in the image header. If any inconsistencies are detected, -SNAPSHOT_ATOMIC_RESTORE will not succeed. Still, this is not a fool-proof -mechanism and the userland utilities using the interface SHOULD use additional -means, such as checksums, to ensure the integrity of the snapshot image. - -The suspending and resuming utilities MUST lock themselves in memory, -preferably using mlockall(), before calling SNAPSHOT_FREEZE. - -The suspending utility MUST check the value stored by SNAPSHOT_CREATE_IMAGE -in the memory location pointed to by the last argument of ioctl() and proceed -in accordance with it: -1. If the value is 1 (ie. the system memory snapshot has just been - created and the system is ready for saving it): - (a) The suspending utility MUST NOT close the snapshot device - _unless_ the whole suspend procedure is to be cancelled, in - which case, if the snapshot image has already been saved, the - suspending utility SHOULD destroy it, preferably by zapping - its header. If the suspend is not to be cancelled, the - system MUST be powered off or rebooted after the snapshot - image has been saved. - (b) The suspending utility SHOULD NOT attempt to perform any - file system operations (including reads) on the file systems - that were mounted before SNAPSHOT_CREATE_IMAGE has been - called. However, it MAY mount a file system that was not - mounted at that time and perform some operations on it (eg. - use it for saving the image). -2. If the value is 0 (ie. 
the system state has just been restored from
-   the snapshot image), the suspending utility MUST close the snapshot
-   device. Afterwards it will be treated as a regular userland process,
-   so it need not exit.
-
-The resuming utility SHOULD NOT attempt to mount any file systems that could
-be mounted before suspend and SHOULD NOT attempt to perform any operations
-involving such file systems.
-
-For details, please refer to the source code.
diff --git a/Documentation/power/video.rst b/Documentation/power/video.rst
new file mode 100644
index 000000000000..337a2ba9f32f
--- /dev/null
+++ b/Documentation/power/video.rst
@@ -0,0 +1,213 @@
+===========================
+Video issues with S3 resume
+===========================
+
+2003-2006, Pavel Machek
+
+During S3 resume, hardware needs to be reinitialized. For most
+devices, this is easy, and the kernel driver knows how to do
+it. Unfortunately there's one exception: the video card. Video cards are
+usually initialized by the BIOS, and the kernel does not have enough
+information to boot the video card. (The kernel usually does not even
+contain a video card driver -- vesafb and vgacon are widely used).
+
+This is not a problem for swsusp, because during swsusp resume the BIOS
+is run normally, so the video card is initialized normally. It should
+not be a problem for S1 standby, because hardware should retain its
+state over that.
+
+We either have to run the video BIOS during early resume, or interpret it
+using vbetool later, or maybe nothing is necessary on a particular
+system because the video state is preserved. Unfortunately different
+methods work on different systems, and no known method suits all of
+them.
+
+A userland application called s2ram has been developed; it contains a
+long whitelist of systems, and automatically selects a working method
+for a given system. It can be downloaded from CVS at
+www.sf.net/projects/suspend . If you have a system that is not in the
+whitelist, please try to find a working solution, and submit a whitelist
+entry so that the work does not need to be repeated.
+
+Currently, the VBE_SAVE method (6 below) works on most
+systems. Unfortunately, vbetool only runs after userland is resumed,
+so it makes debugging of early resume problems
+hard/impossible. Methods that do not rely on userland are preferable.
+
+Details
+~~~~~~~
+
+There are a few types of systems where video works after S3 resume:
+
+(1) systems where the video state is preserved over S3.
+
+(2) systems where it is possible to call the video BIOS during S3
+    resume. Unfortunately, it is not correct to call the video BIOS at
+    that point, but it happens to work on some machines. Use
+    acpi_sleep=s3_bios.
+
+(3) systems that initialize the video card into VGA text mode and where
+    the BIOS works well enough to be able to set the video mode. Use
+    acpi_sleep=s3_mode on these.
+
+(4) on some systems s3_bios kicks video into text mode, and
+    acpi_sleep=s3_bios,s3_mode is needed.
+
+(5) radeon systems, where X can soft-boot your video card. You'll need
+    a new enough X, and a plain text console (no vesafb or radeonfb). See
+    http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information.
+    Alternatively, you should use vbetool (6) instead.
+
+(6) other radeon systems, where vbetool is enough to bring the system
+    back to life. It needs a working text console. Do vbetool vbestate
+    save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool
+    vbestate restore < /tmp/delme; setfont <whatever>, and your video
+    should work.
+
+(7) on some systems, it is possible to boot most of the kernel, and then
+    POSTing the BIOS works.
Ole Rohne has patch to do just that at + http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2. + +(8) on some systems, you can use the video_post utility and or + do echo 3 > /sys/power/state && /usr/sbin/video_post - which will + initialize the display in console mode. If you are in X, you can switch + to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get + the display working in graphical mode again. + +Now, if you pass acpi_sleep=something, and it does not work with your +bios, you'll get a hard crash during resume. Be careful. Also it is +safest to do your experiments with plain old VGA console. The vesafb +and radeonfb (etc) drivers have a tendency to crash the machine during +resume. + +You may have a system where none of above works. At that point you +either invent another ugly hack that works, or write proper driver for +your video card (good luck getting docs :-(). Maybe suspending from X +(proper X, knowing your hardware, not XF68_FBcon) might have better +chance of working. + +Table of known working notebooks: + + +=============================== =============================================== +Model hack (or "how to do it") +=============================== =============================================== +Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI +Acer TM 230 s3_bios (2) +Acer TM 242FX vbetool (6) +Acer TM C110 video_post (8) +Acer TM C300 vga=normal (only suspend on console, not in X), + vbetool (6) or video_post (8) +Acer TM 4052LCi s3_bios (2) +Acer TM 636Lci s3_bios,s3_mode (4) +Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text + console back +Acer TM 660 ??? [#f1]_ +Acer TM 800 vga=normal, X patches, see webpage (5) + or vbetool (6) +Acer TM 803 vga=normal, X patches, see webpage (5) + or vbetool (6) +Acer TM 803LCi vga=normal, vbetool (6) +Arima W730a vbetool needed (6) +Asus L2400D s3_mode (3) [#f2]_ (S1 also works OK) +Asus L3350M (SiS 740) (6) +Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK) +Asus M6887Ne vga=normal, s3_bios (2), use radeon driver + instead of fglrx in x.org +Athlon64 desktop prototype s3_bios (2) +Compal CL-50 ??? [#f1]_ +Compaq Armada E500 - P3-700 none (1) (S1 also works OK) +Compaq Evo N620c vga=normal, s3_bios (2) +Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1 +Dell D600, ATI RV250 vga=normal and X, or try vbestate (6) +Dell D610 vga=normal and X (possibly vbestate (6) too, + but not tested) +Dell Inspiron 4000 ??? [#f1]_ +Dell Inspiron 500m ??? [#f1]_ +Dell Inspiron 510m ??? +Dell Inspiron 5150 vbetool needed (6) +Dell Inspiron 600m ??? [#f1]_ +Dell Inspiron 8200 ??? [#f1]_ +Dell Inspiron 8500 ??? [#f1]_ +Dell Inspiron 8600 ??? [#f1]_ +eMachines athlon64 machines vbetool needed (6) (someone please get + me model #s) +HP NC6000 s3_bios, may not use radeonfb (2); + or vbetool (6) +HP NX7000 ??? [#f1]_ +HP Pavilion ZD7000 vbetool post needed, need open-source nv + driver for X +HP Omnibook XE3 athlon version none (1) +HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV +HP Omnibook XE3L-GF vbetool (6) +HP Omnibook 5150 none (1), (S1 also works OK) +IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 + Savage/IX-MV, vesafb gets "interesting" + but X work. +IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with + BIOS 1.04 2002-08-23, but not at all with + BIOS 1.11 2004-11-05 :-(] +IBM TP R32 / Type 2658-MMG none (1) +IBM TP R40 2722B3G ??? [#f1]_ +IBM TP R50p / Type 1832-22U s3_bios (2) +IBM TP R51 none (1) +IBM TP T30 236681A ??? 
[#f1]_ +IBM TP T40 / Type 2373-MU4 none (1) +IBM TP T40p none (1) +IBM TP R40p s3_bios (2) +IBM TP T41p s3_bios (2), switch to X after resume +IBM TP T42 s3_bios (2) +IBM ThinkPad T42p (2373-GTG) s3_bios (2) +IBM TP X20 ??? [#f1]_ +IBM TP X30 s3_bios, s3_mode (4) +IBM TP X31 / Type 2672-XXH none (1), use radeontool + (http://fdd.com/software/radeon/) to + turn off backlight. +IBM TP X32 none (1), but backlight is on and video is + trashed after long suspend. s3_bios, + s3_mode (4) works too. Perhaps that gets + better results? +IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4) +IBM TP 600e none(1), but a switch to console and + back to X is needed +Medion MD4220 ??? [#f1]_ +Samsung P35 vbetool needed (6) +Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off +Sony Vaio PCG-C1VRX/K s3_bios (2) +Sony Vaio PCG-F403 ??? [#f1]_ +Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver +Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use + radeontool (http://fdd.com/software/radeon/) + to turn off backlight. +Sony Vaio PCG-N505SN ??? [#f1]_ +Sony Vaio vgn-s260 X or boot-radeon can init it (5) +Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will + be blank unless you return to X. +Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4) +Toshiba Libretto L5 none (1) +Toshiba Libretto 100CT/110CT vbetool (6) +Toshiba Portege 3020CT s3_mode (3) +Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK) +Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK) +Toshiba Satellite 4090XCDT ??? [#f1]_ +Toshiba Satellite P10-554 s3_bios,s3_mode (4)[#f3]_ +Toshiba M30 (2) xor X with nvidia driver using internal AGP +Uniwill 244IIO ??? [#f1]_ +=============================== =============================================== + +Known working desktop systems +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +=================== ============================= ======================== +Mainboard Graphics card hack (or "how to do it") +=================== ============================= ======================== +Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) +=================== ============================= ======================== + + +.. [#f1] from https://wiki.ubuntu.com/HoaryPMResults, not sure + which options to use. If you know, please tell me. + +.. [#f2] To be tested with a newer kernel. + +.. [#f3] Not with SMP kernel, UP only. diff --git a/Documentation/power/video.txt b/Documentation/power/video.txt deleted file mode 100644 index 3e6272bc4472..000000000000 --- a/Documentation/power/video.txt +++ /dev/null @@ -1,185 +0,0 @@ - - Video issues with S3 resume - ~~~~~~~~~~~~~~~~~~~~~~~~~~~ - 2003-2006, Pavel Machek - -During S3 resume, hardware needs to be reinitialized. For most -devices, this is easy, and kernel driver knows how to do -it. Unfortunately there's one exception: video card. Those are usually -initialized by BIOS, and kernel does not have enough information to -boot video card. (Kernel usually does not even contain video card -driver -- vesafb and vgacon are widely used). - -This is not problem for swsusp, because during swsusp resume, BIOS is -run normally so video card is normally initialized. It should not be -problem for S1 standby, because hardware should retain its state over -that. - -We either have to run video BIOS during early resume, or interpret it -using vbetool later, or maybe nothing is necessary on particular -system because video state is preserved. Unfortunately different -methods work on different systems, and no known method suits all of -them. 
- -Userland application called s2ram has been developed; it contains long -whitelist of systems, and automatically selects working method for a -given system. It can be downloaded from CVS at -www.sf.net/projects/suspend . If you get a system that is not in the -whitelist, please try to find a working solution, and submit whitelist -entry so that work does not need to be repeated. - -Currently, VBE_SAVE method (6 below) works on most -systems. Unfortunately, vbetool only runs after userland is resumed, -so it makes debugging of early resume problems -hard/impossible. Methods that do not rely on userland are preferable. - -Details -~~~~~~~ - -There are a few types of systems where video works after S3 resume: - -(1) systems where video state is preserved over S3. - -(2) systems where it is possible to call the video BIOS during S3 - resume. Unfortunately, it is not correct to call the video BIOS at - that point, but it happens to work on some machines. Use - acpi_sleep=s3_bios. - -(3) systems that initialize video card into vga text mode and where - the BIOS works well enough to be able to set video mode. Use - acpi_sleep=s3_mode on these. - -(4) on some systems s3_bios kicks video into text mode, and - acpi_sleep=s3_bios,s3_mode is needed. - -(5) radeon systems, where X can soft-boot your video card. You'll need - a new enough X, and a plain text console (no vesafb or radeonfb). See - http://www.doesi.gmxhome.de/linux/tm800s3/s3.html for more information. - Alternatively, you should use vbetool (6) instead. - -(6) other radeon systems, where vbetool is enough to bring system back - to life. It needs text console to be working. Do vbetool vbestate - save > /tmp/delme; echo 3 > /proc/acpi/sleep; vbetool post; vbetool - vbestate restore < /tmp/delme; setfont , and your video - should work. - -(7) on some systems, it is possible to boot most of kernel, and then - POSTing bios works. Ole Rohne has patch to do just that at - http://dev.gentoo.org/~marineam/patch-radeonfb-2.6.11-rc2-mm2. - -(8) on some systems, you can use the video_post utility and or - do echo 3 > /sys/power/state && /usr/sbin/video_post - which will - initialize the display in console mode. If you are in X, you can switch - to a virtual terminal and back to X using CTRL+ALT+F1 - CTRL+ALT+F7 to get - the display working in graphical mode again. - -Now, if you pass acpi_sleep=something, and it does not work with your -bios, you'll get a hard crash during resume. Be careful. Also it is -safest to do your experiments with plain old VGA console. The vesafb -and radeonfb (etc) drivers have a tendency to crash the machine during -resume. - -You may have a system where none of above works. At that point you -either invent another ugly hack that works, or write proper driver for -your video card (good luck getting docs :-(). Maybe suspending from X -(proper X, knowing your hardware, not XF68_FBcon) might have better -chance of working. - -Table of known working notebooks: - -Model hack (or "how to do it") ------------------------------------------------------------------------------- -Acer Aspire 1406LC ole's late BIOS init (7), turn off DRI -Acer TM 230 s3_bios (2) -Acer TM 242FX vbetool (6) -Acer TM C110 video_post (8) -Acer TM C300 vga=normal (only suspend on console, not in X), vbetool (6) or video_post (8) -Acer TM 4052LCi s3_bios (2) -Acer TM 636Lci s3_bios,s3_mode (4) -Acer TM 650 (Radeon M7) vga=normal plus boot-radeon (5) gets text console back -Acer TM 660 ??? 
(*) -Acer TM 800 vga=normal, X patches, see webpage (5) or vbetool (6) -Acer TM 803 vga=normal, X patches, see webpage (5) or vbetool (6) -Acer TM 803LCi vga=normal, vbetool (6) -Arima W730a vbetool needed (6) -Asus L2400D s3_mode (3)(***) (S1 also works OK) -Asus L3350M (SiS 740) (6) -Asus L3800C (Radeon M7) s3_bios (2) (S1 also works OK) -Asus M6887Ne vga=normal, s3_bios (2), use radeon driver instead of fglrx in x.org -Athlon64 desktop prototype s3_bios (2) -Compal CL-50 ??? (*) -Compaq Armada E500 - P3-700 none (1) (S1 also works OK) -Compaq Evo N620c vga=normal, s3_bios (2) -Dell 600m, ATI R250 Lf none (1), but needs xorg-x11-6.8.1.902-1 -Dell D600, ATI RV250 vga=normal and X, or try vbestate (6) -Dell D610 vga=normal and X (possibly vbestate (6) too, but not tested) -Dell Inspiron 4000 ??? (*) -Dell Inspiron 500m ??? (*) -Dell Inspiron 510m ??? -Dell Inspiron 5150 vbetool needed (6) -Dell Inspiron 600m ??? (*) -Dell Inspiron 8200 ??? (*) -Dell Inspiron 8500 ??? (*) -Dell Inspiron 8600 ??? (*) -eMachines athlon64 machines vbetool needed (6) (someone please get me model #s) -HP NC6000 s3_bios, may not use radeonfb (2); or vbetool (6) -HP NX7000 ??? (*) -HP Pavilion ZD7000 vbetool post needed, need open-source nv driver for X -HP Omnibook XE3 athlon version none (1) -HP Omnibook XE3GC none (1), video is S3 Savage/IX-MV -HP Omnibook XE3L-GF vbetool (6) -HP Omnibook 5150 none (1), (S1 also works OK) -IBM TP T20, model 2647-44G none (1), video is S3 Inc. 86C270-294 Savage/IX-MV, vesafb gets "interesting" but X work. -IBM TP A31 / Type 2652-M5G s3_mode (3) [works ok with BIOS 1.04 2002-08-23, but not at all with BIOS 1.11 2004-11-05 :-(] -IBM TP R32 / Type 2658-MMG none (1) -IBM TP R40 2722B3G ??? (*) -IBM TP R50p / Type 1832-22U s3_bios (2) -IBM TP R51 none (1) -IBM TP T30 236681A ??? (*) -IBM TP T40 / Type 2373-MU4 none (1) -IBM TP T40p none (1) -IBM TP R40p s3_bios (2) -IBM TP T41p s3_bios (2), switch to X after resume -IBM TP T42 s3_bios (2) -IBM ThinkPad T42p (2373-GTG) s3_bios (2) -IBM TP X20 ??? (*) -IBM TP X30 s3_bios, s3_mode (4) -IBM TP X31 / Type 2672-XXH none (1), use radeontool (http://fdd.com/software/radeon/) to turn off backlight. -IBM TP X32 none (1), but backlight is on and video is trashed after long suspend. s3_bios,s3_mode (4) works too. Perhaps that gets better results? -IBM Thinkpad X40 Type 2371-7JG s3_bios,s3_mode (4) -IBM TP 600e none(1), but a switch to console and back to X is needed -Medion MD4220 ??? (*) -Samsung P35 vbetool needed (6) -Sharp PC-AR10 (ATI rage) none (1), backlight does not switch off -Sony Vaio PCG-C1VRX/K s3_bios (2) -Sony Vaio PCG-F403 ??? (*) -Sony Vaio PCG-GRT995MP none (1), works with 'nv' X driver -Sony Vaio PCG-GR7/K none (1), but needs radeonfb, use radeontool (http://fdd.com/software/radeon/) to turn off backlight. -Sony Vaio PCG-N505SN ??? (*) -Sony Vaio vgn-s260 X or boot-radeon can init it (5) -Sony Vaio vgn-S580BH vga=normal, but suspend from X. Console will be blank unless you return to X. -Sony Vaio vgn-FS115B s3_bios (2),s3_mode (4) -Toshiba Libretto L5 none (1) -Toshiba Libretto 100CT/110CT vbetool (6) -Toshiba Portege 3020CT s3_mode (3) -Toshiba Satellite 4030CDT s3_mode (3) (S1 also works OK) -Toshiba Satellite 4080XCDT s3_mode (3) (S1 also works OK) -Toshiba Satellite 4090XCDT ??? (*) -Toshiba Satellite P10-554 s3_bios,s3_mode (4)(****) -Toshiba M30 (2) xor X with nvidia driver using internal AGP -Uniwill 244IIO ??? 
(*) - -Known working desktop systems -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Mainboard Graphics card hack (or "how to do it") ------------------------------------------------------------------------------- -Asus A7V8X nVidia RIVA TNT2 model 64 s3_bios,s3_mode (4) - - -(*) from https://wiki.ubuntu.com/HoaryPMResults, not sure - which options to use. If you know, please tell me. - -(***) To be tested with a newer kernel. - -(****) Not with SMP kernel, UP only. diff --git a/Documentation/process/submitting-drivers.rst b/Documentation/process/submitting-drivers.rst index 58bc047e7b95..1acaa14903d6 100644 --- a/Documentation/process/submitting-drivers.rst +++ b/Documentation/process/submitting-drivers.rst @@ -117,7 +117,7 @@ PM support: implemented") error. You should also try to make sure that your driver uses as little power as possible when it's not doing anything. For the driver testing instructions see - Documentation/power/drivers-testing.txt and for a relatively + Documentation/power/drivers-testing.rst and for a relatively complete overview of the power management issues related to drivers see :ref:`Documentation/driver-api/pm/devices.rst `. diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt index 197d81f4b836..d97207b9accb 100644 --- a/Documentation/scheduler/sched-energy.txt +++ b/Documentation/scheduler/sched-energy.txt @@ -22,7 +22,7 @@ the highest. The actual EM used by EAS is _not_ maintained by the scheduler, but by a dedicated framework. For details about this framework and what it provides, -please refer to its documentation (see Documentation/power/energy-model.txt). +please refer to its documentation (see Documentation/power/energy-model.rst). 2. Background and Terminology @@ -81,7 +81,7 @@ through the arch_scale_cpu_capacity() callback. The rest of platform knowledge used by EAS is directly read from the Energy Model (EM) framework. The EM of a platform is composed of a power cost table -per 'performance domain' in the system (see Documentation/power/energy-model.txt +per 'performance domain' in the system (see Documentation/power/energy-model.rst for futher details about performance domains). The scheduler manages references to the EM objects in the topology code when the @@ -352,7 +352,7 @@ could be amended in the future if proven otherwise. EAS uses the EM of a platform to estimate the impact of scheduling decisions on energy. So, your platform must provide power cost tables to the EM framework in order to make EAS start. To do so, please refer to documentation of the -independent EM framework in Documentation/power/energy-model.txt. +independent EM framework in Documentation/power/energy-model.rst. Please also note that the scheduling domains need to be re-built after the EM has been registered in order to start EAS. diff --git a/Documentation/trace/coresight-cpu-debug.txt b/Documentation/trace/coresight-cpu-debug.txt index f07e38094b40..1a660a39e3c0 100644 --- a/Documentation/trace/coresight-cpu-debug.txt +++ b/Documentation/trace/coresight-cpu-debug.txt @@ -151,7 +151,7 @@ At the runtime you can disable idle states with below methods: It is possible to disable CPU idle states by way of the PM QoS subsystem, more specifically by using the "/dev/cpu_dma_latency" -interface (see Documentation/power/pm_qos_interface.txt for more +interface (see Documentation/power/pm_qos_interface.rst for more details). As specified in the PM QoS documentation the requested parameter will stay in effect until the file descriptor is released. 
For example:
diff --git a/Documentation/translations/zh_CN/process/submitting-drivers.rst b/Documentation/translations/zh_CN/process/submitting-drivers.rst
index 72c6cd935821..f1c3906c69a8 100644
--- a/Documentation/translations/zh_CN/process/submitting-drivers.rst
+++ b/Documentation/translations/zh_CN/process/submitting-drivers.rst
@@ -97,7 +97,7 @@ Linux 2.6:
 	函数定义成返回 -ENOSYS(功能未实现)错误。你还应该尝试确
 	保你的驱动在什么都不干的情况下将耗电降到最低。要获得驱动
 	程序测试的指导,请参阅
-	Documentation/power/drivers-testing.txt。有关驱动程序电
+	Documentation/power/drivers-testing.rst。有关驱动程序电
 	源管理问题相对全面的概述,请参阅
 	Documentation/driver-api/pm/devices.rst。
diff --git a/MAINTAINERS b/MAINTAINERS
index 9c382053ce6a..5a6137df3f0e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6446,7 +6446,7 @@ M:	"Rafael J. Wysocki"
 M:	Pavel Machek
 L:	linux-pm@vger.kernel.org
 S:	Supported
-F:	Documentation/power/freezing-of-tasks.txt
+F:	Documentation/power/freezing-of-tasks.rst
 F:	include/linux/freezer.h
 F:	kernel/freezer.c
@@ -11764,7 +11764,7 @@ S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git
 F:	drivers/opp/
 F:	include/linux/pm_opp.h
-F:	Documentation/power/opp.txt
+F:	Documentation/power/opp.rst
 F:	Documentation/devicetree/bindings/opp/

 OPL4 DRIVER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2bbbd4d1ba31..77a724771dbb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2447,7 +2447,7 @@ menuconfig APM
 	  machines with more than one CPU.

 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and more information, read <file:Documentation/power/apm-acpi.rst>
 	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 066fd2a12851..10d040e2e807 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1175,7 +1175,7 @@ struct skl_wm_params {
 * to be disabled. This shouldn't happen and we'll print some error messages in
 * case it happens.
 *
- * For more, read the Documentation/power/runtime_pm.txt.
+ * For more, read the Documentation/power/runtime_pm.rst.
 */
struct i915_runtime_pm {
	atomic_t wakeref_count;
diff --git a/drivers/opp/Kconfig b/drivers/opp/Kconfig
index a7fbb93f302c..1f64a3d46c8a 100644
--- a/drivers/opp/Kconfig
+++ b/drivers/opp/Kconfig
@@ -10,4 +10,4 @@ config PM_OPP
 	  OPP layer organizes the data internally using device pointers
 	  representing individual voltage domains and provides SOC
 	  implementations a ready to use framework to manage OPPs.
-	  For more information, read <file:Documentation/power/opp.txt>
+	  For more information, read <file:Documentation/power/opp.rst>
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f7033ecf6d0b..11f9c875b028 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -607,7 +607,7 @@ int power_supply_get_battery_info(struct power_supply *psy,

 	/* The property and field names below must correspond to elements
 	 * in enum power_supply_property. For reasoning, see
-	 * Documentation/power/power_supply_class.txt.
+	 * Documentation/power/power_supply_class.rst.
 	 */
 	of_property_read_u32(battery_np, "energy-full-design-microwatt-hours",
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c7eef32e7739..5b8328a99b2a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -52,7 +52,7 @@
 *                irq line disabled until the threaded handler has been run.
 * IRQF_NO_SUSPEND - Do not disable this IRQ during suspend. Does not guarantee
 *                that this interrupt will wake the system from a suspended
- *                state. See Documentation/power/suspend-and-interrupts.txt
+ *                state. See Documentation/power/suspend-and-interrupts.rst
 * IRQF_FORCE_RESUME - Force enable it on resume even if IRQF_NO_SUSPEND is set
 * IRQF_NO_THREAD - Interrupt cannot be threaded
 * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b74b2a4e6df2..3d9a167ca5c3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -807,7 +807,7 @@ struct module;
 * @suspend_late: Put device into low power state.
 * @resume_early: Wake device from low power state.
 * @resume: Wake device from low power state.
- *          (Please see Documentation/power/pci.txt for descriptions
+ *          (Please see Documentation/power/pci.rst for descriptions
 *          of PCI Power Management and the related functions.)
 * @shutdown: Hook into reboot_notifier_list (kernel/sys.c).
 *            Intended to stop any idling DMA operations.
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 66c19a65a514..c14ad8bc1a41 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -284,7 +284,7 @@ typedef struct pm_message {
 * actions to be performed by a device driver's callbacks generally depend on
 * the platform and subsystem the device belongs to.
 *
- * Refer to Documentation/power/runtime_pm.txt for more information about the
+ * Refer to Documentation/power/runtime_pm.rst for more information about the
 * role of the @runtime_suspend(), @runtime_resume() and @runtime_idle()
 * callbacks in device runtime power management.
 */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9bbaaab14b36..7a4dda9e5309 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -65,7 +65,7 @@ config HIBERNATION
 	  need to run mkswap against the swap partition used for the suspend.

 	  It also works with swap files to a limited extent (for details see
-	  <file:Documentation/power/swsusp-and-swap-files.txt>).
+	  <file:Documentation/power/swsusp-and-swap-files.rst>).

 	  Right now you may boot without resuming and resume later but in the
 	  meantime you cannot use the swap partition(s)/file(s) involved in
@@ -74,7 +74,7 @@ config HIBERNATION
 	  MOUNT any journaled filesystems mounted before the suspend or they
 	  will get corrupted in a nasty way.

-	  For more information take a look at <file:Documentation/power/swsusp.txt>.
+	  For more information take a look at <file:Documentation/power/swsusp.rst>.

 config ARCH_SAVE_PAGE_KEYS
 	bool
@@ -255,7 +255,7 @@ config APM_EMULATION
 	  notification of APM "events" (e.g. battery status change).

 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and more information, read <file:Documentation/power/apm-acpi.rst>
 	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index 41722046b937..0cd26289bfbc 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -165,7 +165,7 @@ config CFG80211_DEFAULT_PS

 	  If this causes your applications to misbehave you should fix your
 	  applications instead -- they need to register their network
-	  latency requirement, see Documentation/power/pm_qos_interface.txt.
+	  latency requirement, see Documentation/power/pm_qos_interface.rst.

 config CFG80211_DEBUGFS
	bool "cfg80211 DebugFS entries"
-- cgit v1.2.3-59-g8ed1b

From e1714daad7cf8fe4d6dd91adcfbbdd0604b0210d Mon Sep 17 00:00:00 2001
From: Wolfram Sang
Date: Mon, 3 Jun 2019 10:25:31 +0200
Subject: i2c: headers: don't use 'dev' as adapter variable

It is not a struct device, so 'dev' is confusing. Use 'adap', the most
common name.
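
For illustration only (this sketch is not part of the patch and the
"foo" names are made up), a bus driver typically uses these helpers to
attach its private state to the adapter, which is what the renamed
parameter now reflects:

	/* hypothetical bus driver keeping its state behind the adapter */
	struct foo_i2c {
		struct i2c_adapter adap;
		void __iomem *base;	/* device registers */
	};

	static u32 foo_func(struct i2c_adapter *adap)
	{
		struct foo_i2c *foo = i2c_get_adapdata(adap);

		/* private state is reachable from the adapter again */
		return foo->base ? I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL : 0;
	}

	static int foo_register(struct foo_i2c *foo)
	{
		/* adapter name, algo, owner, etc. would be set up here */
		i2c_set_adapdata(&foo->adap, foo);
		return i2c_add_adapter(&foo->adap);
	}

With 'adap' in the prototypes, the header now reads the same way the
call sites do.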
Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e982b8913b73..6bd199cfe61f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -703,14 +703,14 @@ struct i2c_adapter { }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) -static inline void *i2c_get_adapdata(const struct i2c_adapter *dev) +static inline void *i2c_get_adapdata(const struct i2c_adapter *adap) { - return dev_get_drvdata(&dev->dev); + return dev_get_drvdata(&adap->dev); } -static inline void i2c_set_adapdata(struct i2c_adapter *dev, void *data) +static inline void i2c_set_adapdata(struct i2c_adapter *adap, void *data) { - dev_set_drvdata(&dev->dev, data); + dev_set_drvdata(&adap->dev, data); } static inline struct i2c_adapter * -- cgit v1.2.3-59-g8ed1b From d68222d4d6647611be5a32c80a53a145e7c80ce9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 3 Jun 2019 10:25:32 +0200 Subject: i2c: headers: always have a named variable in arguments Much better to read and understand. Naming for i2c_adapter is not consistent (yet), so use the name which is also used in core code. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 6bd199cfe61f..14e04fb4f46f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -40,7 +40,8 @@ struct i2c_device_identity; union i2c_smbus_data; struct i2c_board_info; enum i2c_slave_event; -typedef int (*i2c_slave_cb_t)(struct i2c_client *, enum i2c_slave_event, u8 *); +typedef int (*i2c_slave_cb_t)(struct i2c_client *client, + enum i2c_slave_event event, u8 *val); struct module; struct property_entry; @@ -257,16 +258,16 @@ struct i2c_driver { unsigned int class; /* Standard driver model interfaces */ - int (*probe)(struct i2c_client *, const struct i2c_device_id *); - int (*remove)(struct i2c_client *); + int (*probe)(struct i2c_client *client, const struct i2c_device_id *id); + int (*remove)(struct i2c_client *client); /* New driver model interface to aid the seamless removal of the * current probe()'s, more commonly unused than used second parameter. */ - int (*probe_new)(struct i2c_client *); + int (*probe_new)(struct i2c_client *client); /* driver model interfaces that don't relate to enumeration */ - void (*shutdown)(struct i2c_client *); + void (*shutdown)(struct i2c_client *client); /* Alert callback, for example for the SMBus alert protocol. * The format and meaning of the data value depends on the protocol. @@ -275,7 +276,7 @@ struct i2c_driver { * For the SMBus Host Notify protocol, the data corresponds to the * 16-bit payload data reported by the slave device acting as master. 
*/ - void (*alert)(struct i2c_client *, enum i2c_alert_protocol protocol, + void (*alert)(struct i2c_client *client, enum i2c_alert_protocol protocol, unsigned int data); /* a ioctl like command that can be used to perform specific functions @@ -287,7 +288,7 @@ struct i2c_driver { const struct i2c_device_id *id_table; /* Device detection callback for automatic device creation */ - int (*detect)(struct i2c_client *, struct i2c_board_info *); + int (*detect)(struct i2c_client *client, struct i2c_board_info *info); const unsigned short *address_list; struct list_head clients; @@ -447,10 +448,10 @@ extern struct i2c_client * i2c_new_probed_device(struct i2c_adapter *adap, struct i2c_board_info *info, unsigned short const *addr_list, - int (*probe)(struct i2c_adapter *, unsigned short addr)); + int (*probe)(struct i2c_adapter *adap, unsigned short addr)); /* Common custom probe functions */ -extern int i2c_probe_func_quick_read(struct i2c_adapter *, unsigned short addr); +extern int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short addr); /* For devices that use several addresses, use i2c_new_dummy() to make * client handles for the extra addresses. @@ -466,7 +467,7 @@ i2c_new_secondary_device(struct i2c_client *client, const char *name, u16 default_addr); -extern void i2c_unregister_device(struct i2c_client *); +extern void i2c_unregister_device(struct i2c_client *client); #endif /* I2C */ /* Mainboard arch_initcall() code should register all its I2C devices. @@ -551,9 +552,9 @@ struct i2c_algorithm { * The main operations are wrapped by i2c_lock_bus and i2c_unlock_bus. */ struct i2c_lock_operations { - void (*lock_bus)(struct i2c_adapter *, unsigned int flags); - int (*trylock_bus)(struct i2c_adapter *, unsigned int flags); - void (*unlock_bus)(struct i2c_adapter *, unsigned int flags); + void (*lock_bus)(struct i2c_adapter *adapter, unsigned int flags); + int (*trylock_bus)(struct i2c_adapter *adapter, unsigned int flags); + void (*unlock_bus)(struct i2c_adapter *adapter, unsigned int flags); }; /** @@ -726,7 +727,7 @@ i2c_parent_is_i2c_adapter(const struct i2c_adapter *adapter) return NULL; } -int i2c_for_each_dev(void *data, int (*fn)(struct device *, void *)); +int i2c_for_each_dev(void *data, int (*fn)(struct device *dev, void *data)); /* Adapter locking functions, exported for shared pin cases */ #define I2C_LOCK_ROOT_ADAPTER BIT(0) @@ -832,12 +833,12 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) /* administration... */ #if IS_ENABLED(CONFIG_I2C) -extern int i2c_add_adapter(struct i2c_adapter *); -extern void i2c_del_adapter(struct i2c_adapter *); -extern int i2c_add_numbered_adapter(struct i2c_adapter *); +extern int i2c_add_adapter(struct i2c_adapter *adap); +extern void i2c_del_adapter(struct i2c_adapter *adap); +extern int i2c_add_numbered_adapter(struct i2c_adapter *adap); -extern int i2c_register_driver(struct module *, struct i2c_driver *); -extern void i2c_del_driver(struct i2c_driver *); +extern int i2c_register_driver(struct module *owner, struct i2c_driver *driver); +extern void i2c_del_driver(struct i2c_driver *driver); /* use a define to avoid include chaining to get THIS_MODULE */ #define i2c_add_driver(driver) \ -- cgit v1.2.3-59-g8ed1b From 2caea56f569ac361fc854f6bf2fe94b70514c917 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 3 Jun 2019 10:25:34 +0200 Subject: i2c: headers: update docs about I2C_CLIENT_* Update kerneldoc for i2c client flags because they increased over time. 
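
As a purely hypothetical usage sketch (not part of this patch; "foo" is
a made-up driver name), a client driver would typically test these
flags like so:

	static int foo_check_client(struct i2c_client *client)
	{
		if (client->flags & I2C_CLIENT_TEN)
			return -EOPNOTSUPP;	/* only 7-bit addresses handled */

		if (client->flags & I2C_CLIENT_PEC)
			dev_dbg(&client->dev, "SMBus PEC enabled\n");

		return 0;
	}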
Also, move them to a position where they can be more easily found. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 14e04fb4f46f..9853fae9b505 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -298,8 +298,7 @@ struct i2c_driver { /** * struct i2c_client - represent an I2C slave device - * @flags: I2C_CLIENT_TEN indicates the device uses a ten bit chip address; - * I2C_CLIENT_PEC indicates it uses SMBus Packet Error Checking + * @flags: see I2C_CLIENT_* for possible flags * @addr: Address used on the I2C bus connected to the parent adapter. * @name: Indicates the type of the device, usually a chip name that's * generic enough to hide second-sourcing and compatible revisions. @@ -317,6 +316,15 @@ struct i2c_driver { */ struct i2c_client { unsigned short flags; /* div., see below */ +#define I2C_CLIENT_PEC 0x04 /* Use Packet Error Checking */ +#define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ + /* Must equal I2C_M_TEN below */ +#define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ +#define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ +#define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ +#define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ + /* Must match I2C_M_STOP|IGNORE_NAK */ + unsigned short addr; /* chip address - NOTE: 7bit */ /* addresses are stored in the */ /* _LOWER_ 7 bits */ @@ -803,16 +811,6 @@ static inline void i2c_mark_adapter_resumed(struct i2c_adapter *adap) i2c_unlock_bus(adap, I2C_LOCK_ROOT_ADAPTER); } -/*flags for the client struct: */ -#define I2C_CLIENT_PEC 0x04 /* Use Packet Error Checking */ -#define I2C_CLIENT_TEN 0x10 /* we have a ten bit chip address */ - /* Must equal I2C_M_TEN below */ -#define I2C_CLIENT_SLAVE 0x20 /* we are the slave */ -#define I2C_CLIENT_HOST_NOTIFY 0x40 /* We want to use I2C host notify */ -#define I2C_CLIENT_WAKE 0x80 /* for board_info; true iff can wake */ -#define I2C_CLIENT_SCCB 0x9000 /* Use Omnivision SCCB protocol */ - /* Must match I2C_M_STOP|IGNORE_NAK */ - /* i2c adapter classes (bitmask) */ #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ #define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ -- cgit v1.2.3-59-g8ed1b From 76cc9f0efd952d376e93e79b1f19fd6fdb8291bc Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 3 Jun 2019 10:25:35 +0200 Subject: i2c: headers: reformat header comment and update copyright Let's stick to coding style. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 9853fae9b505..d8f9060179d0 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -1,16 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* ------------------------------------------------------------------------- */ -/* */ -/* i2c.h - definitions for the i2c-bus interface */ -/* */ -/* ------------------------------------------------------------------------- */ -/* Copyright (C) 1995-2000 Simon G. Vogl - +/* + * i2c.h - definitions for the Linux i2c bus interface + * Copyright (C) 1995-2000 Simon G. 
Vogl + * Copyright (C) 2013-2019 Wolfram Sang + * + * With some changes from Kyösti Mälkki and + * Frodo Looijaard */ -/* ------------------------------------------------------------------------- */ - -/* With some changes from Kyösti Mälkki and - Frodo Looijaard */ #ifndef _LINUX_I2C_H #define _LINUX_I2C_H -- cgit v1.2.3-59-g8ed1b From 013e868bc9465452c7b667830712ab57de236d08 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Wed, 15 May 2019 15:38:47 +0530 Subject: mfd: lp87565: Add support for 4-phase LP87561 combination Add support for 4-phase LP87561 combination. Data Sheet: https://www.ti.com/lit/ds/symlink/lp87561-q1.pdf Signed-off-by: Keerthy Signed-off-by: Lee Jones --- drivers/mfd/lp87565.c | 4 ++++ include/linux/mfd/lp87565.h | 2 ++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/lp87565.c b/drivers/mfd/lp87565.c index 32d2a07d4354..8ad688fe75f9 100644 --- a/drivers/mfd/lp87565.c +++ b/drivers/mfd/lp87565.c @@ -33,6 +33,10 @@ static const struct of_device_id of_lp87565_match_table[] = { .compatible = "ti,lp87565-q1", .data = (void *)LP87565_DEVICE_TYPE_LP87565_Q1, }, + { + .compatible = "ti,lp87561-q1", + .data = (void *)LP87565_DEVICE_TYPE_LP87561_Q1, + }, {} }; MODULE_DEVICE_TABLE(of, of_lp87565_match_table); diff --git a/include/linux/mfd/lp87565.h b/include/linux/mfd/lp87565.h index d0c91ba65525..976447607ea2 100644 --- a/include/linux/mfd/lp87565.h +++ b/include/linux/mfd/lp87565.h @@ -17,6 +17,7 @@ enum lp87565_device_type { LP87565_DEVICE_TYPE_UNKNOWN = 0, + LP87565_DEVICE_TYPE_LP87561_Q1, LP87565_DEVICE_TYPE_LP87565_Q1, }; @@ -249,6 +250,7 @@ enum LP87565_regulator_id { LP87565_BUCK_3, LP87565_BUCK_10, LP87565_BUCK_23, + LP87565_BUCK_3210, }; /** -- cgit v1.2.3-59-g8ed1b From e7488e58c7cfe4be0c52db68622a0397bb75258e Mon Sep 17 00:00:00 2001 From: Yurii Pavlovskyi Date: Tue, 14 May 2019 20:59:01 +0200 Subject: platform/x86: wmi: Add function to get _UID of WMI device Add a new function to acpi.h / wmi.c that returns _UID of the ACPI WMI device. For example, it returns "ATK" for the following declaration in DSDT: Device (ATKD) { Name (_HID, "PNP0C14" /* Windows Management Instrumentation Device */) // _HID: Hardware ID Name (_UID, "ATK") // _UID: Unique ID .. Generally, it is possible that multiple PNP0C14 ACPI devices are present in the system as mentioned in the commit message of commit bff431e49ff5 ("ACPI: WMI: Add ACPI-WMI mapping driver"). Therefore the _UID is returned for a specific ACPI device that declares the given GUID, to which it is also mapped by other methods of wmi module. Signed-off-by: Yurii Pavlovskyi Signed-off-by: Andy Shevchenko --- drivers/platform/x86/wmi.c | 19 +++++++++++++++++++ include/linux/acpi.h | 1 + 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 7b26b6ccf1a0..b08ffb769cbe 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -635,6 +635,25 @@ bool wmi_has_guid(const char *guid_string) } EXPORT_SYMBOL_GPL(wmi_has_guid); +/** + * wmi_get_acpi_device_uid() - Get _UID name of ACPI device that defines GUID + * @guid_string: 36 char string of the form fa50ff2b-f2e8-45de-83fa-65417f2f49ba + * + * Find the _UID of ACPI device associated with this WMI GUID. 
+ * + * Return: The ACPI _UID field value or NULL if the WMI GUID was not found + */ +char *wmi_get_acpi_device_uid(const char *guid_string) +{ + struct wmi_block *wblock = NULL; + + if (!find_guid(guid_string, &wblock)) + return NULL; + + return acpi_device_uid(wblock->acpi_device); +} +EXPORT_SYMBOL_GPL(wmi_get_acpi_device_uid); + static struct wmi_block *dev_to_wblock(struct device *dev) { return container_of(dev, struct wmi_block, dev.dev); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 98440df7fe42..d867a9a904f9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -380,6 +380,7 @@ extern acpi_status wmi_install_notify_handler(const char *guid, extern acpi_status wmi_remove_notify_handler(const char *guid); extern acpi_status wmi_get_event_data(u32 event, struct acpi_buffer *out); extern bool wmi_has_guid(const char *guid); +extern char *wmi_get_acpi_device_uid(const char *guid); #endif /* CONFIG_ACPI_WMI */ -- cgit v1.2.3-59-g8ed1b From e0668f28888184f6c633110a37386f2d4a6fa00e Mon Sep 17 00:00:00 2001 From: Yurii Pavlovskyi Date: Tue, 14 May 2019 21:00:31 +0200 Subject: platform/x86: asus-wmi: Improve DSTS WMI method ID detection The DSTS method detection mistakenly selects DCTS instead of DSTS if nothing is returned when the method ID is not defined in WMNB. As a result, the control of keyboard backlight is not functional for TUF Gaming series laptops. Implement detection based on _UID of the WMI device instead. There is evidence that DCTS is handled by ACPI WMI devices that have _UID ASUSWMI, whereas none of the devices without ASUSWMI respond to DCTS and DSTS is used instead [1]. DSDT examples: FX505GM (_UID ATK): Method (WMNB, 3, Serialized) { ... If ((Local0 == 0x53545344)) { ... Return (Zero) } ... // No return } K54C (_UID ATK): Method (WMNB, 3, Serialized) { ... If ((Local0 == 0x53545344)) { ... Return (0x02) } ... 
Return (0xFFFFFFFE) } [1] Link: https://lkml.org/lkml/2019/4/11/322 Signed-off-by: Yurii Pavlovskyi Suggested-by: Daniel Drake Signed-off-by: Andy Shevchenko --- drivers/hid/hid-asus.c | 2 +- drivers/platform/x86/asus-wmi.c | 23 ++++++++++++++++++++--- include/linux/platform_data/x86/asus-wmi.h | 4 ++-- 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c index 336aeaed1159..1d01fe23ca0c 100644 --- a/drivers/hid/hid-asus.c +++ b/drivers/hid/hid-asus.c @@ -396,7 +396,7 @@ static bool asus_kbd_wmi_led_control_present(struct hid_device *hdev) if (!IS_ENABLED(CONFIG_ASUS_WMI)) return false; - ret = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS2, + ret = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, ASUS_WMI_DEVID_KBD_BACKLIGHT, 0, &value); hid_dbg(hdev, "WMI backlight check: rc %d value %x", ret, value); if (ret) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index c67f11e0d6e7..ef526dcfeac5 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -83,6 +83,8 @@ MODULE_LICENSE("GPL"); #define USB_INTEL_XUSB2PR 0xD0 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 +#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" + static const char * const ashs_ids[] = { "ATK4001", "ATK4002", NULL }; static bool ashs_present(void) @@ -1874,6 +1876,8 @@ static int asus_wmi_sysfs_init(struct platform_device *device) */ static int asus_wmi_platform_init(struct asus_wmi *asus) { + struct device *dev = &asus->platform_device->dev; + char *wmi_uid; int rv; /* INIT enable hotkeys on some models */ @@ -1903,11 +1907,24 @@ static int asus_wmi_platform_init(struct asus_wmi *asus) * Note, on most Eeepc, there is no way to check if a method exist * or note, while on notebooks, they returns 0xFFFFFFFE on failure, * but once again, SPEC may probably be used for that kind of things. + * + * Additionally at least TUF Gaming series laptops return nothing for + * unknown methods, so the detection in this way is not possible. + * + * There is strong indication that only ACPI WMI devices that have _UID + * equal to "ASUSWMI" use DCTS whereas those with "ATK" use DSTS. 
*/ - if (!asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, 0, 0, NULL)) + wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID); + if (!wmi_uid) + return -ENODEV; + + if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) { + dev_info(dev, "Detected ASUSWMI, use DCTS\n"); + asus->dsts_id = ASUS_WMI_METHODID_DCTS; + } else { + dev_info(dev, "Detected %s, not ASUSWMI, use DSTS\n", wmi_uid); asus->dsts_id = ASUS_WMI_METHODID_DSTS; - else - asus->dsts_id = ASUS_WMI_METHODID_DSTS2; + } /* CWAP allow to define the behavior of the Fn+F2 key, * this method doesn't seems to be present on Eee PCs */ diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index bfba245636a7..0668f76df921 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -18,8 +18,8 @@ #define ASUS_WMI_METHODID_GDSP 0x50534447 /* Get DiSPlay output */ #define ASUS_WMI_METHODID_DEVP 0x50564544 /* DEVice Policy */ #define ASUS_WMI_METHODID_OSVR 0x5256534F /* OS VeRsion */ -#define ASUS_WMI_METHODID_DSTS 0x53544344 /* Device STatuS */ -#define ASUS_WMI_METHODID_DSTS2 0x53545344 /* Device STatuS #2*/ +#define ASUS_WMI_METHODID_DCTS 0x53544344 /* Device status (DCTS) */ +#define ASUS_WMI_METHODID_DSTS 0x53545344 /* Device status (DSTS) */ #define ASUS_WMI_METHODID_BSTS 0x53545342 /* Bios STatuS ? */ #define ASUS_WMI_METHODID_DEVS 0x53564544 /* DEVice Set */ #define ASUS_WMI_METHODID_CFVS 0x53564643 /* CPU Frequency Volt Set */ -- cgit v1.2.3-59-g8ed1b From b096f626a6827ad2ced5ebdbdc04e62422d463f6 Mon Sep 17 00:00:00 2001 From: Yurii Pavlovskyi Date: Tue, 14 May 2019 21:07:05 +0200 Subject: platform/x86: asus-wmi: Switch fan boost mode The WMI exposes a write-only device ID where up to three fan modes can be switched on some laptops (TUF Gaming FX505GM). There is a hotkey combination Fn-F5 that does have a fan icon, which is designed to toggle between fan modes. The DSTS of the device ID returns information about the presence of this capability and the presence of each of the two additional fan modes as a bitmask (0x01 - overboost present, 0x02 - silent present) [1]. Add a SysFS entry that reads the last written value and updates value in WMI on write and a hotkey handler that toggles the modes taking into account their availability according to DSTS. Modes: * 0x00 - normal or balanced, * 0x01 - overboost, increased fan RPM, * 0x02 - silent, decreased fan RPM [1] Link: https://lkml.org/lkml/2019/4/12/110 Signed-off-by: Yurii Pavlovskyi Suggested-by: Daniel Drake Signed-off-by: Andy Shevchenko --- Documentation/ABI/testing/sysfs-platform-asus-wmi | 10 ++ drivers/platform/x86/asus-wmi.c | 151 ++++++++++++++++++++-- include/linux/platform_data/x86/asus-wmi.h | 1 + 3 files changed, 154 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi index 019e1e29370e..87ae5cc983bf 100644 --- a/Documentation/ABI/testing/sysfs-platform-asus-wmi +++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi @@ -36,3 +36,13 @@ KernelVersion: 3.5 Contact: "AceLan Kao" Description: Resume on lid open. 1 means on, 0 means off. 
+ +What: /sys/devices/platform//fan_mode +Date: Apr 2019 +KernelVersion: 5.2 +Contact: "Yurii Pavlovskyi" +Description: + Fan boost mode: + * 0 - normal, + * 1 - overboost, + * 2 - silent diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index a1d85667383c..5712bc56fa10 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -70,6 +70,7 @@ MODULE_LICENSE("GPL"); #define NOTIFY_KBD_BRTUP 0xc4 #define NOTIFY_KBD_BRTDWN 0xc5 #define NOTIFY_KBD_BRTTOGGLE 0xc7 +#define NOTIFY_KBD_FBM 0x99 #define ASUS_WMI_FNLOCK_BIOS_DISABLED BIT(0) @@ -80,6 +81,13 @@ MODULE_LICENSE("GPL"); #define ASUS_FAN_CTRL_MANUAL 1 #define ASUS_FAN_CTRL_AUTO 2 +#define ASUS_FAN_MODE_NORMAL 0 +#define ASUS_FAN_MODE_OVERBOOST 1 +#define ASUS_FAN_MODE_OVERBOOST_MASK 0x01 +#define ASUS_FAN_MODE_SILENT 2 +#define ASUS_FAN_MODE_SILENT_MASK 0x02 +#define ASUS_FAN_MODES_MASK 0x03 + #define USB_INTEL_XUSB2PR 0xD0 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 @@ -187,6 +195,10 @@ struct asus_wmi { int asus_hwmon_num_fans; int asus_hwmon_pwm; + bool fan_mode_available; + u8 fan_mode_mask; + u8 fan_mode; + struct hotplug_slot hotplug_slot; struct mutex hotplug_lock; struct mutex wmi_lock; @@ -1483,6 +1495,116 @@ static int asus_wmi_fan_init(struct asus_wmi *asus) return 0; } +/* Fan mode *******************************************************************/ + +static int fan_mode_check_present(struct asus_wmi *asus) +{ + u32 result; + int err; + + asus->fan_mode_available = false; + + err = asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_FAN_MODE, &result); + if (err) { + if (err == -ENODEV) + return 0; + else + return err; + } + + if ((result & ASUS_WMI_DSTS_PRESENCE_BIT) && + (result & ASUS_FAN_MODES_MASK)) { + asus->fan_mode_available = true; + asus->fan_mode_mask = result & ASUS_FAN_MODES_MASK; + } + + return 0; +} + +static int fan_mode_write(struct asus_wmi *asus) +{ + int err; + u8 value; + u32 retval; + + value = asus->fan_mode; + + pr_info("Set fan mode: %u\n", value); + err = asus_wmi_set_devstate(ASUS_WMI_DEVID_FAN_MODE, value, &retval); + + if (err) { + pr_warn("Failed to set fan mode: %d\n", err); + return err; + } + + if (retval != 1) { + pr_warn("Failed to set fan mode (retval): 0x%x\n", retval); + return -EIO; + } + + return 0; +} + +static int fan_mode_switch_next(struct asus_wmi *asus) +{ + if (asus->fan_mode == ASUS_FAN_MODE_NORMAL) { + if (asus->fan_mode_mask & ASUS_FAN_MODE_OVERBOOST_MASK) + asus->fan_mode = ASUS_FAN_MODE_OVERBOOST; + else if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK) + asus->fan_mode = ASUS_FAN_MODE_SILENT; + } else if (asus->fan_mode == ASUS_FAN_MODE_OVERBOOST) { + if (asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK) + asus->fan_mode = ASUS_FAN_MODE_SILENT; + else + asus->fan_mode = ASUS_FAN_MODE_NORMAL; + } else { + asus->fan_mode = ASUS_FAN_MODE_NORMAL; + } + + return fan_mode_write(asus); +} + +static ssize_t fan_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct asus_wmi *asus = dev_get_drvdata(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", asus->fan_mode); +} + +static ssize_t fan_mode_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + int result; + u8 new_mode; + + struct asus_wmi *asus = dev_get_drvdata(dev); + + result = kstrtou8(buf, 10, &new_mode); + if (result < 0) { + pr_warn("Trying to store invalid value\n"); + return result; + } + + if (new_mode == ASUS_FAN_MODE_OVERBOOST) { + if (!(asus->fan_mode_mask & 
ASUS_FAN_MODE_OVERBOOST_MASK)) + return -EINVAL; + } else if (new_mode == ASUS_FAN_MODE_SILENT) { + if (!(asus->fan_mode_mask & ASUS_FAN_MODE_SILENT_MASK)) + return -EINVAL; + } else if (new_mode != ASUS_FAN_MODE_NORMAL) { + return -EINVAL; + } + + asus->fan_mode = new_mode; + fan_mode_write(asus); + + return result; +} + +// Fan mode: 0 - normal, 1 - overboost, 2 - silent +static DEVICE_ATTR_RW(fan_mode); + /* Backlight ******************************************************************/ static int read_backlight_power(struct asus_wmi *asus) @@ -1761,6 +1883,11 @@ static void asus_wmi_handle_event_code(int code, struct asus_wmi *asus) return; } + if (asus->fan_mode_available && code == NOTIFY_KBD_FBM) { + fan_mode_switch_next(asus); + return; + } + if (is_display_toggle(code) && asus->driver->quirks->no_display_toggle) return; @@ -1917,6 +2044,7 @@ static struct attribute *platform_attributes[] = { &dev_attr_touchpad.attr, &dev_attr_lid_resume.attr, &dev_attr_als_enable.attr, + &dev_attr_fan_mode.attr, NULL }; @@ -1938,6 +2066,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj, devid = ASUS_WMI_DEVID_LID_RESUME; else if (attr == &dev_attr_als_enable.attr) devid = ASUS_WMI_DEVID_ALS_ENABLE; + else if (attr == &dev_attr_fan_mode.attr) + ok = asus->fan_mode_available; if (devid != -1) ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0); @@ -2037,12 +2167,7 @@ static int asus_wmi_platform_init(struct asus_wmi *asus) asus_wmi_set_devstate(ASUS_WMI_DEVID_CWAP, asus->driver->quirks->wapf, NULL); - return asus_wmi_sysfs_init(asus->platform_device); -} - -static void asus_wmi_platform_exit(struct asus_wmi *asus) -{ - asus_wmi_sysfs_exit(asus->platform_device); + return 0; } /* debugfs ********************************************************************/ @@ -2200,6 +2325,14 @@ static int asus_wmi_add(struct platform_device *pdev) if (err) goto fail_platform; + err = fan_mode_check_present(asus); + if (err) + goto fail_fan_mode; + + err = asus_wmi_sysfs_init(asus->platform_device); + if (err) + goto fail_sysfs; + err = asus_wmi_input_init(asus); if (err) goto fail_input; @@ -2277,7 +2410,9 @@ fail_leds: fail_hwmon: asus_wmi_input_exit(asus); fail_input: - asus_wmi_platform_exit(asus); + asus_wmi_sysfs_exit(asus->platform_device); +fail_sysfs: +fail_fan_mode: fail_platform: kfree(asus); return err; @@ -2294,7 +2429,7 @@ static int asus_wmi_remove(struct platform_device *device) asus_wmi_led_exit(asus); asus_wmi_rfkill_exit(asus); asus_wmi_debugfs_exit(asus); - asus_wmi_platform_exit(asus); + asus_wmi_sysfs_exit(asus->platform_device); asus_hwmon_fan_set_auto(asus); kfree(asus); diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 0668f76df921..8551156b8dca 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -57,6 +57,7 @@ #define ASUS_WMI_DEVID_KBD_BACKLIGHT 0x00050021 #define ASUS_WMI_DEVID_LIGHT_SENSOR 0x00050022 /* ?? */ #define ASUS_WMI_DEVID_LIGHTBAR 0x00050025 +#define ASUS_WMI_DEVID_FAN_MODE 0x00110018 /* Misc */ #define ASUS_WMI_DEVID_CAMERA 0x00060013 -- cgit v1.2.3-59-g8ed1b From a48e23385fcf397e69e2a75d72a81c545ec8bec2 Mon Sep 17 00:00:00 2001 From: Mattias Jacobsson <2pi@mok.nu> Date: Mon, 27 May 2019 18:21:29 +0200 Subject: platform/x86: wmi: add context pointer field to struct wmi_device_id When using wmi_install_notify_handler() to initialize a WMI handler a data pointer can be supplied which will be passed on to the notification handler. 
No similar feature exists when handling WMI events via struct wmi_driver.
Add a context pointer field to struct wmi_device_id and add a function
find_guid_context() to retrieve that context pointer.

Signed-off-by: Mattias Jacobsson <2pi@mok.nu>
Signed-off-by: Andy Shevchenko
---
 drivers/platform/x86/wmi.c      | 22 ++++++++++++++++++++++
 include/linux/mod_devicetable.h |  1 +
 2 files changed, 23 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c
index b08ffb769cbe..f3be1c008856 100644
--- a/drivers/platform/x86/wmi.c
+++ b/drivers/platform/x86/wmi.c
@@ -146,6 +146,28 @@ static bool find_guid(const char *guid_string, struct wmi_block **out)
 	return false;
 }

+static const void *find_guid_context(struct wmi_block *wblock,
+				     struct wmi_driver *wdriver)
+{
+	const struct wmi_device_id *id;
+	uuid_le guid_input;
+
+	if (wblock == NULL || wdriver == NULL)
+		return NULL;
+	if (wdriver->id_table == NULL)
+		return NULL;
+
+	id = wdriver->id_table;
+	while (*id->guid_string) {
+		if (uuid_le_to_bin(id->guid_string, &guid_input))
+			break;	/* malformed table entry */
+		if (!memcmp(wblock->gblock.guid, &guid_input, 16))
+			return id->context;
+		id++;
+	}
+	return NULL;
+}
+
 static int get_subobj_info(acpi_handle handle, const char *pathname,
			    struct acpi_device_info **info)
 {
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 448621c32e4d..09366859aac2 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -798,6 +798,7 @@ struct tee_client_device_id {
 */
 struct wmi_device_id {
	const char guid_string[UUID_STRING_LEN+1];
+	const void *context;
 };

 #endif /* LINUX_MOD_DEVICETABLE_H */
-- cgit v1.2.3-59-g8ed1b

From 440c4983de262f78033ec58f6abcd199a664327d Mon Sep 17 00:00:00 2001
From: Mattias Jacobsson <2pi@mok.nu>
Date: Mon, 27 May 2019 18:21:30 +0200
Subject: platform/x86: wmi: add context argument to the probe function

The struct wmi_device_id has a context pointer field; forward this
pointer as an argument to the probe function in struct wmi_driver.
Update existing users of the same probe function to accept this new
context argument.
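
As a minimal sketch of the result (illustrative only; the "foo" names
are invented and the GUID is the placeholder one from the wmi.c
kerneldoc), a driver can now attach per-GUID data and receive it at
probe time:

	struct foo_data {
		unsigned int quirks;
	};

	static const struct foo_data foo_model_data = { .quirks = 1 };

	static const struct wmi_device_id foo_wmi_id_table[] = {
		{ .guid_string = "fa50ff2b-f2e8-45de-83fa-65417f2f49ba",
		  .context = &foo_model_data },
		{ }
	};

	static int foo_wmi_probe(struct wmi_device *wdev, const void *context)
	{
		const struct foo_data *data = context;	/* NULL if no match */

		if (data && data->quirks)
			dev_info(&wdev->dev, "applying quirks\n");

		return 0;
	}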
Signed-off-by: Mattias Jacobsson <2pi@mok.nu> Signed-off-by: Andy Shevchenko --- drivers/platform/x86/dell-smbios-wmi.c | 2 +- drivers/platform/x86/dell-wmi-descriptor.c | 3 ++- drivers/platform/x86/dell-wmi.c | 2 +- drivers/platform/x86/huawei-wmi.c | 2 +- drivers/platform/x86/intel-wmi-thunderbolt.c | 3 ++- drivers/platform/x86/wmi-bmof.c | 2 +- drivers/platform/x86/wmi.c | 3 ++- include/linux/wmi.h | 2 +- 8 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/platform/x86/dell-smbios-wmi.c b/drivers/platform/x86/dell-smbios-wmi.c index c3ed3c8c17b9..add2687079f7 100644 --- a/drivers/platform/x86/dell-smbios-wmi.c +++ b/drivers/platform/x86/dell-smbios-wmi.c @@ -146,7 +146,7 @@ fail_smbios_cmd: return ret; } -static int dell_smbios_wmi_probe(struct wmi_device *wdev) +static int dell_smbios_wmi_probe(struct wmi_device *wdev, const void *context) { struct wmi_driver *wdriver = container_of(wdev->dev.driver, struct wmi_driver, driver); diff --git a/drivers/platform/x86/dell-wmi-descriptor.c b/drivers/platform/x86/dell-wmi-descriptor.c index 14ab250b7d5a..9994fd1a5acf 100644 --- a/drivers/platform/x86/dell-wmi-descriptor.c +++ b/drivers/platform/x86/dell-wmi-descriptor.c @@ -106,7 +106,8 @@ EXPORT_SYMBOL_GPL(dell_wmi_get_hotfix); * WMI buffer length 12 4 * WMI hotfix number 16 4 */ -static int dell_wmi_descriptor_probe(struct wmi_device *wdev) +static int dell_wmi_descriptor_probe(struct wmi_device *wdev, + const void *context) { union acpi_object *obj = NULL; struct descriptor_priv *priv; diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c index d118bb73fcae..72b0a69a6ed0 100644 --- a/drivers/platform/x86/dell-wmi.c +++ b/drivers/platform/x86/dell-wmi.c @@ -672,7 +672,7 @@ static int dell_wmi_events_set_enabled(bool enable) return dell_smbios_error(ret); } -static int dell_wmi_probe(struct wmi_device *wdev) +static int dell_wmi_probe(struct wmi_device *wdev, const void *context) { struct dell_wmi_priv *priv; int ret; diff --git a/drivers/platform/x86/huawei-wmi.c b/drivers/platform/x86/huawei-wmi.c index 52fcac5b393a..195a7f3638cb 100644 --- a/drivers/platform/x86/huawei-wmi.c +++ b/drivers/platform/x86/huawei-wmi.c @@ -166,7 +166,7 @@ static int huawei_wmi_input_setup(struct wmi_device *wdev) return input_register_device(priv->idev); } -static int huawei_wmi_probe(struct wmi_device *wdev) +static int huawei_wmi_probe(struct wmi_device *wdev, const void *context) { struct huawei_wmi_priv *priv; int err; diff --git a/drivers/platform/x86/intel-wmi-thunderbolt.c b/drivers/platform/x86/intel-wmi-thunderbolt.c index 4dfa61434a76..974c22a7ff61 100644 --- a/drivers/platform/x86/intel-wmi-thunderbolt.c +++ b/drivers/platform/x86/intel-wmi-thunderbolt.c @@ -56,7 +56,8 @@ static const struct attribute_group tbt_attribute_group = { .attrs = tbt_attrs, }; -static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev) +static int intel_wmi_thunderbolt_probe(struct wmi_device *wdev, + const void *context) { int ret; diff --git a/drivers/platform/x86/wmi-bmof.c b/drivers/platform/x86/wmi-bmof.c index 8751a13134be..105a82b6b076 100644 --- a/drivers/platform/x86/wmi-bmof.c +++ b/drivers/platform/x86/wmi-bmof.c @@ -54,7 +54,7 @@ read_bmof(struct file *filp, struct kobject *kobj, return count; } -static int wmi_bmof_probe(struct wmi_device *wdev) +static int wmi_bmof_probe(struct wmi_device *wdev, const void *context) { struct bmof_priv *priv; int ret; diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index 
f3be1c008856..2163fd8bf9e1 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -945,7 +945,8 @@ static int wmi_dev_probe(struct device *dev) dev_warn(dev, "failed to enable device -- probing anyway\n"); if (wdriver->probe) { - ret = wdriver->probe(dev_to_wdev(dev)); + ret = wdriver->probe(dev_to_wdev(dev), + find_guid_context(wblock, wdriver)); if (ret != 0) goto probe_failure; } diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 592f81afecbb..1e84c474a993 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -44,7 +44,7 @@ struct wmi_driver { struct device_driver driver; const struct wmi_device_id *id_table; - int (*probe)(struct wmi_device *wdev); + int (*probe)(struct wmi_device *wdev, const void *context); int (*remove)(struct wmi_device *wdev); void (*notify)(struct wmi_device *device, union acpi_object *data); long (*filter_callback)(struct wmi_device *wdev, unsigned int cmd, -- cgit v1.2.3-59-g8ed1b From 99600fd47eafd20b9ba6e04562bb2fcc48475344 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 22 Apr 2019 07:15:05 +0800 Subject: clk: Add CLK_HW_INIT_* macros using .parent_hws With the new clk parenting code, struct clk_init_data was expanded to include .parent_hws, for clk drivers to directly list parents by pointing to their respective struct clk_hw's. Add macros that can take either one single struct clk_hw *, or an array of them, for drivers to use. A special CLK_HW_INIT_HWS macro is included, which takes an array of struct clk_hw *, but sets .num_parents to 1. This variant is to allow the reuse of the array, instead of having a compound literal allocated for each clk sharing the same parent. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index bb6118f79784..70aad5cefea7 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -904,6 +904,29 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_HW_INIT_HW(_name, _parent, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_hws = (const struct clk_hw*[]) { _parent }, \ + .num_parents = 1, \ + .ops = _ops, \ + }) + +/* + * This macro is intended for drivers to be able to share the otherwise + * individual struct clk_hw[] compound literals created by the compiler + * when using CLK_HW_INIT_HW. It does NOT support multiple parents. 
+ */ +#define CLK_HW_INIT_HWS(_name, _parent, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_hws = _parent, \ + .num_parents = 1, \ + .ops = _ops, \ + }) + #define CLK_HW_INIT_PARENTS(_name, _parents, _ops, _flags) \ (&(struct clk_init_data) { \ .flags = _flags, \ @@ -913,6 +936,15 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_HW_INIT_PARENTS_HW(_name, _parents, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_hws = _parents, \ + .num_parents = ARRAY_SIZE(_parents), \ + .ops = _ops, \ + }) + #define CLK_HW_INIT_NO_PARENT(_name, _ops, _flags) \ (&(struct clk_init_data) { \ .flags = _flags, \ -- cgit v1.2.3-59-g8ed1b From 2d6b4f33e637bf51c50c536966a19e94a59f3212 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 3 May 2019 11:49:03 +0800 Subject: clk: Add CLK_HW_INIT_FW_NAME macro using .fw_name in .parent_data With the new clk parenting code, clk_init_data was expanded to include .parent_data, for clk drivers that have parents referenced using a combination of device tree clock-names, clock indices, and/or clk_hw pointers. Add a CLK_HW_INIT_FW_NAME macro for specifying a single parent from the device tree using .fw_name in struct clk_parent_data. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 70aad5cefea7..b19063512a29 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -927,6 +927,17 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_HW_INIT_FW_NAME(_name, _parent, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_data = (const struct clk_parent_data[]) { \ + { .fw_name = _parent }, \ + }, \ + .num_parents = 1, \ + .ops = _ops, \ + }) + #define CLK_HW_INIT_PARENTS(_name, _parents, _ops, _flags) \ (&(struct clk_init_data) { \ .flags = _flags, \ -- cgit v1.2.3-59-g8ed1b From 13933109dff0a5abbfc3980304c6c21c90829810 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 22 Apr 2019 07:17:50 +0800 Subject: clk: Add CLK_HW_INIT_PARENT_DATA macro using .parent_data With the new clk parenting code, struct clk_init_data was expanded to include .parent_data, for clk drivers that have parents referenced using a combination of device tree clock-names, clock indices, and/or struct clk_hw pointers. Add a new macro, CLK_HW_INIT_PARENTS_DATA, that can take a list of struct clk_parent_data for drivers to use.
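A sketch of how a provider might use these initializers; the clock names and the empty ops are placeholders rather than code from this series:

#include <linux/clk-provider.h>

static const struct clk_ops example_ops;        /* placeholder; a real driver supplies ops */

static struct clk_hw osc_hw;                    /* assume this clk is registered elsewhere */

/* single parent by clk_hw pointer; one compound literal per clk */
static struct clk_hw bus_hw = {
        .init = CLK_HW_INIT_HW("bus", &osc_hw, &example_ops, 0),
};

/* shared single-entry parent array, reusable by many clks */
static const struct clk_hw *osc_parent[] = { &osc_hw };
static struct clk_hw uart_hw = {
        .init = CLK_HW_INIT_HWS("uart", osc_parent, &example_ops, 0),
};

/* parent resolved through the DT clock-names entry "xtal" */
static struct clk_hw periph_hw = {
        .init = CLK_HW_INIT_FW_NAME("periph", "xtal", &example_ops, 0),
};

/* mixed parent descriptions through clk_parent_data */
static const struct clk_parent_data mux_parents[] = {
        { .fw_name = "xtal" },
        { .hw = &osc_hw },
};
static struct clk_hw mux_hw = {
        .init = CLK_HW_INIT_PARENTS_DATA("mux", mux_parents, &example_ops, 0),
};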
Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index b19063512a29..0fd14c4874d6 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -956,6 +956,15 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_HW_INIT_PARENTS_DATA(_name, _parents, _ops, _flags) \ + (&(struct clk_init_data) { \ + .flags = _flags, \ + .name = _name, \ + .parent_data = _parents, \ + .num_parents = ARRAY_SIZE(_parents), \ + .ops = _ops, \ + }) + #define CLK_HW_INIT_NO_PARENT(_name, _ops, _flags) \ (&(struct clk_init_data) { \ .flags = _flags, \ -- cgit v1.2.3-59-g8ed1b From d7b15114aba956ca395ec5cc28f68fe861ffc208 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 22 Apr 2019 07:19:46 +0800 Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_HW which takes clk_hw pointer as parent With the new clk parenting code, clk_init_data was expanded to include .parent_hws, for clk drivers to directly reference parents by clk_hw. Add a new macro, CLK_FIXED_FACTOR_HW, that can take a struct clk_hw pointer, instead of a string, as its parent. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 0fd14c4874d6..c85e9f3809f2 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -985,6 +985,17 @@ extern struct of_device_id __clk_of_table; _flags), \ } +#define CLK_FIXED_FACTOR_HW(_struct, _name, _parent, \ + _div, _mult, _flags) \ + struct clk_fixed_factor _struct = { \ + .div = _div, \ + .mult = _mult, \ + .hw.init = CLK_HW_INIT_HW(_name, \ + _parent, \ + &clk_fixed_factor_ops, \ + _flags), \ + } + #ifdef CONFIG_OF int of_clk_add_provider(struct device_node *np, struct clk *(*clk_src_get)(struct of_phandle_args *args, -- cgit v1.2.3-59-g8ed1b From 1bef004e2680511ecbb6b5db3954fba430501ecb Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 6 May 2019 10:43:16 +0800 Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_HWS which takes list of struct clk_hw * With the new clk parenting code, clk_init_data was expanded to include .parent_hws, for clk drivers to directly reference parents by clk_hw. Add a new macro, CLK_FIXED_FACTOR_HWS, that can take an array of pointers to struct clk_hw, instead of a string, as its parent. Taking an array instead of a direct pointer allows the reuse of the array for multiple clks, rather than having one compound literal with the same contents allocated for each clk declaration. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index c85e9f3809f2..146a6859969e 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -996,6 +996,21 @@ extern struct of_device_id __clk_of_table; _flags), \ } +/* + * This macro allows the driver to reuse the _parent array for multiple + * fixed factor clk declarations. 
+ */ +#define CLK_FIXED_FACTOR_HWS(_struct, _name, _parent, \ + _div, _mult, _flags) \ + struct clk_fixed_factor _struct = { \ + .div = _div, \ + .mult = _mult, \ + .hw.init = CLK_HW_INIT_HWS(_name, \ + _parent, \ + &clk_fixed_factor_ops, \ + _flags), \ + } + #ifdef CONFIG_OF int of_clk_add_provider(struct device_node *np, struct clk *(*clk_src_get)(struct of_phandle_args *args, -- cgit v1.2.3-59-g8ed1b From 8b13a48b891c7c855e9f3a401d91391a946f4ca7 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Fri, 3 May 2019 11:58:20 +0800 Subject: clk: fixed-factor: Add CLK_FIXED_FACTOR_FW_NAME for DT clock-names parent With the new clk parenting code, clk_init_data was expanded to include .parent_data, for clk drivers to specify parents using a combination of device tree clock-names, pointers to struct clk_hw, device tree clocks, and/or fallback global clock names. Add a new macro, CLK_FIXED_FACTOR_FW_NAME, that takes a string to match a clock-names entry in the device tree to specify the clock parent. Signed-off-by: Chen-Yu Tsai --- include/linux/clk-provider.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 146a6859969e..e5c44f6dd897 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -1011,6 +1011,17 @@ extern struct of_device_id __clk_of_table; .ops = _ops, \ }) +#define CLK_FIXED_FACTOR_FW_NAME(_struct, _name, _parent, \ + _div, _mult, _flags) \ + struct clk_fixed_factor _struct = { \ + .div = _div, \ + .mult = _mult, \ + .hw.init = CLK_HW_INIT_FW_NAME(_name, \ + _parent, \ + &clk_fixed_factor_ops, \ + _flags), \ + } + #ifdef CONFIG_OF int of_clk_add_provider(struct device_node *np, struct clk *(*clk_src_get)(struct of_phandle_args *args, -- cgit v1.2.3-59-g8ed1b From 4eb293487d05a69862a4907ee944aa271ed49a4c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Jun 2019 10:55:32 +0900 Subject: pinctrl: make pinconf.h self-contained This header uses 'bool', but it does not include any header by itself. So, it could cause an unknown type name error, depending on the header include order, although it has probably been included by someone else. Include <linux/types.h> to make it self-contained. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h index 93c9dd133e9d..9bebc3554809 100644 --- a/include/linux/pinctrl/pinconf.h +++ b/include/linux/pinctrl/pinconf.h @@ -14,6 +14,8 @@ #ifdef CONFIG_PINCONF +#include <linux/types.h> + struct pinctrl_dev; struct seq_file; -- cgit v1.2.3-59-g8ed1b From 29875a52915e09abb9703722054f6443cb492ccc Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Fri, 12 Oct 2018 17:06:06 +0200 Subject: mm: Add an apply_to_pfn_range interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is basically apply_to_page_range with added functionality: Allocating missing parts of the page table becomes optional, which means that the function can be guaranteed not to error if allocation is disabled. Also passing of the closure struct and callback function becomes different and more in line with how things are done elsewhere.
Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range. The reason for not using the page-walk code is that we want to perform the page-walk on vmas pointing to an address space without requiring the mmap_sem to be held rather than on vmas belonging to a process with the mmap_sem held. Notable changes since RFC: Don't export apply_to_pfn_range. Cc: Andrew Morton Cc: Matthew Wilcox Cc: Will Deacon Cc: Peter Zijlstra Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Cc: Huang Ying Cc: Souptick Joarder Cc: "Jérôme Glisse" Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Thomas Hellstrom Reviewed-by: Ralph Campbell #v1 --- include/linux/mm.h | 10 ++++ mm/memory.c | 135 ++++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 113 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e8834ac32b7..3d06ce2a64af 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2675,6 +2675,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +struct pfn_range_apply; +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, + struct pfn_range_apply *closure); +struct pfn_range_apply { + struct mm_struct *mm; + pter_fn_t ptefn; + unsigned int alloc; +}; +extern int apply_to_pfn_range(struct pfn_range_apply *closure, + unsigned long address, unsigned long size); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); diff --git a/mm/memory.c b/mm/memory.c index 168f546af1ad..462aa47f8878 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long } EXPORT_SYMBOL(vm_iomap_memory); -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd, + unsigned long addr, unsigned long end) { pte_t *pte; int err; pgtable_t token; spinlock_t *uninitialized_var(ptl); - pte = (closure->mm == &init_mm) ?
pte_alloc_kernel(pmd, addr) : - pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; @@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, token = pmd_pgtable(*pmd); do { - err = fn(pte++, token, addr, data); + err = closure->ptefn(pte++, token, addr, closure); if (err) break; } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - if (mm != &init_mm) + if (closure->mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; } -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud, + unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; - int err; + int err = 0; BUG_ON(pud_huge(*pud)); - pmd = pmd_alloc(mm, pud, addr); + pmd = pmd_alloc(closure->mm, pud, addr); if (!pmd) return -ENOMEM; + do { next = pmd_addr_end(addr, end); - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (!closure->alloc && pmd_none_or_clear_bad(pmd)) + continue; + err = apply_to_pte_range(closure, pmd, addr, next); if (err) break; } while (pmd++, addr = next, addr != end); return err; } -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d, + unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; - int err; + int err = 0; - pud = pud_alloc(mm, p4d, addr); + pud = pud_alloc(closure->mm, p4d, addr); if (!pud) return -ENOMEM; + do { next = pud_addr_end(addr, end); - err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (!closure->alloc && pud_none_or_clear_bad(pud)) + continue; + err = apply_to_pmd_range(closure, pud, addr, next); if (err) break; } while (pud++, addr = next, addr != end); return err; } -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd, + unsigned long addr, unsigned long end) { p4d_t *p4d; unsigned long next; - int err; + int err = 0; - p4d = p4d_alloc(mm, pgd, addr); + p4d = p4d_alloc(closure->mm, pgd, addr); if (!p4d) return -ENOMEM; + do { next = p4d_addr_end(addr, end); - err = apply_to_pud_range(mm, p4d, addr, next, fn, data); + if (!closure->alloc && p4d_none_or_clear_bad(p4d)) + continue; + err = apply_to_pud_range(closure, p4d, addr, next); if (err) break; } while (p4d++, addr = next, addr != end); return err; } -/* - * Scan a region of virtual memory, filling in page tables as necessary - * and calling a provided function on each leaf page table. +/** + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided + * function on each leaf page table entry + * @closure: Details about how to scan and what function to apply + * @addr: Start virtual address + * @size: Size of the region + * + * If @closure->alloc is set to 1, the function will fill in the page table + * as necessary. Otherwise it will skip non-present parts. + * Note: The caller must ensure that the range does not contain huge pages. + * The caller must also assure that the proper mmu_notifier functions are + * called before and after the call to apply_to_pfn_range. + * + * WARNING: Do not use this function unless you know exactly what you are + * doing. 
It is lacking support for huge pages and transparent huge pages. + * + * Return: Zero on success. If the provided function returns a non-zero status, + * the page table walk will terminate and that status will be returned. + * If @closure->alloc is set to 1, then this function may also return memory + * allocation errors arising from allocating page table memory. */ -int apply_to_page_range(struct mm_struct *mm, unsigned long addr, - unsigned long size, pte_fn_t fn, void *data) +int apply_to_pfn_range(struct pfn_range_apply *closure, + unsigned long addr, unsigned long size) { pgd_t *pgd; unsigned long next; @@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (WARN_ON(addr >= end)) return -EINVAL; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(closure->mm, addr); do { next = pgd_addr_end(addr, end); - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); + if (!closure->alloc && pgd_none_or_clear_bad(pgd)) + continue; + err = apply_to_p4d_range(closure, pgd, addr, next); if (err) break; } while (pgd++, addr = next, addr != end); return err; } + +/** + * struct page_range_apply - Closure structure for apply_to_page_range() + * @pter: The base closure structure we derive from + * @fn: The leaf pte function to call + * @data: The leaf pte function closure + */ +struct page_range_apply { + struct pfn_range_apply pter; + pte_fn_t fn; + void *data; +}; + +/* + * Callback wrapper to enable use of apply_to_pfn_range for + * the apply_to_page_range interface + */ +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *pter) +{ + struct page_range_apply *pra = + container_of(pter, typeof(*pra), pter); + + return pra->fn(pte, token, addr, pra->data); +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + * + * WARNING: Do not use this function unless you know exactly what you are + * doing. It is lacking support for huge pages and transparent huge pages. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + struct page_range_apply pra = { + .pter = {.mm = mm, + .alloc = 1, + .ptefn = apply_to_page_range_wrapper }, + .fn = fn, + .data = data + }; + + return apply_to_pfn_range(&pra.pter, addr, size); +} EXPORT_SYMBOL_GPL(apply_to_page_range); /* -- cgit v1.2.3-59-g8ed1b From 4fe51e9e7902b5724b618dadd9527b1bbf2b55cc Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Tue, 19 Mar 2019 13:12:30 +0100 Subject: mm: Add write-protect and clean utilities for address space ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add two utilities to a) write-protect and b) clean all ptes pointing into a range of an address space. The utilities are intended to aid in tracking dirty pages (either driver-allocated system memory or pci device memory). The write-protect utility should be used in conjunction with page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page accesses. Typically one would want to use this on sparse accesses into large memory regions. The clean utility should be used to utilize hardware dirtying functionality and avoid the overhead of page-faults, typically on large accesses into small memory regions. The added file "as_dirty_helpers.c" is initially listed as maintained by VMware under our DRM driver. If somebody would like it elsewhere, that's of course no problem. 
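A rough usage sketch of the two helpers being added; the window size, bitmap, and wrapper function are hypothetical, and only the two exported signatures come from this patch:

#include <linux/bitops.h>
#include <linux/mm.h>

#define EX_NR_PAGES 256 /* hypothetical tracking window, in pages */

static unsigned long ex_bitmap[BITS_TO_LONGS(EX_NR_PAGES)];

static void example_harvest_dirty(struct address_space *mapping,
                                  pgoff_t first_index)
{
        /* start >= end signals "no bits set yet" to apply_as_clean() */
        pgoff_t start = EX_NR_PAGES, end = 0;

        /* route future writes through page_mkwrite()/pfn_mkwrite() */
        apply_as_wrprotect(mapping, first_index, EX_NR_PAGES);

        /* collect hardware dirty bits without taking write faults */
        apply_as_clean(mapping, first_index, EX_NR_PAGES, first_index,
                       ex_bitmap, &start, &end);

        /* bits [start, end) of ex_bitmap now mark dirty page offsets */
}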
Notable changes since RFC: - Added comments to help avoid the usage of these function for VMAs it's not intended for. We also do advisory checks on the vm_flags and warn on illegal usage. - Perform the pte modifications the same way softdirty does. - Add mmu_notifier range invalidation calls. - Add a config option so that this code is not unconditionally included. - Tell the mmu_gather code about pending tlb flushes. Cc: Andrew Morton Cc: Matthew Wilcox Cc: Will Deacon Cc: Peter Zijlstra Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Cc: Huang Ying Cc: Souptick Joarder Cc: "Jérôme Glisse" Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Thomas Hellstrom Reviewed-by: Ralph Campbell #v1 --- MAINTAINERS | 1 + include/linux/mm.h | 9 +- mm/Kconfig | 3 + mm/Makefile | 1 + mm/as_dirty_helpers.c | 300 ++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 313 insertions(+), 1 deletion(-) create mode 100644 mm/as_dirty_helpers.c (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 7a2f487ea49a..a55d4ef91b0b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5179,6 +5179,7 @@ T: git git://people.freedesktop.org/~thomash/linux S: Supported F: drivers/gpu/drm/vmwgfx/ F: include/uapi/drm/vmwgfx_drm.h +F: mm/as_dirty_helpers.c DRM DRIVERS M: David Airlie diff --git a/include/linux/mm.h b/include/linux/mm.h index 3d06ce2a64af..a0bc2a82917e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2685,7 +2685,14 @@ struct pfn_range_apply { }; extern int apply_to_pfn_range(struct pfn_range_apply *closure, unsigned long address, unsigned long size); - +unsigned long apply_as_wrprotect(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr); +unsigned long apply_as_clean(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); diff --git a/mm/Kconfig b/mm/Kconfig index f0c76ba47695..5006d0e6a5c7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -765,4 +765,7 @@ config GUP_BENCHMARK config ARCH_HAS_PTE_SPECIAL bool +config AS_DIRTY_HELPERS + bool + endmenu diff --git a/mm/Makefile b/mm/Makefile index ac5e5ba78874..f5d412bbc2f7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c new file mode 100644 index 000000000000..f600e31534fb --- /dev/null +++ b/mm/as_dirty_helpers.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +/** + * struct apply_as - Closure structure for apply_as_range + * @base: struct pfn_range_apply we derive from + * @start: Address of first modified pte + * @end: Address of last modified pte + 1 + * @total: Total number of modified ptes + * @vma: Pointer to the struct vm_area_struct we're currently operating on + */ +struct apply_as { + struct pfn_range_apply base; + unsigned long start; + unsigned long end; + unsigned long total; + struct vm_area_struct *vma; +}; + +/** + * apply_pt_wrprotect - Leaf pte callback to write-protect a pte + * @pte: Pointer to the pte + * @token: Page table token, see apply_to_pfn_range() + * @addr: The virtual 
page address + * @closure: Pointer to a struct pfn_range_apply embedded in a + * struct apply_as + * + * The function write-protects a pte and records the range in + * virtual address space of touched ptes for efficient range TLB flushes. + * + * Return: Always zero. + */ +static int apply_pt_wrprotect(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *closure) +{ + struct apply_as *aas = container_of(closure, typeof(*aas), base); + pte_t ptent = *pte; + + if (pte_write(ptent)) { + pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte); + + ptent = pte_wrprotect(old_pte); + ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent); + aas->total++; + aas->start = min(aas->start, addr); + aas->end = max(aas->end, addr + PAGE_SIZE); + } + + return 0; +} + +/** + * struct apply_as_clean - Closure structure for apply_as_clean + * @base: struct apply_as we derive from + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap + * @bitmap: Bitmap with one bit for each page offset in the address_space range + * covered. + * @start: Address_space page offset of first modified pte relative + * to @bitmap_pgoff + * @end: Address_space page offset of last modified pte relative + * to @bitmap_pgoff + */ +struct apply_as_clean { + struct apply_as base; + pgoff_t bitmap_pgoff; + unsigned long *bitmap; + pgoff_t start; + pgoff_t end; +}; + +/** + * apply_pt_clean - Leaf pte callback to clean a pte + * @pte: Pointer to the pte + * @token: Page table token, see apply_to_pfn_range() + * @addr: The virtual page address + * @closure: Pointer to a struct pfn_range_apply embedded in a + * struct apply_as_clean + * + * The function cleans a pte and records the range in + * virtual address space of touched ptes for efficient TLB flushes. + * It also records dirty ptes in a bitmap representing page offsets + * in the address_space, as well as the first and last of the bits + * touched. + * + * Return: Always zero. + */ +static int apply_pt_clean(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *closure) +{ + struct apply_as *aas = container_of(closure, typeof(*aas), base); + struct apply_as_clean *clean = container_of(aas, typeof(*clean), base); + pte_t ptent = *pte; + + if (pte_dirty(ptent)) { + pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) + + aas->vma->vm_pgoff - clean->bitmap_pgoff; + pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte); + + ptent = pte_mkclean(old_pte); + ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent); + + aas->total++; + aas->start = min(aas->start, addr); + aas->end = max(aas->end, addr + PAGE_SIZE); + + __set_bit(pgoff, clean->bitmap); + clean->start = min(clean->start, pgoff); + clean->end = max(clean->end, pgoff + 1); + } + + return 0; +} + +/** + * apply_as_range - Apply a pte callback to all PTEs pointing into a range + * of an address_space. + * @mapping: Pointer to the struct address_space + * @aas: Closure structure + * @first_index: First page offset in the address_space + * @nr: Number of incremental page offsets to cover + * + * Return: Number of ptes touched. 
Note that this number might be larger + * than @nr if there are overlapping vmas + */ +static unsigned long apply_as_range(struct address_space *mapping, + struct apply_as *aas, + pgoff_t first_index, pgoff_t nr) +{ + struct vm_area_struct *vma; + pgoff_t vba, vea, cba, cea; + unsigned long start_addr, end_addr; + struct mmu_notifier_range range; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, + first_index + nr - 1) { + unsigned long vm_flags = READ_ONCE(vma->vm_flags); + + /* + * We can only do advisory flag tests below, since we can't + * require the vm's mmap_sem to be held to protect the flags. + * Therefore, callers that strictly depend on specific mmap + * flags to remain constant throughout the operation must + * either ensure those flags are immutable for all relevant + * vmas or can't use this function. Fixing this properly would + * require the vma::vm_flags to be protected by a separate + * lock taken after the i_mmap_lock + */ + + /* Skip non-applicable VMAs */ + if ((vm_flags & (VM_SHARED | VM_WRITE)) != + (VM_SHARED | VM_WRITE)) + continue; + + /* Warn on and skip VMAs whose flags indicate illegal usage */ + if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO)) + continue; + + /* Clip to the vma */ + vba = vma->vm_pgoff; + vea = vba + vma_pages(vma); + cba = first_index; + cba = max(cba, vba); + cea = first_index + nr; + cea = min(cea, vea); + + /* Translate to virtual address */ + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; + if (start_addr >= end_addr) + continue; + + aas->base.mm = vma->vm_mm; + aas->vma = vma; + aas->start = end_addr; + aas->end = start_addr; + + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, + vma, vma->vm_mm, start_addr, end_addr); + mmu_notifier_invalidate_range_start(&range); + + /* Needed when we only change protection? */ + flush_cache_range(vma, start_addr, end_addr); + + /* + * We're not using tlb_gather_mmu() since typically + * only a small subrange of PTEs are affected. + */ + inc_tlb_flush_pending(vma->vm_mm); + + /* Should not error since aas->base.alloc == 0 */ + WARN_ON(apply_to_pfn_range(&aas->base, start_addr, + end_addr - start_addr)); + if (aas->end > aas->start) + flush_tlb_range(vma, aas->start, aas->end); + + mmu_notifier_invalidate_range_end(&range); + dec_tlb_flush_pending(vma->vm_mm); + } + i_mmap_unlock_read(mapping); + + return aas->total; +} + +/** + * apply_as_wrprotect - Write-protect all ptes in an address_space range + * @mapping: The address_space we want to write protect + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * + * WARNING: This function should only be used for address spaces whose + * vmas are marked VM_IO and that do not contain huge pages. + * To avoid interference with COW'd pages, vmas not marked VM_SHARED are + * simply skipped. + * + * Return: The number of ptes actually write-protected. Note that + * already write-protected ptes are not counted. 
+ */ +unsigned long apply_as_wrprotect(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr) +{ + struct apply_as aas = { + .base = { + .alloc = 0, + .ptefn = apply_pt_wrprotect, + }, + .total = 0, + }; + + return apply_as_range(mapping, &aas, first_index, nr); +} +EXPORT_SYMBOL_GPL(apply_as_wrprotect); + +/** + * apply_as_clean - Clean all ptes in an address_space range + * @mapping: The address_space we want to clean + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * @bitmap_pgoff: The page offset of the first bit in @bitmap + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to + * cover the whole range @first_index..@first_index + @nr. + * @start: Pointer to number of the first set bit in @bitmap. + * is modified as new bits are set by the function. + * @end: Pointer to the number of the last set bit in @bitmap. + * none set. The value is modified as new bits are set by the function. + * + * Note: When this function returns there is no guarantee that a CPU has + * not already dirtied new ptes. However it will not clean any ptes not + * reported in the bitmap. + * + * If a caller needs to make sure all dirty ptes are picked up and none + * additional are added, it first needs to write-protect the address-space + * range and make sure new writers are blocked in page_mkwrite() or + * pfn_mkwrite(). And then after a TLB flush following the write-protection + * pick up all dirty bits. + * + * WARNING: This function should only be used for address spaces whose + * vmas are marked VM_IO and that do not contain huge pages. + * To avoid interference with COW'd pages, vmas not marked VM_SHARED are + * simply skipped. + * + * Return: The number of dirty ptes actually cleaned. + */ +unsigned long apply_as_clean(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end) +{ + bool none_set = (*start >= *end); + struct apply_as_clean clean = { + .base = { + .base = { + .alloc = 0, + .ptefn = apply_pt_clean, + }, + .total = 0, + }, + .bitmap_pgoff = bitmap_pgoff, + .bitmap = bitmap, + .start = none_set ? nr : *start, + .end = none_set ? 0 : *end, + }; + unsigned long ret = apply_as_range(mapping, &clean.base, first_index, + nr); + + *start = clean.start; + *end = clean.end; + return ret; +} +EXPORT_SYMBOL_GPL(apply_as_clean); -- cgit v1.2.3-59-g8ed1b From 378a60406415bd20ec6e845a3d6883d460656537 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 May 2019 11:17:22 -0300 Subject: mm/hmm: Remove duplicate condition test before wait_event_timeout The wait_event_timeout macro already tests the condition as its first action, so there is no reason to open code another version of this, all that does is skip the might_sleep() debugging in common cases, which is not helpful. Further, based on prior patches, we can now simplify the required condition test: - If range is valid memory then so is range->hmm - If hmm_release() has run then range->valid is set to false at the same time as dead, so no reason to check both. - A valid hmm has a valid hmm->mm. Allowing the return value of wait_event_timeout() (along with its internal barriers) to compute the result of the function. 
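For reference, the pattern relied on here, sketched as a generic helper (the wrapper and its arguments are illustrative): wait_event_timeout() evaluates the condition before sleeping, and its return value is nonzero exactly when the condition ended up true.

#include <linux/jiffies.h>
#include <linux/wait.h>

/* forward wait_event_timeout()'s own condition evaluation as the result */
static bool example_wait_for_flag(wait_queue_head_t *wq, bool *flag,
                                  unsigned long timeout_ms)
{
        return wait_event_timeout(*wq, READ_ONCE(*flag),
                                  msecs_to_jiffies(timeout_ms)) != 0;
}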
Signed-off-by: Jason Gunthorpe Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Reviewed-by: Ira Weiny Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 1d97b6d62c5b..26e7c477490c 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -209,17 +209,8 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) static inline bool hmm_range_wait_until_valid(struct hmm_range *range, unsigned long timeout) { - /* Check if mm is dead ? */ - if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { - range->valid = false; - return false; - } - if (range->valid) - return true; - wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, - msecs_to_jiffies(timeout)); - /* Return current valid status just in case we get lucky */ - return range->valid; + return wait_event_timeout(range->hmm->wq, range->valid, + msecs_to_jiffies(timeout)) != 0; } /* -- cgit v1.2.3-59-g8ed1b From 47f245985a4f3e270b1e4f28aa49f4c939527981 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 May 2019 11:08:28 -0300 Subject: mm/hmm: Hold on to the mmget for the lifetime of the range Range functions like hmm_range_snapshot() and hmm_range_fault() call find_vma, which requires holding the mmget() and the mmap_sem for the mm. Make this simpler for the callers by holding the mmget() inside the range for the lifetime of the range. Other functions that accept a range should only be called if the range is registered. This has the side effect of directly preventing hmm_release() from happening while a range is registered. That means hmm->dead can never become true during the lifetime of the range, so remove dead and hmm_mirror_mm_is_alive() entirely. Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Reviewed-by: Ralph Campbell Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 26 -------------------------- mm/hmm.c | 32 +++++++++++--------------------- 2 files changed, 11 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 26e7c477490c..bf013e965257 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -82,7 +82,6 @@ * @mirrors_sem: read/write semaphore protecting the mirrors list * @wq: wait queue for user waiting on a range invalidation * @notifiers: count of active mmu notifiers - * @dead: is the mm dead ? */ struct hmm { struct mm_struct *mm; @@ -95,7 +94,6 @@ struct hmm { wait_queue_head_t wq; struct rcu_head rcu; long notifiers; - bool dead; }; /* @@ -459,30 +457,6 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); -/* - * hmm_mirror_mm_is_alive() - test if mm is still alive - * @mirror: the HMM mm mirror for which we want to lock the mmap_sem - * Return: false if the mm is dead, true otherwise - * - * This is an optimization, it will not always accurately return false if the - * mm is dead; i.e., there can be false negatives (process is being killed but - * HMM is not yet informed of that). It is only intended to be used to optimize - * out cases where the driver is about to do something time consuming and it - * would be better to skip it if the mm is dead.
- */ -static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) -{ - struct mm_struct *mm; - - if (!mirror || !mirror->hmm) - return false; - mm = READ_ONCE(mirror->hmm->mm); - if (mirror->hmm->dead || !mm) - return false; - - return true; -} - /* * Please see Documentation/vm/hmm.rst for how to use the range API. */ diff --git a/mm/hmm.c b/mm/hmm.c index 73c8af4827fe..1eddda45cefa 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -67,7 +67,6 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) mutex_init(&hmm->lock); kref_init(&hmm->kref); hmm->notifiers = 0; - hmm->dead = false; hmm->mm = mm; hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; @@ -120,21 +119,16 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - struct hmm_range *range; /* Bail out if hmm is in the process of being freed */ if (!kref_get_unless_zero(&hmm->kref)) return; - /* Report this HMM as dying. */ - hmm->dead = true; - - /* Wake-up everyone waiting on any range. */ - mutex_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) - range->valid = false; - wake_up_all(&hmm->wq); - mutex_unlock(&hmm->lock); + /* + * Since hmm_range_register() holds the mmget() lock hmm_release() is + * prevented as long as a range exists. + */ + WARN_ON(!list_empty_careful(&hmm->ranges)); down_write(&hmm->mirrors_sem); mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, @@ -903,8 +897,8 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) + /* Prevent hmm_release() from running while the range is valid */ + if (!mmget_not_zero(hmm->mm)) return -EFAULT; /* Initialize range to track CPU page table updates. */ @@ -942,11 +936,12 @@ void hmm_range_unregister(struct hmm_range *range) return; mutex_lock(&hmm->lock); - list_del(&range->list); + list_del_init(&range->list); mutex_unlock(&hmm->lock); /* Drop reference taken by hmm_range_register() */ range->valid = false; + mmput(hmm->mm); hmm_put(hmm); range->hmm = NULL; } @@ -974,10 +969,7 @@ long hmm_range_snapshot(struct hmm_range *range) struct vm_area_struct *vma; struct mm_walk mm_walk; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; - + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. */ if (!range->valid) @@ -1072,9 +1064,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) struct mm_walk mm_walk; int ret; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. */ -- cgit v1.2.3-59-g8ed1b From a78cf9657ba5426f54aa93a067c10d097944c082 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 15 Jun 2019 10:23:57 +1000 Subject: PCI/ACPI: Evaluate PCI Boot Configuration _DSM Evaluate _DSM Function #5, the "PCI Boot Configuration" function. If the result is 0, the OS should preserve any resource assignments made by the firmware. 
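A minimal sketch of that check (the wrapper name is invented; the real hunk added to pci_root.c follows in the diff below):

#include <linux/acpi.h>
#include <linux/pci-acpi.h>

static bool example_preserve_boot_config(acpi_handle handle)
{
        union acpi_object *obj;
        bool preserve;

        /* revision 1, function 5: "PCI Boot Configuration" */
        obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 1,
                                IGNORE_PCI_BOOT_CONFIG_DSM, NULL);
        preserve = obj && obj->type == ACPI_TYPE_INTEGER &&
                   obj->integer.value == 0;
        ACPI_FREE(obj);
        return preserve;
}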
Link: https://lore.kernel.org/r/20190615002359.29577-2-benh@kernel.crashing.org Signed-off-by: Benjamin Herrenschmidt [bhelgaas: commit log] Signed-off-by: Bjorn Helgaas --- drivers/acpi/pci_root.c | 12 ++++++++++++ include/linux/pci-acpi.h | 7 ++++--- include/linux/pci.h | 2 ++ 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c index c36781a9b493..0d57f817ef1e 100644 --- a/drivers/acpi/pci_root.c +++ b/drivers/acpi/pci_root.c @@ -894,6 +894,7 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, int node = acpi_get_node(device->handle); struct pci_bus *bus; struct pci_host_bridge *host_bridge; + union acpi_object *obj; info->root = root; info->bridge = device; @@ -930,6 +931,17 @@ struct pci_bus *acpi_pci_root_create(struct acpi_pci_root *root, if (!(root->osc_control_set & OSC_PCI_EXPRESS_LTR_CONTROL)) host_bridge->native_ltr = 0; + /* + * Evaluate the "PCI Boot Configuration" _DSM Function. If it + * exists and returns 0, we must preserve any PCI resource + * assignments made by firmware for this host bridge. + */ + obj = acpi_evaluate_dsm(ACPI_HANDLE(bus->bridge), &pci_acpi_dsm_guid, 1, + IGNORE_PCI_BOOT_CONFIG_DSM, NULL); + if (obj && obj->type == ACPI_TYPE_INTEGER && obj->integer.value == 0) + host_bridge->preserve_config = 1; + ACPI_FREE(obj); + pci_scan_child_bus(bus); pci_set_host_bridge_release(host_bridge, acpi_pci_root_release_info, info); diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 8082b612f561..62b7fdcc661c 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -107,9 +107,10 @@ static inline void acpiphp_check_host_bridge(struct acpi_device *adev) { } #endif extern const guid_t pci_acpi_dsm_guid; -#define DEVICE_LABEL_DSM 0x07 -#define RESET_DELAY_DSM 0x08 -#define FUNCTION_DELAY_DSM 0x09 +#define IGNORE_PCI_BOOT_CONFIG_DSM 0x05 +#define DEVICE_LABEL_DSM 0x07 +#define RESET_DELAY_DSM 0x08 +#define FUNCTION_DELAY_DSM 0x09 #else /* CONFIG_ACPI */ static inline void acpi_pci_add_bus(struct pci_bus *bus) { } diff --git a/include/linux/pci.h b/include/linux/pci.h index 4a5a84d7bdd4..5e2b309363a3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -505,6 +505,8 @@ struct pci_host_bridge { unsigned int native_shpc_hotplug:1; /* OS may use SHPC hotplug */ unsigned int native_pme:1; /* OS may use PCIe PME */ unsigned int native_ltr:1; /* OS may use PCIe LTR */ + unsigned int preserve_config:1; /* Preserve FW resource setup */ + /* Resource alignment requirements */ resource_size_t (*align_resource)(struct pci_dev *dev, const struct resource *res, -- cgit v1.2.3-59-g8ed1b From d308dfbf62eff897d71968d764f21a78678ee0a5 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 18 Jun 2019 12:58:33 +0200 Subject: i2c: mux/i801: Switch to use descriptor passing This switches the i801 GPIO mux to use GPIO descriptors for handling the GPIO lines. The previous hack which was reaching inside the GPIO chips etc cannot live on. We pass descriptors along with the GPIO mux device at creation instead. The GPIO mux was only used by way of platform data with a platform device from one place in the kernel: the i801 i2c bus driver. Let's just associate the GPIO descriptor table with the actual device like everyone else and dynamically create a descriptor table passed along with the GPIO i2c mux. This enables simplification of the GPIO i2c mux driver to use only the descriptor API and the OF probe path gets simplified in the process. 
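For illustration, the equivalent static lookup table a board file might register; the chip label, pin numbers, and helper name are invented here, while the i801 hunk below builds such a table at runtime:

#include <linux/gpio/machine.h>

static struct gpiod_lookup_table example_mux_gpios = {
        .dev_id = "i2c-mux-gpio",       /* matches PLATFORM_DEVID_NONE naming */
        .table = {
                GPIO_LOOKUP_IDX("gpio_ich", 1, "mux", 0, GPIO_ACTIVE_HIGH),
                GPIO_LOOKUP_IDX("gpio_ich", 2, "mux", 1, GPIO_ACTIVE_HIGH),
                { }
        },
};

/* board code, run before the mux platform device is registered */
static void example_register_mux_gpios(void)
{
        gpiod_add_lookup_table(&example_mux_gpios);
}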
The i801 driver was registering the GPIO i2c mux with PLATFORM_DEVID_AUTO which would make it hard to predict the device name and assign the descriptor table properly, but this seems to be a mistake to begin with: all of the GPIO mux devices are hardcoded to look up GPIO lines from the "gpio_ich" GPIO chip. If there are more than one mux, there is certainly more than one gpio chip as well, and then we have more serious problems. Switch to PLATFORM_DEVID_NONE instead. There can be only one. Cc: Mika Westerberg Cc: Andy Shevchenko Cc: Peter Rosin Cc: Jean Delvare Signed-off-by: Serge Semin Signed-off-by: Linus Walleij Reviewed-by: Andy Shevchenko Reviewed-by: Mika Westerberg [Removed a newline, suggested by Andy. /Peter] Signed-off-by: Peter Rosin --- drivers/i2c/busses/i2c-i801.c | 37 +++++++-- drivers/i2c/muxes/i2c-mux-gpio.c | 116 ++++++++--------------------- include/linux/platform_data/i2c-mux-gpio.h | 7 -- 3 files changed, 60 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index 679c6c41f64b..bf484cd775ec 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -107,7 +107,7 @@ #include #if IS_ENABLED(CONFIG_I2C_MUX_GPIO) && defined CONFIG_DMI -#include +#include #include #endif @@ -274,6 +274,7 @@ struct i801_priv { #if IS_ENABLED(CONFIG_I2C_MUX_GPIO) && defined CONFIG_DMI const struct i801_mux_config *mux_drvdata; struct platform_device *mux_pdev; + struct gpiod_lookup_table *lookup; #endif struct platform_device *tco_pdev; @@ -1258,7 +1259,8 @@ static int i801_add_mux(struct i801_priv *priv) struct device *dev = &priv->adapter.dev; const struct i801_mux_config *mux_config; struct i2c_mux_gpio_platform_data gpio_data; - int err; + struct gpiod_lookup_table *lookup; + int err, i; if (!priv->mux_drvdata) return 0; @@ -1270,17 +1272,36 @@ static int i801_add_mux(struct i801_priv *priv) gpio_data.values = mux_config->values; gpio_data.n_values = mux_config->n_values; gpio_data.classes = mux_config->classes; - gpio_data.gpio_chip = mux_config->gpio_chip; - gpio_data.gpios = mux_config->gpios; - gpio_data.n_gpios = mux_config->n_gpios; gpio_data.idle = I2C_MUX_GPIO_NO_IDLE; - /* Register the mux device */ + /* Register GPIO descriptor lookup table */ + lookup = devm_kzalloc(dev, + struct_size(lookup, table, mux_config->n_gpios), + GFP_KERNEL); + if (!lookup) + return -ENOMEM; + lookup->dev_id = "i2c-mux-gpio"; + for (i = 0; i < mux_config->n_gpios; i++) { + lookup->table[i].chip_label = mux_config->gpio_chip; + lookup->table[i].chip_hwnum = mux_config->gpios[i]; + lookup->table[i].con_id = "mux"; + } + gpiod_add_lookup_table(lookup); + priv->lookup = lookup; + + /* + * Register the mux device, we use PLATFORM_DEVID_NONE here + * because since we are referring to the GPIO chip by name we are + * anyways in deep trouble if there is more than one of these + * devices, and there should likely only be one platform controller + * hub. 
+ */ priv->mux_pdev = platform_device_register_data(dev, "i2c-mux-gpio", - PLATFORM_DEVID_AUTO, &gpio_data, + PLATFORM_DEVID_NONE, &gpio_data, sizeof(struct i2c_mux_gpio_platform_data)); if (IS_ERR(priv->mux_pdev)) { err = PTR_ERR(priv->mux_pdev); + gpiod_remove_lookup_table(lookup); priv->mux_pdev = NULL; dev_err(dev, "Failed to register i2c-mux-gpio device\n"); return err; @@ -1293,6 +1314,8 @@ static void i801_del_mux(struct i801_priv *priv) { if (priv->mux_pdev) platform_device_unregister(priv->mux_pdev); + if (priv->lookup) + gpiod_remove_lookup_table(priv->lookup); } static unsigned int i801_get_adapter_class(struct i801_priv *priv) diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c index 13882a2a4f60..fd482feafb19 100644 --- a/drivers/i2c/muxes/i2c-mux-gpio.c +++ b/drivers/i2c/muxes/i2c-mux-gpio.c @@ -14,13 +14,14 @@ #include #include #include -#include +#include +#include +/* FIXME: stop poking around inside gpiolib */ #include "../../gpio/gpiolib.h" -#include struct gpiomux { struct i2c_mux_gpio_platform_data data; - unsigned gpio_base; + int ngpios; struct gpio_desc **gpios; }; @@ -30,8 +31,7 @@ static void i2c_mux_gpio_set(const struct gpiomux *mux, unsigned val) values[0] = val; - gpiod_set_array_value_cansleep(mux->data.n_gpios, mux->gpios, NULL, - values); + gpiod_set_array_value_cansleep(mux->ngpios, mux->gpios, NULL, values); } static int i2c_mux_gpio_select(struct i2c_mux_core *muxc, u32 chan) @@ -52,12 +52,6 @@ static int i2c_mux_gpio_deselect(struct i2c_mux_core *muxc, u32 chan) return 0; } -static int match_gpio_chip_by_label(struct gpio_chip *chip, - void *data) -{ - return !strcmp(chip->label, data); -} - #ifdef CONFIG_OF static int i2c_mux_gpio_probe_dt(struct gpiomux *mux, struct platform_device *pdev) @@ -65,8 +59,8 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux, struct device_node *np = pdev->dev.of_node; struct device_node *adapter_np, *child; struct i2c_adapter *adapter; - unsigned *values, *gpios; - int i = 0, ret; + unsigned *values; + int i = 0; if (!np) return -ENODEV; @@ -103,29 +97,6 @@ static int i2c_mux_gpio_probe_dt(struct gpiomux *mux, if (of_property_read_u32(np, "idle-state", &mux->data.idle)) mux->data.idle = I2C_MUX_GPIO_NO_IDLE; - mux->data.n_gpios = of_gpio_named_count(np, "mux-gpios"); - if (mux->data.n_gpios < 0) { - dev_err(&pdev->dev, "Missing mux-gpios property in the DT.\n"); - return -EINVAL; - } - - gpios = devm_kcalloc(&pdev->dev, - mux->data.n_gpios, sizeof(*mux->data.gpios), - GFP_KERNEL); - if (!gpios) { - dev_err(&pdev->dev, "Cannot allocate gpios array"); - return -ENOMEM; - } - - for (i = 0; i < mux->data.n_gpios; i++) { - ret = of_get_named_gpio(np, "mux-gpios", i); - if (ret < 0) - return ret; - gpios[i] = ret; - } - - mux->data.gpios = gpios; - return 0; } #else @@ -142,8 +113,8 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) struct gpiomux *mux; struct i2c_adapter *parent; struct i2c_adapter *root; - unsigned initial_state, gpio_base; - int i, ret; + unsigned initial_state; + int i, ngpios, ret; mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL); if (!mux) @@ -158,29 +129,19 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) sizeof(mux->data)); } - /* - * If a GPIO chip name is provided, the GPIO pin numbers provided are - * relative to its base GPIO number. Otherwise they are absolute. 
- */ - if (mux->data.gpio_chip) { - struct gpio_chip *gpio; - - gpio = gpiochip_find(mux->data.gpio_chip, - match_gpio_chip_by_label); - if (!gpio) - return -EPROBE_DEFER; - - gpio_base = gpio->base; - } else { - gpio_base = 0; + ngpios = gpiod_count(&pdev->dev, "mux"); + if (ngpios <= 0) { + dev_err(&pdev->dev, "no valid gpios provided\n"); + return ngpios ?: -EINVAL; } + mux->ngpios = ngpios; parent = i2c_get_adapter(mux->data.parent); if (!parent) return -EPROBE_DEFER; muxc = i2c_mux_alloc(parent, &pdev->dev, mux->data.n_values, - mux->data.n_gpios * sizeof(*mux->gpios), 0, + ngpios * sizeof(*mux->gpios), 0, i2c_mux_gpio_select, NULL); if (!muxc) { ret = -ENOMEM; @@ -194,7 +155,6 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) root = i2c_root_adapter(&parent->dev); muxc->mux_locked = true; - mux->gpio_base = gpio_base; if (mux->data.idle != I2C_MUX_GPIO_NO_IDLE) { initial_state = mux->data.idle; @@ -203,34 +163,28 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) initial_state = mux->data.values[0]; } - for (i = 0; i < mux->data.n_gpios; i++) { + for (i = 0; i < ngpios; i++) { struct device *gpio_dev; - struct gpio_desc *gpio_desc; - - ret = gpio_request(gpio_base + mux->data.gpios[i], "i2c-mux-gpio"); - if (ret) { - dev_err(&pdev->dev, "Failed to request GPIO %d\n", - mux->data.gpios[i]); - goto err_request_gpio; + struct gpio_desc *gpiod; + enum gpiod_flags flag; + + if (initial_state & BIT(i)) + flag = GPIOD_OUT_HIGH; + else + flag = GPIOD_OUT_LOW; + gpiod = devm_gpiod_get_index(&pdev->dev, "mux", i, flag); + if (IS_ERR(gpiod)) { + ret = PTR_ERR(gpiod); + goto alloc_failed; } - ret = gpio_direction_output(gpio_base + mux->data.gpios[i], - initial_state & (1 << i)); - if (ret) { - dev_err(&pdev->dev, - "Failed to set direction of GPIO %d to output\n", - mux->data.gpios[i]); - i++; /* gpio_request above succeeded, so must free */ - goto err_request_gpio; - } - - gpio_desc = gpio_to_desc(gpio_base + mux->data.gpios[i]); - mux->gpios[i] = gpio_desc; + mux->gpios[i] = gpiod; if (!muxc->mux_locked) continue; - gpio_dev = &gpio_desc->gdev->dev; + /* FIXME: find a proper way to access the GPIO device */ + gpio_dev = &gpiod->gdev->dev; muxc->mux_locked = i2c_root_adapter(gpio_dev) == root; } @@ -253,10 +207,6 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev) add_adapter_failed: i2c_mux_del_adapters(muxc); - i = mux->data.n_gpios; -err_request_gpio: - for (; i > 0; i--) - gpio_free(gpio_base + mux->data.gpios[i - 1]); alloc_failed: i2c_put_adapter(parent); @@ -266,14 +216,8 @@ alloc_failed: static int i2c_mux_gpio_remove(struct platform_device *pdev) { struct i2c_mux_core *muxc = platform_get_drvdata(pdev); - struct gpiomux *mux = i2c_mux_priv(muxc); - int i; i2c_mux_del_adapters(muxc); - - for (i = 0; i < mux->data.n_gpios; i++) - gpio_free(mux->gpio_base + mux->data.gpios[i]); - i2c_put_adapter(muxc->parent); return 0; diff --git a/include/linux/platform_data/i2c-mux-gpio.h b/include/linux/platform_data/i2c-mux-gpio.h index 4406108201fe..28f288eed652 100644 --- a/include/linux/platform_data/i2c-mux-gpio.h +++ b/include/linux/platform_data/i2c-mux-gpio.h @@ -22,10 +22,6 @@ * position * @n_values: Number of multiplexer positions (busses to instantiate) * @classes: Optional I2C auto-detection classes - * @gpio_chip: Optional GPIO chip name; if set, GPIO pin numbers are given - * relative to the base GPIO number of that chip - * @gpios: Array of GPIO numbers used to control MUX - * @n_gpios: Number of GPIOs used to control MUX * @idle: Bitmask to write to 
MUX when idle or GPIO_I2CMUX_NO_IDLE if not used */ struct i2c_mux_gpio_platform_data { @@ -34,9 +30,6 @@ struct i2c_mux_gpio_platform_data { const unsigned *values; int n_values; const unsigned *classes; - char *gpio_chip; - const unsigned *gpios; - int n_gpios; unsigned idle; }; -- cgit v1.2.3-59-g8ed1b From 62de37da9f382455b983f2f92b10012109005278 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 20 Jun 2019 15:26:29 +0300 Subject: mtd: spi-nor: intel-spi: Convert to use SPDX identifier This gets rid of the license boilerplate duplicated in each file. No functional changes intended. Signed-off-by: Mika Westerberg Signed-off-by: Tudor Ambarus --- drivers/mtd/spi-nor/intel-spi-pci.c | 5 +---- drivers/mtd/spi-nor/intel-spi-platform.c | 5 +---- drivers/mtd/spi-nor/intel-spi.c | 5 +---- drivers/mtd/spi-nor/intel-spi.h | 5 +---- include/linux/platform_data/intel-spi.h | 5 +---- 5 files changed, 5 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/spi-nor/intel-spi-pci.c b/drivers/mtd/spi-nor/intel-spi-pci.c index 578f0c74e536..1b9c2d99ba38 100644 --- a/drivers/mtd/spi-nor/intel-spi-pci.c +++ b/drivers/mtd/spi-nor/intel-spi-pci.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel PCH/PCU SPI flash PCI driver. * * Copyright (C) 2016, Intel Corporation * Author: Mika Westerberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/drivers/mtd/spi-nor/intel-spi-platform.c b/drivers/mtd/spi-nor/intel-spi-platform.c index 5c943df9398f..25b18804e9bb 100644 --- a/drivers/mtd/spi-nor/intel-spi-platform.c +++ b/drivers/mtd/spi-nor/intel-spi-platform.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel PCH/PCU SPI flash platform driver. * * Copyright (C) 2016, Intel Corporation * Author: Mika Westerberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/drivers/mtd/spi-nor/intel-spi.c b/drivers/mtd/spi-nor/intel-spi.c index d60cbf23d9aa..021cef930f9f 100644 --- a/drivers/mtd/spi-nor/intel-spi.c +++ b/drivers/mtd/spi-nor/intel-spi.c @@ -1,12 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Intel PCH/PCU SPI flash driver. * * Copyright (C) 2016, Intel Corporation * Author: Mika Westerberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #include diff --git a/drivers/mtd/spi-nor/intel-spi.h b/drivers/mtd/spi-nor/intel-spi.h index 5ab7dc250050..b03bf296fda3 100644 --- a/drivers/mtd/spi-nor/intel-spi.h +++ b/drivers/mtd/spi-nor/intel-spi.h @@ -1,12 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Intel PCH/PCU SPI flash driver. * * Copyright (C) 2016, Intel Corporation * Author: Mika Westerberg - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
 */
 
 #ifndef INTEL_SPI_H
diff --git a/include/linux/platform_data/intel-spi.h b/include/linux/platform_data/intel-spi.h
index 942b0c3f8f08..001f377fb5ef 100644
--- a/include/linux/platform_data/intel-spi.h
+++ b/include/linux/platform_data/intel-spi.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Intel PCH/PCU SPI flash driver.
  *
  * Copyright (C) 2016, Intel Corporation
  * Author: Mika Westerberg
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
  */
 
 #ifndef INTEL_SPI_PDATA_H
-- cgit v1.2.3-59-g8ed1b

From e67d4dfc9ff19dbe74b29617cf2592ccc50c3920 Mon Sep 17 00:00:00 2001
From: Andrey Smirnov
Date: Wed, 12 Jun 2019 01:44:04 -0700
Subject: power: supply: Add HWMON compatibility layer

Add code implementing a HWMON adapter/compatibility layer to allow
exposing various sensors present on power supply devices via the
HWMON subsystem. This is done in order to allow userspace to use a
single ABI/library (libsensors) to access/manipulate all of the
sensors of the system.

Signed-off-by: Andrey Smirnov
Reviewed-by: Guenter Roeck
Tested-by: Chris Healy
Cc: Chris Healy
Cc: Cory Tusar
Cc: Lucas Stach
Cc: Fabio Estevam
Cc: Guenter Roeck
Cc: Sebastian Reichel
Cc: linux-kernel@vger.kernel.org
Cc: linux-pm@vger.kernel.org
Signed-off-by: Sebastian Reichel
---
 drivers/power/supply/Kconfig              |  14 ++
 drivers/power/supply/Makefile             |   1 +
 drivers/power/supply/power_supply_core.c  |   7 +
 drivers/power/supply/power_supply_hwmon.c | 355 ++++++++++++++++++++++++++++++
 include/linux/power_supply.h              |  13 ++
 5 files changed, 390 insertions(+)
 create mode 100644 drivers/power/supply/power_supply_hwmon.c

(limited to 'include/linux')

diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index 26dacdab03cc..1f2252cb95fd 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -14,6 +14,20 @@ config POWER_SUPPLY_DEBUG
	  Say Y here to enable debugging messages for power supply class
	  and drivers.
 
+config POWER_SUPPLY_HWMON
+	bool
+	prompt "Expose power supply sensors as hwmon device"
+	depends on HWMON=y || HWMON=POWER_SUPPLY
+	default y
+	help
+	  This option enables an API that allows sensors found on a
+	  power supply device (current, voltage, temperature) to be
+	  exposed as a hwmon device.
+
+	  Say 'Y' here if you want power supplies to
+	  have a hwmon sysfs interface too.
+
+
 config PDA_POWER
	tristate "Generic PDA/phone power driver"
	depends on !S390
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index f208273f9686..c47e88ba16b9 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -6,6 +6,7 @@ power_supply-$(CONFIG_SYSFS)		+= power_supply_sysfs.o
 power_supply-$(CONFIG_LEDS_TRIGGERS)	+= power_supply_leds.o
 
 obj-$(CONFIG_POWER_SUPPLY)	+= power_supply.o
+obj-$(CONFIG_POWER_SUPPLY_HWMON) += power_supply_hwmon.o
 obj-$(CONFIG_GENERIC_ADC_BATTERY) += generic-adc-battery.o
 obj-$(CONFIG_PDA_POWER)		+= pda_power.o
diff --git a/drivers/power/supply/power_supply_core.c b/drivers/power/supply/power_supply_core.c
index f7033ecf6d0b..35624193a346 100644
--- a/drivers/power/supply/power_supply_core.c
+++ b/drivers/power/supply/power_supply_core.c
@@ -1072,6 +1072,10 @@ __power_supply_register(struct device *parent,
	if (rc)
		goto create_triggers_failed;
 
+	rc = power_supply_add_hwmon_sysfs(psy);
+	if (rc)
+		goto add_hwmon_sysfs_failed;
+
	/*
	 * Update use_cnt after any uevents (most notably from device_add()).
* We are here still during driver's probe but @@ -1090,6 +1094,8 @@ __power_supply_register(struct device *parent, return psy; +add_hwmon_sysfs_failed: + power_supply_remove_triggers(psy); create_triggers_failed: psy_unregister_cooler(psy); register_cooler_failed: @@ -1242,6 +1248,7 @@ void power_supply_unregister(struct power_supply *psy) cancel_work_sync(&psy->changed_work); cancel_delayed_work_sync(&psy->deferred_register_work); sysfs_remove_link(&psy->dev.kobj, "powers"); + power_supply_remove_hwmon_sysfs(psy); power_supply_remove_triggers(psy); psy_unregister_cooler(psy); psy_unregister_thermal(psy); diff --git a/drivers/power/supply/power_supply_hwmon.c b/drivers/power/supply/power_supply_hwmon.c new file mode 100644 index 000000000000..51fe60440d12 --- /dev/null +++ b/drivers/power/supply/power_supply_hwmon.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * power_supply_hwmon.c - power supply hwmon support. + */ + +#include +#include +#include +#include + +struct power_supply_hwmon { + struct power_supply *psy; + unsigned long *props; +}; + +static int power_supply_hwmon_in_to_property(u32 attr) +{ + switch (attr) { + case hwmon_in_average: + return POWER_SUPPLY_PROP_VOLTAGE_AVG; + case hwmon_in_min: + return POWER_SUPPLY_PROP_VOLTAGE_MIN; + case hwmon_in_max: + return POWER_SUPPLY_PROP_VOLTAGE_MAX; + case hwmon_in_input: + return POWER_SUPPLY_PROP_VOLTAGE_NOW; + default: + return -EINVAL; + } +} + +static int power_supply_hwmon_curr_to_property(u32 attr) +{ + switch (attr) { + case hwmon_curr_average: + return POWER_SUPPLY_PROP_CURRENT_AVG; + case hwmon_curr_max: + return POWER_SUPPLY_PROP_CURRENT_MAX; + case hwmon_curr_input: + return POWER_SUPPLY_PROP_CURRENT_NOW; + default: + return -EINVAL; + } +} + +static int power_supply_hwmon_temp_to_property(u32 attr, int channel) +{ + if (channel) { + switch (attr) { + case hwmon_temp_input: + return POWER_SUPPLY_PROP_TEMP_AMBIENT; + case hwmon_temp_min_alarm: + return POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN; + case hwmon_temp_max_alarm: + return POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX; + default: + break; + } + } else { + switch (attr) { + case hwmon_temp_input: + return POWER_SUPPLY_PROP_TEMP; + case hwmon_temp_max: + return POWER_SUPPLY_PROP_TEMP_MAX; + case hwmon_temp_min: + return POWER_SUPPLY_PROP_TEMP_MIN; + case hwmon_temp_min_alarm: + return POWER_SUPPLY_PROP_TEMP_ALERT_MIN; + case hwmon_temp_max_alarm: + return POWER_SUPPLY_PROP_TEMP_ALERT_MAX; + default: + break; + } + } + + return -EINVAL; +} + +static int +power_supply_hwmon_to_property(enum hwmon_sensor_types type, + u32 attr, int channel) +{ + switch (type) { + case hwmon_in: + return power_supply_hwmon_in_to_property(attr); + case hwmon_curr: + return power_supply_hwmon_curr_to_property(attr); + case hwmon_temp: + return power_supply_hwmon_temp_to_property(attr, channel); + default: + return -EINVAL; + } +} + +static bool power_supply_hwmon_is_a_label(enum hwmon_sensor_types type, + u32 attr) +{ + return type == hwmon_temp && attr == hwmon_temp_label; +} + +static bool power_supply_hwmon_is_writable(enum hwmon_sensor_types type, + u32 attr) +{ + switch (type) { + case hwmon_in: + return attr == hwmon_in_min || + attr == hwmon_in_max; + case hwmon_curr: + return attr == hwmon_curr_max; + case hwmon_temp: + return attr == hwmon_temp_max || + attr == hwmon_temp_min || + attr == hwmon_temp_min_alarm || + attr == hwmon_temp_max_alarm; + default: + return false; + } +} + +static umode_t power_supply_hwmon_is_visible(const void *data, + enum hwmon_sensor_types 
type,
+					     u32 attr, int channel)
+{
+	const struct power_supply_hwmon *psyhw = data;
+	int prop;
+
+	if (power_supply_hwmon_is_a_label(type, attr))
+		return 0444;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0 || !test_bit(prop, psyhw->props))
+		return 0;
+
+	if (power_supply_property_is_writeable(psyhw->psy, prop) > 0 &&
+	    power_supply_hwmon_is_writable(type, attr))
+		return 0644;
+
+	return 0444;
+}
+
+static int power_supply_hwmon_read_string(struct device *dev,
+					  enum hwmon_sensor_types type,
+					  u32 attr, int channel,
+					  const char **str)
+{
+	*str = channel ? "temp ambient" : "temp";
+	return 0;
+}
+
+static int
+power_supply_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+			u32 attr, int channel, long *val)
+{
+	struct power_supply_hwmon *psyhw = dev_get_drvdata(dev);
+	struct power_supply *psy = psyhw->psy;
+	union power_supply_propval pspval;
+	int ret, prop;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0)
+		return prop;
+
+	ret = power_supply_get_property(psy, prop, &pspval);
+	if (ret)
+		return ret;
+
+	switch (type) {
+	/*
+	 * Both voltage and current are reported in units of
+	 * microvolts/microamps, so we need to adjust them to
+	 * millivolts/milliamps.
+	 */
+	case hwmon_curr:
+	case hwmon_in:
+		pspval.intval = DIV_ROUND_CLOSEST(pspval.intval, 1000);
+		break;
+	/*
+	 * Temp needs to be converted from 1/10 C to milli-C
+	 */
+	case hwmon_temp:
+		if (check_mul_overflow(pspval.intval, 100,
+				       &pspval.intval))
+			return -EOVERFLOW;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*val = pspval.intval;
+
+	return 0;
+}
+
+static int
+power_supply_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+			 u32 attr, int channel, long val)
+{
+	struct power_supply_hwmon *psyhw = dev_get_drvdata(dev);
+	struct power_supply *psy = psyhw->psy;
+	union power_supply_propval pspval;
+	int prop;
+
+	prop = power_supply_hwmon_to_property(type, attr, channel);
+	if (prop < 0)
+		return prop;
+
+	pspval.intval = val;
+
+	switch (type) {
+	/*
+	 * Both voltage and current are given in units of
+	 * millivolts/milliamps, so we need to adjust them to
+	 * microvolts/microamps.
+	 */
+	case hwmon_curr:
+	case hwmon_in:
+		if (check_mul_overflow(pspval.intval, 1000,
+				       &pspval.intval))
+			return -EOVERFLOW;
+		break;
+	/*
+	 * Temp needs to be converted from milli-C to 1/10 C
+	 */
+	case hwmon_temp:
+		pspval.intval = DIV_ROUND_CLOSEST(pspval.intval, 100);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return power_supply_set_property(psy, prop, &pspval);
+}
+
+static const struct hwmon_ops power_supply_hwmon_ops = {
+	.is_visible	= power_supply_hwmon_is_visible,
+	.read		= power_supply_hwmon_read,
+	.write		= power_supply_hwmon_write,
+	.read_string	= power_supply_hwmon_read_string,
+};
+
+static const struct hwmon_channel_info *power_supply_hwmon_info[] = {
+	HWMON_CHANNEL_INFO(temp,
+			   HWMON_T_LABEL     |
+			   HWMON_T_INPUT     |
+			   HWMON_T_MAX       |
+			   HWMON_T_MIN       |
+			   HWMON_T_MIN_ALARM |
+			   HWMON_T_MAX_ALARM,
+
+			   HWMON_T_LABEL |
+			   HWMON_T_INPUT |
+			   HWMON_T_MIN_ALARM |
+			   HWMON_T_MAX_ALARM),
+
+	HWMON_CHANNEL_INFO(curr,
+			   HWMON_C_AVERAGE |
+			   HWMON_C_MAX |
+			   HWMON_C_INPUT),
+
+	HWMON_CHANNEL_INFO(in,
+			   HWMON_I_AVERAGE |
+			   HWMON_I_MIN |
+			   HWMON_I_MAX |
+			   HWMON_I_INPUT),
+	NULL
+};
+
+static const struct hwmon_chip_info power_supply_hwmon_chip_info = {
+	.ops = &power_supply_hwmon_ops,
+	.info = power_supply_hwmon_info,
+};
+
+static void power_supply_hwmon_bitmap_free(void *data)
+{
+	bitmap_free(data);
+}
+
+int power_supply_add_hwmon_sysfs(struct
power_supply *psy) +{ + const struct power_supply_desc *desc = psy->desc; + struct power_supply_hwmon *psyhw; + struct device *dev = &psy->dev; + struct device *hwmon; + int ret, i; + + if (!devres_open_group(dev, power_supply_add_hwmon_sysfs, + GFP_KERNEL)) + return -ENOMEM; + + psyhw = devm_kzalloc(dev, sizeof(*psyhw), GFP_KERNEL); + if (!psyhw) { + ret = -ENOMEM; + goto error; + } + + psyhw->psy = psy; + psyhw->props = bitmap_zalloc(POWER_SUPPLY_PROP_TIME_TO_FULL_AVG + 1, + GFP_KERNEL); + if (!psyhw->props) { + ret = -ENOMEM; + goto error; + } + + ret = devm_add_action(dev, power_supply_hwmon_bitmap_free, + psyhw->props); + if (ret) + goto error; + + for (i = 0; i < desc->num_properties; i++) { + const enum power_supply_property prop = desc->properties[i]; + + switch (prop) { + case POWER_SUPPLY_PROP_CURRENT_AVG: + case POWER_SUPPLY_PROP_CURRENT_MAX: + case POWER_SUPPLY_PROP_CURRENT_NOW: + case POWER_SUPPLY_PROP_TEMP: + case POWER_SUPPLY_PROP_TEMP_MAX: + case POWER_SUPPLY_PROP_TEMP_MIN: + case POWER_SUPPLY_PROP_TEMP_ALERT_MIN: + case POWER_SUPPLY_PROP_TEMP_ALERT_MAX: + case POWER_SUPPLY_PROP_TEMP_AMBIENT: + case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MIN: + case POWER_SUPPLY_PROP_TEMP_AMBIENT_ALERT_MAX: + case POWER_SUPPLY_PROP_VOLTAGE_AVG: + case POWER_SUPPLY_PROP_VOLTAGE_MIN: + case POWER_SUPPLY_PROP_VOLTAGE_MAX: + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + set_bit(prop, psyhw->props); + break; + default: + break; + } + } + + hwmon = devm_hwmon_device_register_with_info(dev, psy->desc->name, + psyhw, + &power_supply_hwmon_chip_info, + NULL); + ret = PTR_ERR_OR_ZERO(hwmon); + if (ret) + goto error; + + devres_close_group(dev, power_supply_add_hwmon_sysfs); + return 0; +error: + devres_release_group(dev, NULL); + return ret; +} + +void power_supply_remove_hwmon_sysfs(struct power_supply *psy) +{ + devres_release_group(&psy->dev, power_supply_add_hwmon_sysfs); +} diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h index d9c0c094f8a0..d5b15e039f4f 100644 --- a/include/linux/power_supply.h +++ b/include/linux/power_supply.h @@ -481,4 +481,17 @@ static inline bool power_supply_is_watt_property(enum power_supply_property psp) return 0; } +#ifdef CONFIG_POWER_SUPPLY_HWMON +int power_supply_add_hwmon_sysfs(struct power_supply *psy); +void power_supply_remove_hwmon_sysfs(struct power_supply *psy); +#else +static inline int power_supply_add_hwmon_sysfs(struct power_supply *psy) +{ + return 0; +} + +static inline +void power_supply_remove_hwmon_sysfs(struct power_supply *psy) {} +#endif + #endif /* __LINUX_POWER_SUPPLY_H__ */ -- cgit v1.2.3-59-g8ed1b From 38b37d631aec80da0c65ac03a7ef680b468c7857 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Fri, 7 Jun 2019 12:49:11 +0200 Subject: module: allow arch overrides for .exit section names Some archs like ARM store unwind information for .exit.text in sections with unusual names. As this unwind information refers to .exit.text, it must not be loaded when .exit.text is not loaded (when CONFIG_MODULE_UNLOAD is unset); otherwise, loading a module can fail due to relocation failures. 
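
For illustration (not part of this patch): the change below adds a __weak
default, so an architecture only has to supply a stronger definition of
module_exit_section(). A sketch of what such an override could look like,
modeled on the ARM case that motivated the change (the extra section names
here are illustrative assumptions):

	bool module_exit_section(const char *name)
	{
		/*
		 * ARM stores unwind data for .exit.text in separate
		 * sections; treat those as exit sections too, so they
		 * are dropped together with .exit.text itself when
		 * CONFIG_MODULE_UNLOAD is unset.
		 */
		return strstarts(name, ".exit") ||
		       strstarts(name, ".ARM.extab.exit") ||
		       strstarts(name, ".ARM.exidx.exit");
	}
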
Signed-off-by: Matthias Schiffer Signed-off-by: Jessica Yu --- include/linux/moduleloader.h | 5 +++++ kernel/module.c | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 31013c2effd3..5229c18025e9 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -29,6 +29,11 @@ void *module_alloc(unsigned long size); /* Free memory returned from module_alloc. */ void module_memfree(void *module_region); +/* Determines if the section name is an exit section (that is only used during + * module unloading) + */ +bool module_exit_section(const char *name); + /* * Apply the given relocation to the (simplified) ELF. Return -error * or 0. diff --git a/kernel/module.c b/kernel/module.c index 41258bab24f1..537c456ce3ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2748,6 +2748,11 @@ void * __weak module_alloc(unsigned long size) return vmalloc_exec(size); } +bool __weak module_exit_section(const char *name) +{ + return strstarts(name, ".exit"); +} + #ifdef CONFIG_DEBUG_KMEMLEAK static void kmemleak_load_module(const struct module *mod, const struct load_info *info) @@ -2937,7 +2942,7 @@ static int rewrite_section_headers(struct load_info *info, int flags) #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + if (module_exit_section(info->secstrings+shdr->sh_name)) shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; #endif } -- cgit v1.2.3-59-g8ed1b From 38ca87c6f1e514686d4a385246d1afe1e1f2e482 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 Jun 2019 18:52:46 +0300 Subject: RDMA/mlx5: Introduce and implement new IB_WR_REG_MR_INTEGRITY work request This new WR will be used to perform PI (protection information) handover using the new API. Using the new API, the user will post a single WR that will internally perform all the needed actions to complete PI operation. This new WR will use a memory region that was allocated as IB_MR_TYPE_INTEGRITY and was mapped using ib_map_mr_sg_pi to perform the registration. In the old API, in order to perform a signature handover operation, each ULP should perform the following: 1. Map and register the data buffers. 2. Map and register the protection buffers. 3. Post a special reg WR to configure the signature handover operation layout. 4. Invalidate the signature memory key. 5. Invalidate protection buffers memory key. 6. Invalidate data buffers memory key. In the new API, the mapping of both data and protection buffers is performed using a single call to ib_map_mr_sg_pi function. Also the registration of the buffers and the configuration of the signature operation layout is done by a single new work request called IB_WR_REG_MR_INTEGRITY. This patch implements this operation for mlx5 devices that are capable to offload data integrity generation/validation while performing the actual buffer transfer. This patch will not remove the old signature API that is used by the iSER initiator and target drivers. This will be done in the future. In the internal implementation, for each IB_WR_REG_MR_INTEGRITY work request, we are using a single UMR operation to register both data and protection buffers using KLM's. Afterwards, another UMR operation will describe the strided block format. These will be followed by 2 SET_PSV operations to set the memory/wire domains initial signature parameters passed by the user. 
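
For illustration (not part of this patch), a ULP's registration path with
the new API could look roughly like the sketch below; mr (allocated as
IB_MR_TYPE_INTEGRITY with ib_alloc_mr_integrity()), qp and the data and
protection scatterlists are assumed context, and error handling is elided:

	struct ib_reg_wr wr = {};
	const struct ib_send_wr *bad_wr;
	int count, ret;

	/* one call maps both the data and the protection SG lists */
	count = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
				prot_sg, prot_nents, NULL, SZ_4K);
	if (count < 0)
		return count;

	/* a single WR registers the MR and configures the PI operation */
	wr.wr.opcode = IB_WR_REG_MR_INTEGRITY;
	wr.wr.send_flags = IB_SEND_SIGNALED;
	wr.mr = mr;
	wr.key = mr->rkey;
	wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
		    IB_ACCESS_REMOTE_WRITE;
	ret = ib_post_send(qp, &wr.wr, &bad_wr);
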
In the end of the whole transaction, only the signature memory key (the one that exposed for the RDMA operation) will be invalidated. Signed-off-by: Max Gurtovoy Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 218 ++++++++++++++++++++++++++++++++++++---- include/linux/mlx5/qp.h | 3 +- include/rdma/ib_verbs.h | 1 + 3 files changed, 201 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ce8fccb04c3c..f6651b93e469 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4169,7 +4169,7 @@ static __be64 sig_mkey_mask(void) static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, struct mlx5_ib_mr *mr, u8 flags) { - int size = mr->ndescs * mr->desc_size; + int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; memset(umr, 0, sizeof(*umr)); @@ -4300,7 +4300,7 @@ static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, struct mlx5_ib_mr *mr, u32 key, int access) { - int ndescs = ALIGN(mr->ndescs, 8) >> 1; + int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1; memset(seg, 0, sizeof(*seg)); @@ -4351,7 +4351,7 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, struct mlx5_ib_mr *mr, struct mlx5_ib_pd *pd) { - int bcount = mr->desc_size * mr->ndescs; + int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs); dseg->addr = cpu_to_be64(mr->desc_map); dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); @@ -4544,23 +4544,52 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr, return 0; } -static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, - struct mlx5_ib_qp *qp, void **seg, - int *size, void **cur_edge) +static int set_sig_data_segment(const struct ib_send_wr *send_wr, + struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) { - struct ib_sig_attrs *sig_attrs = wr->sig_attrs; - struct ib_mr *sig_mr = wr->sig_mr; struct mlx5_bsf *bsf; - u32 data_len = wr->wr.sg_list->length; - u32 data_key = wr->wr.sg_list->lkey; - u64 data_va = wr->wr.sg_list->addr; + u32 data_len; + u32 data_key; + u64 data_va; + u32 prot_len = 0; + u32 prot_key = 0; + u64 prot_va = 0; + bool prot = false; int ret; int wqe_size; - if (!wr->prot || - (data_key == wr->prot->lkey && - data_va == wr->prot->addr && - data_len == wr->prot->length)) { + if (send_wr->opcode == IB_WR_REG_SIG_MR) { + const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); + + data_len = wr->wr.sg_list->length; + data_key = wr->wr.sg_list->lkey; + data_va = wr->wr.sg_list->addr; + if (wr->prot) { + prot_len = wr->prot->length; + prot_key = wr->prot->lkey; + prot_va = wr->prot->addr; + prot = true; + } + } else { + struct mlx5_ib_mr *mr = to_mmr(sig_mr); + struct mlx5_ib_mr *pi_mr = mr->pi_mr; + + data_len = pi_mr->data_length; + data_key = pi_mr->ibmr.lkey; + data_va = pi_mr->ibmr.iova; + if (pi_mr->meta_ndescs) { + prot_len = pi_mr->meta_length; + prot_key = pi_mr->ibmr.lkey; + prot_va = pi_mr->ibmr.iova + data_len; + prot = true; + } + } + + if (!prot || (data_key == prot_key && data_va == prot_va && + data_len == prot_len)) { /** * Source domain doesn't contain signature information * or data and protection are interleaved in memory. 
@@ -4594,8 +4623,6 @@ static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, struct mlx5_stride_block_ctrl_seg *sblock_ctrl; struct mlx5_stride_block_entry *data_sentry; struct mlx5_stride_block_entry *prot_sentry; - u32 prot_key = wr->prot->lkey; - u64 prot_va = wr->prot->addr; u16 block_size = sig_attrs->mem.sig.dif.pi_interval; int prot_size; @@ -4673,6 +4700,56 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, umr->mkey_mask = sig_mkey_mask(); } +static int set_pi_umr_wr(const struct ib_send_wr *send_wr, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + const struct ib_reg_wr *wr = reg_wr(send_wr); + struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr); + struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr; + struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs; + u32 pdn = get_pd(qp)->pdn; + u32 xlt_size; + int region_len, ret; + + if (unlikely(send_wr->num_sge != 0) || + unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = pi_mr->ibmr.length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE) + xlt_size = 0x30; + else + xlt_size = sizeof(struct mlx5_klm); + + set_sig_umr_segment(*seg, xlt_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len, + pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size, + cur_edge); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} static int set_sig_umr_wr(const struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, void **seg, int *size, @@ -4716,7 +4793,8 @@ static int set_sig_umr_wr(const struct ib_send_wr *send_wr, *size += sizeof(struct mlx5_mkey_seg) / 16; handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - ret = set_sig_data_segment(wr, qp, seg, size, cur_edge); + ret = set_sig_data_segment(send_wr, wr->sig_mr, wr->sig_attrs, qp, seg, + size, cur_edge); if (ret) return ret; @@ -4758,7 +4836,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); - size_t mr_list_size = mr->ndescs * mr->desc_size; + int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; u8 flags = 0; @@ -4899,8 +4977,11 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; + struct ib_reg_wr reg_pi_wr; struct mlx5_ib_qp *qp; struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *pi_mr; + struct ib_sig_attrs *sig_attrs; struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf; void *cur_edge; @@ -4954,7 +5035,8 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, goto out; } - if (wr->opcode == IB_WR_REG_MR) { + if (wr->opcode == IB_WR_REG_MR || + wr->opcode == 
IB_WR_REG_MR_INTEGRITY) { fence = dev->umr_fence; next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; } else { @@ -5012,6 +5094,102 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, num_sge = 0; break; + case IB_WR_REG_MR_INTEGRITY: + memset(®_pi_wr, 0, sizeof(struct ib_reg_wr)); + + mr = to_mmr(reg_wr(wr)->mr); + pi_mr = mr->pi_mr; + + reg_pi_wr.mr = &pi_mr->ibmr; + reg_pi_wr.access = reg_wr(wr)->access; + reg_pi_wr.key = pi_mr->ibmr.rkey; + + qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY; + ctrl->imm = cpu_to_be32(reg_pi_wr.key); + /* UMR for data + protection registration */ + err = set_reg_wr(qp, ®_pi_wr, &seg, &size, + &cur_edge, false); + if (err) { + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, fence, + MLX5_OPCODE_UMR); + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + ctrl->imm = cpu_to_be32(mr->ibmr.rkey); + /* UMR for sig MR */ + err = set_pi_umr_wr(wr, qp, &seg, &size, + &cur_edge); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, fence, + MLX5_OPCODE_UMR); + + /* + * SET_PSV WQEs are not signaled and solicited + * on error + */ + sig_attrs = mr->ibmr.sig_attrs; + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq, false, + true); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + err = set_psv_wr(&sig_attrs->mem, + mr->sig->psv_memory.psv_idx, + &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, next_fence, + MLX5_OPCODE_SET_PSV); + + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, &cur_edge, nreq, false, + true); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + err = set_psv_wr(&sig_attrs->wire, + mr->sig->psv_wire.psv_idx, + &seg, &size); + if (err) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, + wr->wr_id, nreq, next_fence, + MLX5_OPCODE_SET_PSV); + + qp->next_fence = + MLX5_FENCE_MODE_INITIATOR_SMALL; + num_sge = 0; + goto skip_psv; + case IB_WR_REG_SIG_MR: qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR; mr = to_mmr(sig_handover_wr(wr)->sig_mr); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3ba4edbd17a6..08e43cd9e742 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -37,7 +37,8 @@ #include #define MLX5_INVALID_LKEY 0x100 -#define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 5) +/* UMR (3 WQE_BB's) + SIG (3 WQE_BB's) + PSV (mem) + PSV (wire) */ +#define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 8) #define MLX5_DIF_SIZE 8 #define MLX5_STRIDE_BLOCK_OP 0x400 #define MLX5_CPY_GRD_MASK 0xc0 diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9169e798334f..28db256cbdb9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1236,6 +1236,7 @@ enum ib_wr_opcode { /* These are kernel only and can not be issued by userspace */ IB_WR_REG_MR = 0x20, IB_WR_REG_SIG_MR, + IB_WR_REG_MR_INTEGRITY, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. 
-- cgit v1.2.3-59-g8ed1b From ec6516bfbaf72e7c81811162b6de96322e32a027 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 13 Jun 2019 10:55:31 +0900 Subject: pinctrl: remove unneeded #ifdef around declarations What is the point in surrounding the whole of declarations with ifdef like this? #ifdef CONFIG_FOO int foo(void); #endif If CONFIG_FOO is not defined, all callers of foo() will fail with implicit declaration errors since the top Makefile adds -Werror-implicit-function-declaration to KBUILD_CFLAGS. This breaks the build earlier when you are doing something wrong. That's it. Anyway, it will fail to link since the definition of foo() is not compiled. In summary, these ifdef are unneeded. Signed-off-by: Masahiro Yamada Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 20 ++++++-------------- include/linux/pinctrl/pinconf.h | 4 ---- include/linux/pinctrl/pinctrl.h | 4 ---- include/linux/pinctrl/pinmux.h | 4 ---- 4 files changed, 6 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 72d06d6a3099..673828a52294 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -12,6 +12,12 @@ #ifndef __LINUX_PINCTRL_PINCONF_GENERIC_H #define __LINUX_PINCTRL_PINCONF_GENERIC_H +#include +#include + +struct pinctrl_dev; +struct pinctrl_map; + /** * enum pin_config_param - possible pin configuration parameters * @PIN_CONFIG_BIAS_BUS_HOLD: the pin will be set to weakly latch so that it @@ -159,9 +165,6 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param return PIN_CONF_PACKED(param, argument); } -#ifdef CONFIG_GENERIC_PINCONF - -#ifdef CONFIG_DEBUG_FS #define PCONFDUMP(a, b, c, d) { \ .param = a, .display = b, .format = c, .has_arg = d \ } @@ -172,14 +175,6 @@ struct pin_config_item { const char * const format; bool has_arg; }; -#endif /* CONFIG_DEBUG_FS */ - -#ifdef CONFIG_OF - -#include -#include -struct pinctrl_dev; -struct pinctrl_map; struct pinconf_generic_params { const char * const property; @@ -224,8 +219,5 @@ static inline int pinconf_generic_dt_node_to_map_all( return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps, PIN_MAP_TYPE_INVALID); } -#endif - -#endif /* CONFIG_GENERIC_PINCONF */ #endif /* __LINUX_PINCTRL_PINCONF_GENERIC_H */ diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h index 9bebc3554809..513883dcc5a9 100644 --- a/include/linux/pinctrl/pinconf.h +++ b/include/linux/pinctrl/pinconf.h @@ -12,8 +12,6 @@ #ifndef __LINUX_PINCTRL_PINCONF_H #define __LINUX_PINCTRL_PINCONF_H -#ifdef CONFIG_PINCONF - #include struct pinctrl_dev; @@ -67,6 +65,4 @@ struct pinconf_ops { unsigned long config); }; -#endif - #endif /* __LINUX_PINCTRL_PINCONF_H */ diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 36a79fe7b84f..27738164daa7 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -12,8 +12,6 @@ #ifndef __LINUX_PINCTRL_PINCTRL_H #define __LINUX_PINCTRL_PINCTRL_H -#ifdef CONFIG_PINCTRL - #include #include #include @@ -203,6 +201,4 @@ extern const char *pinctrl_dev_get_name(struct pinctrl_dev *pctldev); extern const char *pinctrl_dev_get_devname(struct pinctrl_dev *pctldev); extern void *pinctrl_dev_get_drvdata(struct pinctrl_dev *pctldev); -#endif /* !CONFIG_PINCTRL */ - #endif /* __LINUX_PINCTRL_PINCTRL_H */ diff --git a/include/linux/pinctrl/pinmux.h 
b/include/linux/pinctrl/pinmux.h
index ace60d775b20..566a5fe8eab5 100644
--- a/include/linux/pinctrl/pinmux.h
+++ b/include/linux/pinctrl/pinmux.h
@@ -16,8 +16,6 @@
 #include
 #include
 
-#ifdef CONFIG_PINMUX
-
 struct pinctrl_dev;
 
 /**
@@ -85,6 +83,4 @@ struct pinmux_ops {
	bool strict;
 };
 
-#endif /* CONFIG_PINMUX */
-
 #endif /* __LINUX_PINCTRL_PINMUX_H */
-- cgit v1.2.3-59-g8ed1b

From 2f25528e4edddc6eddd42c8d41c9c9e341c8b9da Mon Sep 17 00:00:00 2001
From: Sylwester Nawrocki
Date: Wed, 19 Jun 2019 11:39:25 +0200
Subject: clk: Add clk_bulk_get_optional() function

clk_bulk_get_optional() allows getting a group of clocks where one or
more are optional. For an unavailable clock, e.g. one not specified in
the clock consumer node in DT, its respective struct clk pointer will
be NULL. This allows for operating on a group of returned clocks
(struct clk_bulk_data array) with existing clk_bulk* APIs.

Signed-off-by: Sylwester Nawrocki
Signed-off-by: Stephen Boyd
---
 drivers/clk/clk-bulk.c | 23 ++++++++++++++++++++---
 include/linux/clk.h    | 19 +++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c
index 06499568cf07..524bf9a53098 100644
--- a/drivers/clk/clk-bulk.c
+++ b/drivers/clk/clk-bulk.c
@@ -75,8 +75,8 @@ void clk_bulk_put(int num_clks, struct clk_bulk_data *clks)
 }
 EXPORT_SYMBOL_GPL(clk_bulk_put);
 
-int __must_check clk_bulk_get(struct device *dev, int num_clks,
-			      struct clk_bulk_data *clks)
+static int __clk_bulk_get(struct device *dev, int num_clks,
+			  struct clk_bulk_data *clks, bool optional)
 {
	int ret;
	int i;
@@ -88,10 +88,14 @@ int __must_check clk_bulk_get(struct device *dev, int num_clks,
		clks[i].clk = clk_get(dev, clks[i].id);
		if (IS_ERR(clks[i].clk)) {
			ret = PTR_ERR(clks[i].clk);
+			clks[i].clk = NULL;
+
+			if (ret == -ENOENT && optional)
+				continue;
+
			if (ret != -EPROBE_DEFER)
				dev_err(dev, "Failed to get clk '%s': %d\n",
					clks[i].id, ret);
-			clks[i].clk = NULL;
			goto err;
		}
	}
@@ -103,8 +107,21 @@ err:
 
	return ret;
 }
+
+int __must_check clk_bulk_get(struct device *dev, int num_clks,
+			      struct clk_bulk_data *clks)
+{
+	return __clk_bulk_get(dev, num_clks, clks, false);
+}
 EXPORT_SYMBOL(clk_bulk_get);
 
+int __must_check clk_bulk_get_optional(struct device *dev, int num_clks,
+				       struct clk_bulk_data *clks)
+{
+	return __clk_bulk_get(dev, num_clks, clks, true);
+}
+EXPORT_SYMBOL_GPL(clk_bulk_get_optional);
+
 void clk_bulk_put_all(int num_clks, struct clk_bulk_data *clks)
 {
	if (IS_ERR_OR_NULL(clks))
diff --git a/include/linux/clk.h b/include/linux/clk.h
index f689fc58d7be..1b50e7d1675c 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -332,6 +332,19 @@ int __must_check clk_bulk_get(struct device *dev, int num_clks,
  */
 int __must_check clk_bulk_get_all(struct device *dev,
				   struct clk_bulk_data **clks);
+
+/**
+ * clk_bulk_get_optional - lookup and obtain a number of references to clock producer
+ * @dev: device for clock "consumer"
+ * @num_clks: the number of clk_bulk_data
+ * @clks: the clk_bulk_data table of consumer
+ *
+ * Behaves the same as clk_bulk_get() except where there is no clock producer.
+ * In this case, instead of returning -ENOENT, the function returns 0 and
+ * NULL for a clk for which a clock producer could not be determined.
+ */ +int __must_check clk_bulk_get_optional(struct device *dev, int num_clks, + struct clk_bulk_data *clks); /** * devm_clk_bulk_get - managed get multiple clk consumers * @dev: device for clock "consumer" @@ -718,6 +731,12 @@ static inline int __must_check clk_bulk_get(struct device *dev, int num_clks, return 0; } +static inline int __must_check clk_bulk_get_optional(struct device *dev, + int num_clks, struct clk_bulk_data *clks) +{ + return 0; +} + static inline int __must_check clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks) { -- cgit v1.2.3-59-g8ed1b From 9bd5ef0bd8743700d9adffb6fbb1baa346575457 Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Wed, 19 Jun 2019 11:39:26 +0200 Subject: clk: Add devm_clk_bulk_get_optional() function Add managed version of the clk_bulk_get_optional() helper function. Signed-off-by: Sylwester Nawrocki [sboyd@kernel.org: Mark __devm_clk_bulk_get() static] Signed-off-by: Stephen Boyd --- drivers/clk/clk-devres.c | 22 +++++++++++++++++++--- include/linux/clk.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-devres.c b/drivers/clk/clk-devres.c index daa1fc8fba53..be160764911b 100644 --- a/drivers/clk/clk-devres.c +++ b/drivers/clk/clk-devres.c @@ -52,8 +52,8 @@ static void devm_clk_bulk_release(struct device *dev, void *res) clk_bulk_put(devres->num_clks, devres->clks); } -int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, - struct clk_bulk_data *clks) +static int __devm_clk_bulk_get(struct device *dev, int num_clks, + struct clk_bulk_data *clks, bool optional) { struct clk_bulk_devres *devres; int ret; @@ -63,7 +63,10 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, if (!devres) return -ENOMEM; - ret = clk_bulk_get(dev, num_clks, clks); + if (optional) + ret = clk_bulk_get_optional(dev, num_clks, clks); + else + ret = clk_bulk_get(dev, num_clks, clks); if (!ret) { devres->clks = clks; devres->num_clks = num_clks; @@ -74,8 +77,21 @@ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, return ret; } + +int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, + struct clk_bulk_data *clks) +{ + return __devm_clk_bulk_get(dev, num_clks, clks, false); +} EXPORT_SYMBOL_GPL(devm_clk_bulk_get); +int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks, + struct clk_bulk_data *clks) +{ + return __devm_clk_bulk_get(dev, num_clks, clks, true); +} +EXPORT_SYMBOL_GPL(devm_clk_bulk_get_optional); + int __must_check devm_clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks) { diff --git a/include/linux/clk.h b/include/linux/clk.h index 1b50e7d1675c..5e7b2dd84965 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -359,6 +359,28 @@ int __must_check clk_bulk_get_optional(struct device *dev, int num_clks, */ int __must_check devm_clk_bulk_get(struct device *dev, int num_clks, struct clk_bulk_data *clks); +/** + * devm_clk_bulk_get_optional - managed get multiple optional consumer clocks + * @dev: device for clock "consumer" + * @clks: pointer to the clk_bulk_data table of consumer + * + * Behaves the same as devm_clk_bulk_get() except where there is no clock + * producer. In this case, instead of returning -ENOENT, the function returns + * NULL for given clk. It is assumed all clocks in clk_bulk_data are optional. 
+ * + * Returns 0 if all clocks specified in clk_bulk_data table are obtained + * successfully or for any clk there was no clk provider available, otherwise + * returns valid IS_ERR() condition containing errno. + * The implementation uses @dev and @clk_bulk_data.id to determine the + * clock consumer, and thereby the clock producer. + * The clock returned is stored in each @clk_bulk_data.clk field. + * + * Drivers must assume that the clock source is not enabled. + * + * clk_bulk_get should not be called from within interrupt context. + */ +int __must_check devm_clk_bulk_get_optional(struct device *dev, int num_clks, + struct clk_bulk_data *clks); /** * devm_clk_bulk_get_all - managed get multiple clk consumers * @dev: device for clock "consumer" @@ -760,6 +782,12 @@ static inline int __must_check devm_clk_bulk_get(struct device *dev, int num_clk return 0; } +static inline int __must_check devm_clk_bulk_get_optional(struct device *dev, + int num_clks, struct clk_bulk_data *clks) +{ + return 0; +} + static inline int __must_check devm_clk_bulk_get_all(struct device *dev, struct clk_bulk_data **clks) { -- cgit v1.2.3-59-g8ed1b From 550113d4e9f5c7b62be760fc01178c9e0139c1f4 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 24 Jun 2019 19:04:02 +0200 Subject: i2c: add newly exported functions to the header, too Nobody (including me) noticed that these functions were exported but not added to the header :/ Fixes: 7159dbdae3c5 ("i2c: core: improve return value handling of i2c_new_device and i2c_new_dummy") Signed-off-by: Wolfram Sang Reviewed-by: Bartosz Golaszewski Reviewed-by: Kieran Bingham Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-base.c | 5 ++--- include/linux/i2c.h | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index e7d5ada40d48..f1949d1e2b54 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -729,7 +729,7 @@ static int i2c_dev_irq_from_resources(const struct resource *resources, * This returns the new i2c client, which may be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ -static struct i2c_client * +struct i2c_client * i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info) { struct i2c_client *client; @@ -895,8 +895,7 @@ static struct i2c_driver dummy_driver = { * This returns the new i2c client, which should be saved for later use with * i2c_unregister_device(); or an ERR_PTR to describe the error. */ -static struct i2c_client * -i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address) +struct i2c_client *i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address) { struct i2c_board_info info = { I2C_BOARD_INFO("dummy", address), diff --git a/include/linux/i2c.h b/include/linux/i2c.h index d8f9060179d0..fa5552c2307b 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -442,6 +442,9 @@ struct i2c_board_info { extern struct i2c_client * i2c_new_device(struct i2c_adapter *adap, struct i2c_board_info const *info); +extern struct i2c_client * +i2c_new_client_device(struct i2c_adapter *adap, struct i2c_board_info const *info); + /* If you don't know the exact address of an I2C device, use this variant * instead, which can probe for device presence in a list of possible * addresses. The "probe" callback function is optional. 
If it is provided, @@ -463,6 +466,9 @@ extern int i2c_probe_func_quick_read(struct i2c_adapter *adap, unsigned short ad extern struct i2c_client * i2c_new_dummy(struct i2c_adapter *adap, u16 address); +extern struct i2c_client * +i2c_new_dummy_device(struct i2c_adapter *adapter, u16 address); + extern struct i2c_client * devm_i2c_new_dummy_device(struct device *dev, struct i2c_adapter *adap, u16 address); -- cgit v1.2.3-59-g8ed1b From 2a6a7aacd4e557a4c7007f8858bcc9654b098fea Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 3 Jun 2019 10:24:32 +0300 Subject: mfd: regulator: clk: Split rohm-bd718x7.h Split the bd718x7.h to ROHM common and bd718x7 specific parts so that we do not need to add same things in every new ROHM PMIC header. Please note that this change requires changes also in bd718x7 sub-device drivers for regulators and clk. Signed-off-by: Matti Vaittinen Acked-by: Mark Brown Acked-by: Stephen Boyd Signed-off-by: Lee Jones --- drivers/clk/clk-bd718x7.c | 6 +++--- drivers/mfd/rohm-bd718x7.c | 23 ++++++++++++----------- drivers/regulator/bd718x7-regulator.c | 25 +++++++++++++------------ include/linux/mfd/rohm-bd718x7.h | 22 ++++++++-------------- include/linux/mfd/rohm-generic.h | 20 ++++++++++++++++++++ 5 files changed, 56 insertions(+), 40 deletions(-) create mode 100644 include/linux/mfd/rohm-generic.h (limited to 'include/linux') diff --git a/drivers/clk/clk-bd718x7.c b/drivers/clk/clk-bd718x7.c index 60422c72d142..461228ebf703 100644 --- a/drivers/clk/clk-bd718x7.c +++ b/drivers/clk/clk-bd718x7.c @@ -17,7 +17,7 @@ struct bd718xx_clk { u8 reg; u8 mask; struct platform_device *pdev; - struct bd718xx *mfd; + struct rohm_regmap_dev *mfd; }; static int bd71837_clk_set(struct clk_hw *hw, int status) @@ -68,7 +68,7 @@ static int bd71837_clk_probe(struct platform_device *pdev) int rval = -ENOMEM; const char *parent_clk; struct device *parent = pdev->dev.parent; - struct bd718xx *mfd = dev_get_drvdata(parent); + struct rohm_regmap_dev *mfd = dev_get_drvdata(parent); struct clk_init_data init = { .name = "bd718xx-32k-out", .ops = &bd71837_clk_ops, @@ -119,5 +119,5 @@ static struct platform_driver bd71837_clk = { module_platform_driver(bd71837_clk); MODULE_AUTHOR("Matti Vaittinen "); -MODULE_DESCRIPTION("BD71837 chip clk driver"); +MODULE_DESCRIPTION("BD71837/BD71847 chip clk driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/mfd/rohm-bd718x7.c b/drivers/mfd/rohm-bd718x7.c index a29d529a96f4..7beb444a57cb 100644 --- a/drivers/mfd/rohm-bd718x7.c +++ b/drivers/mfd/rohm-bd718x7.c @@ -98,18 +98,19 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, return -ENOMEM; bd718xx->chip_irq = i2c->irq; - bd718xx->chip_type = (unsigned int)(uintptr_t) + bd718xx->chip.chip_type = (unsigned int)(uintptr_t) of_device_get_match_data(&i2c->dev); - bd718xx->dev = &i2c->dev; + bd718xx->chip.dev = &i2c->dev; dev_set_drvdata(&i2c->dev, bd718xx); - bd718xx->regmap = devm_regmap_init_i2c(i2c, &bd718xx_regmap_config); - if (IS_ERR(bd718xx->regmap)) { + bd718xx->chip.regmap = devm_regmap_init_i2c(i2c, + &bd718xx_regmap_config); + if (IS_ERR(bd718xx->chip.regmap)) { dev_err(&i2c->dev, "regmap initialization failed\n"); - return PTR_ERR(bd718xx->regmap); + return PTR_ERR(bd718xx->chip.regmap); } - ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->regmap, + ret = devm_regmap_add_irq_chip(&i2c->dev, bd718xx->chip.regmap, bd718xx->chip_irq, IRQF_ONESHOT, 0, &bd718xx_irq_chip, &bd718xx->irq_data); if (ret) { @@ -118,7 +119,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, } /* Configure short 
press to 10 milliseconds */ - ret = regmap_update_bits(bd718xx->regmap, + ret = regmap_update_bits(bd718xx->chip.regmap, BD718XX_REG_PWRONCONFIG0, BD718XX_PWRBTN_PRESS_DURATION_MASK, BD718XX_PWRBTN_SHORT_PRESS_10MS); @@ -129,7 +130,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, } /* Configure long press to 10 seconds */ - ret = regmap_update_bits(bd718xx->regmap, + ret = regmap_update_bits(bd718xx->chip.regmap, BD718XX_REG_PWRONCONFIG1, BD718XX_PWRBTN_PRESS_DURATION_MASK, BD718XX_PWRBTN_LONG_PRESS_10S); @@ -149,7 +150,7 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, button.irq = ret; - ret = devm_mfd_add_devices(bd718xx->dev, PLATFORM_DEVID_AUTO, + ret = devm_mfd_add_devices(bd718xx->chip.dev, PLATFORM_DEVID_AUTO, bd718xx_mfd_cells, ARRAY_SIZE(bd718xx_mfd_cells), NULL, 0, regmap_irq_get_domain(bd718xx->irq_data)); @@ -162,11 +163,11 @@ static int bd718xx_i2c_probe(struct i2c_client *i2c, static const struct of_device_id bd718xx_of_match[] = { { .compatible = "rohm,bd71837", - .data = (void *)BD718XX_TYPE_BD71837, + .data = (void *)ROHM_CHIP_TYPE_BD71837, }, { .compatible = "rohm,bd71847", - .data = (void *)BD718XX_TYPE_BD71847, + .data = (void *)ROHM_CHIP_TYPE_BD71847, }, { } }; diff --git a/drivers/regulator/bd718x7-regulator.c b/drivers/regulator/bd718x7-regulator.c index fde4264da6ff..ef2fc175a9ae 100644 --- a/drivers/regulator/bd718x7-regulator.c +++ b/drivers/regulator/bd718x7-regulator.c @@ -1152,12 +1152,12 @@ static int bd718xx_probe(struct platform_device *pdev) { struct bd718xx *mfd; struct regulator_config config = { 0 }; - struct bd718xx_pmic_inits pmic_regulators[] = { - [BD718XX_TYPE_BD71837] = { + struct bd718xx_pmic_inits pmic_regulators[ROHM_CHIP_TYPE_AMOUNT] = { + [ROHM_CHIP_TYPE_BD71837] = { .r_datas = bd71837_regulators, .r_amount = ARRAY_SIZE(bd71837_regulators), }, - [BD718XX_TYPE_BD71847] = { + [ROHM_CHIP_TYPE_BD71847] = { .r_datas = bd71847_regulators, .r_amount = ARRAY_SIZE(bd71847_regulators), }, @@ -1173,15 +1173,15 @@ static int bd718xx_probe(struct platform_device *pdev) goto err; } - if (mfd->chip_type >= BD718XX_TYPE_AMOUNT || - !pmic_regulators[mfd->chip_type].r_datas) { + if (mfd->chip.chip_type >= ROHM_CHIP_TYPE_AMOUNT || + !pmic_regulators[mfd->chip.chip_type].r_datas) { dev_err(&pdev->dev, "Unsupported chip type\n"); err = -EINVAL; goto err; } /* Register LOCK release */ - err = regmap_update_bits(mfd->regmap, BD718XX_REG_REGLOCK, + err = regmap_update_bits(mfd->chip.regmap, BD718XX_REG_REGLOCK, (REGLOCK_PWRSEQ | REGLOCK_VREG), 0); if (err) { dev_err(&pdev->dev, "Failed to unlock PMIC (%d)\n", err); @@ -1200,7 +1200,8 @@ static int bd718xx_probe(struct platform_device *pdev) * bit allowing HW defaults for power rails to be used */ if (!use_snvs) { - err = regmap_update_bits(mfd->regmap, BD718XX_REG_TRANS_COND1, + err = regmap_update_bits(mfd->chip.regmap, + BD718XX_REG_TRANS_COND1, BD718XX_ON_REQ_POWEROFF_MASK | BD718XX_SWRESET_POWEROFF_MASK | BD718XX_WDOG_POWEROFF_MASK | @@ -1215,17 +1216,17 @@ static int bd718xx_probe(struct platform_device *pdev) } } - for (i = 0; i < pmic_regulators[mfd->chip_type].r_amount; i++) { + for (i = 0; i < pmic_regulators[mfd->chip.chip_type].r_amount; i++) { const struct regulator_desc *desc; struct regulator_dev *rdev; const struct bd718xx_regulator_data *r; - r = &pmic_regulators[mfd->chip_type].r_datas[i]; + r = &pmic_regulators[mfd->chip.chip_type].r_datas[i]; desc = &r->desc; config.dev = pdev->dev.parent; - config.regmap = mfd->regmap; + config.regmap = mfd->chip.regmap; rdev = 
devm_regulator_register(&pdev->dev, desc, &config); if (IS_ERR(rdev)) { @@ -1254,7 +1255,7 @@ static int bd718xx_probe(struct platform_device *pdev) */ if (!use_snvs || !rdev->constraints->always_on || !rdev->constraints->boot_on) { - err = regmap_update_bits(mfd->regmap, r->init.reg, + err = regmap_update_bits(mfd->chip.regmap, r->init.reg, r->init.mask, r->init.val); if (err) { dev_err(&pdev->dev, @@ -1264,7 +1265,7 @@ static int bd718xx_probe(struct platform_device *pdev) } } for (j = 0; j < r->additional_init_amnt; j++) { - err = regmap_update_bits(mfd->regmap, + err = regmap_update_bits(mfd->chip.regmap, r->additional_inits[j].reg, r->additional_inits[j].mask, r->additional_inits[j].val); diff --git a/include/linux/mfd/rohm-bd718x7.h b/include/linux/mfd/rohm-bd718x7.h index fd194bfc836f..7f2dbde402a1 100644 --- a/include/linux/mfd/rohm-bd718x7.h +++ b/include/linux/mfd/rohm-bd718x7.h @@ -4,14 +4,9 @@ #ifndef __LINUX_MFD_BD718XX_H__ #define __LINUX_MFD_BD718XX_H__ +#include #include -enum { - BD718XX_TYPE_BD71837 = 0, - BD718XX_TYPE_BD71847, - BD718XX_TYPE_AMOUNT -}; - enum { BD718XX_BUCK1 = 0, BD718XX_BUCK2, @@ -321,18 +316,17 @@ enum { BD718XX_PWRBTN_LONG_PRESS_15S }; -struct bd718xx_clk; - struct bd718xx { - unsigned int chip_type; - struct device *dev; - struct regmap *regmap; - unsigned long int id; + /* + * Please keep this as the first member here as some + * drivers (clk) supporting more than one chip may only know this + * generic struct 'struct rohm_regmap_dev' and assume it is + * the first chunk of parent device's private data. + */ + struct rohm_regmap_dev chip; int chip_irq; struct regmap_irq_chip_data *irq_data; - - struct bd718xx_clk *clk; }; #endif /* __LINUX_MFD_BD718XX_H__ */ diff --git a/include/linux/mfd/rohm-generic.h b/include/linux/mfd/rohm-generic.h new file mode 100644 index 000000000000..bff15ac26f2c --- /dev/null +++ b/include/linux/mfd/rohm-generic.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (C) 2018 ROHM Semiconductors */ + +#ifndef __LINUX_MFD_ROHM_H__ +#define __LINUX_MFD_ROHM_H__ + +enum { + ROHM_CHIP_TYPE_BD71837 = 0, + ROHM_CHIP_TYPE_BD71847, + ROHM_CHIP_TYPE_BD70528, + ROHM_CHIP_TYPE_AMOUNT +}; + +struct rohm_regmap_dev { + unsigned int chip_type; + struct device *dev; + struct regmap *regmap; +}; + +#endif -- cgit v1.2.3-59-g8ed1b From 21b7c58fc1943f3aa8c18a994ab9bed4ae5aa72d Mon Sep 17 00:00:00 2001 From: Matti Vaittinen Date: Mon, 3 Jun 2019 10:25:08 +0300 Subject: mfd: bd70528: Support ROHM bd70528 PMIC core ROHM BD70528MWV is an ultra-low quiescent current general purpose single-chip power management IC for battery-powered portable devices. Add MFD core which enables chip access for following subdevices: - regulators/LED drivers - battery-charger - gpios - 32.768kHz clk - RTC - watchdog Signed-off-by: Matti Vaittinen Signed-off-by: Lee Jones --- drivers/mfd/Kconfig | 17 ++ drivers/mfd/Makefile | 2 + drivers/mfd/rohm-bd70528.c | 316 ++++++++++++++++++++++++++++++ include/linux/mfd/rohm-bd70528.h | 408 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 743 insertions(+) create mode 100644 drivers/mfd/rohm-bd70528.c create mode 100644 include/linux/mfd/rohm-bd70528.h (limited to 'include/linux') diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 294d9567cc71..11fc53d78c5f 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -1890,6 +1890,23 @@ config MFD_ROHM_BD718XX NXP i.MX8. 
	  It contains 8 BUCK outputs and 7 LDOs, voltage monitoring and
	  emergency shut down as well as 32,768KHz clock output.
 
+config MFD_ROHM_BD70528
+	tristate "ROHM BD70528 Power Management IC"
+	depends on I2C=y
+	depends on OF
+	select REGMAP_I2C
+	select REGMAP_IRQ
+	select MFD_CORE
+	help
+	  Select this option to get support for the ROHM BD70528 Power
+	  Management IC. BD70528 is a general purpose single-chip power
+	  management IC for battery-powered portable devices. It contains
+	  3 ultra-low current consumption buck converters, 3 LDOs and 2 LED
+	  drivers. Also included are 4 GPIOs, a real-time clock (RTC), a 32kHz
+	  crystal oscillator, high-accuracy VREF for use with an external ADC,
+	  a 10-bit SAR ADC for battery temperature monitoring and a 1S battery
+	  charger.
+
 config MFD_STM32_LPTIMER
	tristate "Support for STM32 Low-Power Timer"
	depends on (ARCH_STM32 && OF) || COMPILE_TEST
diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile
index 52b1a90ff515..643d65bcb6ea 100644
--- a/drivers/mfd/Makefile
+++ b/drivers/mfd/Makefile
@@ -247,5 +247,7 @@ obj-$(CONFIG_MFD_STM32_TIMERS) 	+= stm32-timers.o
 obj-$(CONFIG_MFD_MXS_LRADC)     += mxs-lradc.o
 obj-$(CONFIG_MFD_SC27XX_PMIC)	+= sprd-sc27xx-spi.o
 obj-$(CONFIG_RAVE_SP_CORE)	+= rave-sp.o
+obj-$(CONFIG_MFD_ROHM_BD70528)	+= rohm-bd70528.o
 obj-$(CONFIG_MFD_ROHM_BD718XX) += rohm-bd718x7.o
 obj-$(CONFIG_MFD_STMFX) 	+= stmfx.o
+
diff --git a/drivers/mfd/rohm-bd70528.c b/drivers/mfd/rohm-bd70528.c
new file mode 100644
index 000000000000..55599d5c5c86
--- /dev/null
+++ b/drivers/mfd/rohm-bd70528.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+//
+// Copyright (C) 2019 ROHM Semiconductors
+//
+// ROHM BD70528 PMIC driver
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define BD70528_NUM_OF_GPIOS 4
+
+static const struct resource rtc_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_RTC_ALARM, "bd70528-rtc-alm"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_ELPS_TIM, "bd70528-elapsed-timer"),
+};
+
+static const struct resource charger_irqs[] = {
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_OV_RES, "bd70528-bat-ov-res"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_OV_DET, "bd70528-bat-ov-det"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DBAT_DET, "bd70528-bat-dead"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_COLD_RES, "bd70528-bat-warmed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_COLD_DET, "bd70528-bat-cold"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_HOT_RES, "bd70528-bat-cooled"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BATTSD_HOT_DET, "bd70528-bat-hot"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_CHG_TSD, "bd70528-chg-tshd"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_RMV, "bd70528-bat-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_BAT_DET, "bd70528-bat-detected"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_OV_RES, "bd70528-dcin2-ov-res"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_OV_DET, "bd70528-dcin2-ov-det"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_RMV, "bd70528-dcin2-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN2_DET, "bd70528-dcin2-detected"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN1_RMV, "bd70528-dcin1-removed"),
+	DEFINE_RES_IRQ_NAMED(BD70528_INT_DCIN1_DET, "bd70528-dcin1-detected"),
+};
+
+static struct mfd_cell bd70528_mfd_cells[] = {
+	{ .name = "bd70528-pmic", },
+	{ .name = "bd70528-gpio", },
+	/*
+	 * We use BD71837 driver to drive the clock block. Only differences to
+	 * BD70528 clock gate are the register address and mask.
+ */ + { .name = "bd718xx-clk", }, + { .name = "bd70528-wdt", }, + { + .name = "bd70528-power", + .resources = charger_irqs, + .num_resources = ARRAY_SIZE(charger_irqs), + }, { + .name = "bd70528-rtc", + .resources = rtc_irqs, + .num_resources = ARRAY_SIZE(rtc_irqs), + }, +}; + +static const struct regmap_range volatile_ranges[] = { + { + .range_min = BD70528_REG_INT_MAIN, + .range_max = BD70528_REG_INT_OP_FAIL, + }, { + .range_min = BD70528_REG_RTC_COUNT_H, + .range_max = BD70528_REG_RTC_ALM_REPEAT, + }, { + /* + * WDT control reg is special. Magic values must be written to + * it in order to change the control. Should not be cached. + */ + .range_min = BD70528_REG_WDT_CTRL, + .range_max = BD70528_REG_WDT_CTRL, + }, { + /* + * BD70528 also contains a few other registers which require + * magic sequences to be written in order to update the value. + * At least SHIPMODE, HWRESET, WARMRESET,and STANDBY + */ + .range_min = BD70528_REG_SHIPMODE, + .range_max = BD70528_REG_STANDBY, + }, +}; + +static const struct regmap_access_table volatile_regs = { + .yes_ranges = &volatile_ranges[0], + .n_yes_ranges = ARRAY_SIZE(volatile_ranges), +}; + +static struct regmap_config bd70528_regmap = { + .reg_bits = 8, + .val_bits = 8, + .volatile_table = &volatile_regs, + .max_register = BD70528_MAX_REGISTER, + .cache_type = REGCACHE_RBTREE, +}; + +/* + * Mapping of main IRQ register bits to sub-IRQ register offsets so that we can + * access corect sub-IRQ registers based on bits that are set in main IRQ + * register. + */ + +/* bit [0] - Shutdown register */ +unsigned int bit0_offsets[] = {0}; /* Shutdown register */ +unsigned int bit1_offsets[] = {1}; /* Power failure register */ +unsigned int bit2_offsets[] = {2}; /* VR FAULT register */ +unsigned int bit3_offsets[] = {3}; /* PMU register interrupts */ +unsigned int bit4_offsets[] = {4, 5}; /* Charger 1 and Charger 2 registers */ +unsigned int bit5_offsets[] = {6}; /* RTC register */ +unsigned int bit6_offsets[] = {7}; /* GPIO register */ +unsigned int bit7_offsets[] = {8}; /* Invalid operation register */ + +static struct regmap_irq_sub_irq_map bd70528_sub_irq_offsets[] = { + REGMAP_IRQ_MAIN_REG_OFFSET(bit0_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit1_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit2_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit3_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit4_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit5_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit6_offsets), + REGMAP_IRQ_MAIN_REG_OFFSET(bit7_offsets), +}; + +static struct regmap_irq bd70528_irqs[] = { + REGMAP_IRQ_REG(BD70528_INT_LONGPUSH, 0, BD70528_INT_LONGPUSH_MASK), + REGMAP_IRQ_REG(BD70528_INT_WDT, 0, BD70528_INT_WDT_MASK), + REGMAP_IRQ_REG(BD70528_INT_HWRESET, 0, BD70528_INT_HWRESET_MASK), + REGMAP_IRQ_REG(BD70528_INT_RSTB_FAULT, 0, BD70528_INT_RSTB_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_VBAT_UVLO, 0, BD70528_INT_VBAT_UVLO_MASK), + REGMAP_IRQ_REG(BD70528_INT_TSD, 0, BD70528_INT_TSD_MASK), + REGMAP_IRQ_REG(BD70528_INT_RSTIN, 0, BD70528_INT_RSTIN_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK1_FAULT, 1, + BD70528_INT_BUCK1_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK2_FAULT, 1, + BD70528_INT_BUCK2_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK3_FAULT, 1, + BD70528_INT_BUCK3_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_LDO1_FAULT, 1, BD70528_INT_LDO1_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_LDO2_FAULT, 1, BD70528_INT_LDO2_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_LDO3_FAULT, 1, BD70528_INT_LDO3_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED1_FAULT, 1, BD70528_INT_LED1_FAULT_MASK), + 
REGMAP_IRQ_REG(BD70528_INT_LED2_FAULT, 1, BD70528_INT_LED2_FAULT_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK1_OCP, 2, BD70528_INT_BUCK1_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK2_OCP, 2, BD70528_INT_BUCK2_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK3_OCP, 2, BD70528_INT_BUCK3_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED1_OCP, 2, BD70528_INT_LED1_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED2_OCP, 2, BD70528_INT_LED2_OCP_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK1_FULLON, 2, + BD70528_INT_BUCK1_FULLON_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK2_FULLON, 2, + BD70528_INT_BUCK2_FULLON_MASK), + REGMAP_IRQ_REG(BD70528_INT_SHORTPUSH, 3, BD70528_INT_SHORTPUSH_MASK), + REGMAP_IRQ_REG(BD70528_INT_AUTO_WAKEUP, 3, + BD70528_INT_AUTO_WAKEUP_MASK), + REGMAP_IRQ_REG(BD70528_INT_STATE_CHANGE, 3, + BD70528_INT_STATE_CHANGE_MASK), + REGMAP_IRQ_REG(BD70528_INT_BAT_OV_RES, 4, BD70528_INT_BAT_OV_RES_MASK), + REGMAP_IRQ_REG(BD70528_INT_BAT_OV_DET, 4, BD70528_INT_BAT_OV_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_DBAT_DET, 4, BD70528_INT_DBAT_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_BATTSD_COLD_RES, 4, + BD70528_INT_BATTSD_COLD_RES_MASK), + REGMAP_IRQ_REG(BD70528_INT_BATTSD_COLD_DET, 4, + BD70528_INT_BATTSD_COLD_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_BATTSD_HOT_RES, 4, + BD70528_INT_BATTSD_HOT_RES_MASK), + REGMAP_IRQ_REG(BD70528_INT_BATTSD_HOT_DET, 4, + BD70528_INT_BATTSD_HOT_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_CHG_TSD, 4, BD70528_INT_CHG_TSD_MASK), + REGMAP_IRQ_REG(BD70528_INT_BAT_RMV, 5, BD70528_INT_BAT_RMV_MASK), + REGMAP_IRQ_REG(BD70528_INT_BAT_DET, 5, BD70528_INT_BAT_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN2_OV_RES, 5, + BD70528_INT_DCIN2_OV_RES_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN2_OV_DET, 5, + BD70528_INT_DCIN2_OV_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN2_RMV, 5, BD70528_INT_DCIN2_RMV_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN2_DET, 5, BD70528_INT_DCIN2_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN1_RMV, 5, BD70528_INT_DCIN1_RMV_MASK), + REGMAP_IRQ_REG(BD70528_INT_DCIN1_DET, 5, BD70528_INT_DCIN1_DET_MASK), + REGMAP_IRQ_REG(BD70528_INT_RTC_ALARM, 6, BD70528_INT_RTC_ALARM_MASK), + REGMAP_IRQ_REG(BD70528_INT_ELPS_TIM, 6, BD70528_INT_ELPS_TIM_MASK), + REGMAP_IRQ_REG(BD70528_INT_GPIO0, 7, BD70528_INT_GPIO0_MASK), + REGMAP_IRQ_REG(BD70528_INT_GPIO1, 7, BD70528_INT_GPIO1_MASK), + REGMAP_IRQ_REG(BD70528_INT_GPIO2, 7, BD70528_INT_GPIO2_MASK), + REGMAP_IRQ_REG(BD70528_INT_GPIO3, 7, BD70528_INT_GPIO3_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK1_DVS_OPFAIL, 8, + BD70528_INT_BUCK1_DVS_OPFAIL_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK2_DVS_OPFAIL, 8, + BD70528_INT_BUCK2_DVS_OPFAIL_MASK), + REGMAP_IRQ_REG(BD70528_INT_BUCK3_DVS_OPFAIL, 8, + BD70528_INT_BUCK3_DVS_OPFAIL_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED1_VOLT_OPFAIL, 8, + BD70528_INT_LED1_VOLT_OPFAIL_MASK), + REGMAP_IRQ_REG(BD70528_INT_LED2_VOLT_OPFAIL, 8, + BD70528_INT_LED2_VOLT_OPFAIL_MASK), +}; + +static struct regmap_irq_chip bd70528_irq_chip = { + .name = "bd70528_irq", + .main_status = BD70528_REG_INT_MAIN, + .irqs = &bd70528_irqs[0], + .num_irqs = ARRAY_SIZE(bd70528_irqs), + .status_base = BD70528_REG_INT_SHDN, + .mask_base = BD70528_REG_INT_SHDN_MASK, + .ack_base = BD70528_REG_INT_SHDN, + .type_base = BD70528_REG_GPIO1_IN, + .init_ack_masked = true, + .num_regs = 9, + .num_main_regs = 1, + .num_type_reg = 4, + .sub_reg_offsets = &bd70528_sub_irq_offsets[0], + .num_main_status_bits = 8, + .irq_reg_stride = 1, +}; + +static int bd70528_i2c_probe(struct i2c_client *i2c, + const struct i2c_device_id *id) +{ + struct bd70528_data *bd70528; + struct 
regmap_irq_chip_data *irq_data; + int ret, i; + + if (!i2c->irq) { + dev_err(&i2c->dev, "No IRQ configured\n"); + return -EINVAL; + } + + bd70528 = devm_kzalloc(&i2c->dev, sizeof(*bd70528), GFP_KERNEL); + if (!bd70528) + return -ENOMEM; + + mutex_init(&bd70528->rtc_timer_lock); + + dev_set_drvdata(&i2c->dev, &bd70528->chip); + + bd70528->chip.chip_type = ROHM_CHIP_TYPE_BD70528; + bd70528->chip.regmap = devm_regmap_init_i2c(i2c, &bd70528_regmap); + if (IS_ERR(bd70528->chip.regmap)) { + dev_err(&i2c->dev, "Failed to initialize Regmap\n"); + return PTR_ERR(bd70528->chip.regmap); + } + + /* + * Disallow type setting for all IRQs by default as most of them do not + * support setting type. + */ + for (i = 0; i < ARRAY_SIZE(bd70528_irqs); i++) + bd70528_irqs[i].type.types_supported = 0; + + /* Set IRQ typesetting information for GPIO pins 0 - 3 */ + for (i = 0; i < BD70528_NUM_OF_GPIOS; i++) { + struct regmap_irq_type *type; + + type = &bd70528_irqs[BD70528_INT_GPIO0 + i].type; + type->type_reg_offset = 2 * i; + type->type_rising_val = 0x20; + type->type_falling_val = 0x10; + type->type_level_high_val = 0x40; + type->type_level_low_val = 0x50; + type->types_supported = (IRQ_TYPE_EDGE_BOTH | + IRQ_TYPE_LEVEL_HIGH | IRQ_TYPE_LEVEL_LOW); + } + + ret = devm_regmap_add_irq_chip(&i2c->dev, bd70528->chip.regmap, + i2c->irq, IRQF_ONESHOT, 0, + &bd70528_irq_chip, &irq_data); + if (ret) { + dev_err(&i2c->dev, "Failed to add IRQ chip\n"); + return ret; + } + dev_dbg(&i2c->dev, "Registered %d IRQs for chip\n", + bd70528_irq_chip.num_irqs); + + /* + * BD70528 IRQ controller is not touching the main mask register. + * So enable the GPIO block interrupts at main level. We can just leave + * them enabled as the IRQ controller should disable IRQs from + * sub-registers when IRQ is disabled or freed. 
+ */ + ret = regmap_update_bits(bd70528->chip.regmap, + BD70528_REG_INT_MAIN_MASK, + BD70528_INT_GPIO_MASK, 0); + + ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO, + bd70528_mfd_cells, + ARRAY_SIZE(bd70528_mfd_cells), NULL, 0, + regmap_irq_get_domain(irq_data)); + if (ret) + dev_err(&i2c->dev, "Failed to create subdevices\n"); + + return ret; +} + +static const struct of_device_id bd70528_of_match[] = { + { .compatible = "rohm,bd70528", }, + { }, +}; +MODULE_DEVICE_TABLE(of, bd70528_of_match); + +static struct i2c_driver bd70528_drv = { + .driver = { + .name = "rohm-bd70528", + .of_match_table = bd70528_of_match, + }, + .probe = &bd70528_i2c_probe, +}; + +module_i2c_driver(bd70528_drv); + +MODULE_AUTHOR("Matti Vaittinen "); +MODULE_DESCRIPTION("ROHM BD70528 Power Management IC driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mfd/rohm-bd70528.h b/include/linux/mfd/rohm-bd70528.h new file mode 100644 index 000000000000..1013e60c5b25 --- /dev/null +++ b/include/linux/mfd/rohm-bd70528.h @@ -0,0 +1,408 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Copyright (C) 2018 ROHM Semiconductors */ + +#ifndef __LINUX_MFD_BD70528_H__ +#define __LINUX_MFD_BD70528_H__ + +#include +#include +#include +#include + +enum { + BD70528_BUCK1, + BD70528_BUCK2, + BD70528_BUCK3, + BD70528_LDO1, + BD70528_LDO2, + BD70528_LDO3, + BD70528_LED1, + BD70528_LED2, +}; + +struct bd70528_data { + struct rohm_regmap_dev chip; + struct mutex rtc_timer_lock; +}; + +#define BD70528_BUCK_VOLTS 17 +#define BD70528_BUCK_VOLTS 17 +#define BD70528_BUCK_VOLTS 17 +#define BD70528_LDO_VOLTS 0x20 + +#define BD70528_REG_BUCK1_EN 0x0F +#define BD70528_REG_BUCK1_VOLT 0x15 +#define BD70528_REG_BUCK2_EN 0x10 +#define BD70528_REG_BUCK2_VOLT 0x16 +#define BD70528_REG_BUCK3_EN 0x11 +#define BD70528_REG_BUCK3_VOLT 0x17 +#define BD70528_REG_LDO1_EN 0x1b +#define BD70528_REG_LDO1_VOLT 0x1e +#define BD70528_REG_LDO2_EN 0x1c +#define BD70528_REG_LDO2_VOLT 0x1f +#define BD70528_REG_LDO3_EN 0x1d +#define BD70528_REG_LDO3_VOLT 0x20 +#define BD70528_REG_LED_CTRL 0x2b +#define BD70528_REG_LED_VOLT 0x29 +#define BD70528_REG_LED_EN 0x2a + +/* main irq registers */ +#define BD70528_REG_INT_MAIN 0x7E +#define BD70528_REG_INT_MAIN_MASK 0x74 + +/* 'sub irq' registers */ +#define BD70528_REG_INT_SHDN 0x7F +#define BD70528_REG_INT_PWR_FLT 0x80 +#define BD70528_REG_INT_VR_FLT 0x81 +#define BD70528_REG_INT_MISC 0x82 +#define BD70528_REG_INT_BAT1 0x83 +#define BD70528_REG_INT_BAT2 0x84 +#define BD70528_REG_INT_RTC 0x85 +#define BD70528_REG_INT_GPIO 0x86 +#define BD70528_REG_INT_OP_FAIL 0x87 + +#define BD70528_REG_INT_SHDN_MASK 0x75 +#define BD70528_REG_INT_PWR_FLT_MASK 0x76 +#define BD70528_REG_INT_VR_FLT_MASK 0x77 +#define BD70528_REG_INT_MISC_MASK 0x78 +#define BD70528_REG_INT_BAT1_MASK 0x79 +#define BD70528_REG_INT_BAT2_MASK 0x7a +#define BD70528_REG_INT_RTC_MASK 0x7b +#define BD70528_REG_INT_GPIO_MASK 0x7c +#define BD70528_REG_INT_OP_FAIL_MASK 0x7d + +/* Reset related 'magic' registers */ +#define BD70528_REG_SHIPMODE 0x03 +#define BD70528_REG_HWRESET 0x04 +#define BD70528_REG_WARMRESET 0x05 +#define BD70528_REG_STANDBY 0x06 + +/* GPIO registers */ +#define BD70528_REG_GPIO_STATE 0x8F + +#define BD70528_REG_GPIO1_IN 0x4d +#define BD70528_REG_GPIO2_IN 0x4f +#define BD70528_REG_GPIO3_IN 0x51 +#define BD70528_REG_GPIO4_IN 0x53 +#define BD70528_REG_GPIO1_OUT 0x4e +#define BD70528_REG_GPIO2_OUT 0x50 +#define BD70528_REG_GPIO3_OUT 0x52 +#define BD70528_REG_GPIO4_OUT 0x54 + +/* clk control */ + +#define BD70528_REG_CLK_OUT 0x2c + +/* RTC 
*/ + +#define BD70528_REG_RTC_COUNT_H 0x2d +#define BD70528_REG_RTC_COUNT_L 0x2e +#define BD70528_REG_RTC_SEC 0x2f +#define BD70528_REG_RTC_MINUTE 0x30 +#define BD70528_REG_RTC_HOUR 0x31 +#define BD70528_REG_RTC_WEEK 0x32 +#define BD70528_REG_RTC_DAY 0x33 +#define BD70528_REG_RTC_MONTH 0x34 +#define BD70528_REG_RTC_YEAR 0x35 + +#define BD70528_REG_RTC_ALM_SEC 0x36 +#define BD70528_REG_RTC_ALM_START BD70528_REG_RTC_ALM_SEC +#define BD70528_REG_RTC_ALM_MINUTE 0x37 +#define BD70528_REG_RTC_ALM_HOUR 0x38 +#define BD70528_REG_RTC_ALM_WEEK 0x39 +#define BD70528_REG_RTC_ALM_DAY 0x3a +#define BD70528_REG_RTC_ALM_MONTH 0x3b +#define BD70528_REG_RTC_ALM_YEAR 0x3c +#define BD70528_REG_RTC_ALM_MASK 0x3d +#define BD70528_REG_RTC_ALM_REPEAT 0x3e +#define BD70528_REG_RTC_START BD70528_REG_RTC_SEC + +#define BD70528_REG_RTC_WAKE_SEC 0x43 +#define BD70528_REG_RTC_WAKE_START BD70528_REG_RTC_WAKE_SEC +#define BD70528_REG_RTC_WAKE_MIN 0x44 +#define BD70528_REG_RTC_WAKE_HOUR 0x45 +#define BD70528_REG_RTC_WAKE_CTRL 0x46 + +#define BD70528_REG_ELAPSED_TIMER_EN 0x42 +#define BD70528_REG_WAKE_EN 0x46 + +/* WDT registers */ +#define BD70528_REG_WDT_CTRL 0x4A +#define BD70528_REG_WDT_HOUR 0x49 +#define BD70528_REG_WDT_MINUTE 0x48 +#define BD70528_REG_WDT_SEC 0x47 + +/* Charger / Battery */ +#define BD70528_REG_CHG_CURR_STAT 0x59 +#define BD70528_REG_CHG_BAT_STAT 0x57 +#define BD70528_REG_CHG_BAT_TEMP 0x58 +#define BD70528_REG_CHG_IN_STAT 0x56 +#define BD70528_REG_CHG_DCIN_ILIM 0x5d +#define BD70528_REG_CHG_CHG_CURR_WARM 0x61 +#define BD70528_REG_CHG_CHG_CURR_COLD 0x62 + +/* Masks for main IRQ register bits */ +enum { + BD70528_INT_SHDN, +#define BD70528_INT_SHDN_MASK BIT(BD70528_INT_SHDN) + BD70528_INT_PWR_FLT, +#define BD70528_INT_PWR_FLT_MASK BIT(BD70528_INT_PWR_FLT) + BD70528_INT_VR_FLT, +#define BD70528_INT_VR_FLT_MASK BIT(BD70528_INT_VR_FLT) + BD70528_INT_MISC, +#define BD70528_INT_MISC_MASK BIT(BD70528_INT_MISC) + BD70528_INT_BAT1, +#define BD70528_INT_BAT1_MASK BIT(BD70528_INT_BAT1) + BD70528_INT_RTC, +#define BD70528_INT_RTC_MASK BIT(BD70528_INT_RTC) + BD70528_INT_GPIO, +#define BD70528_INT_GPIO_MASK BIT(BD70528_INT_GPIO) + BD70528_INT_OP_FAIL, +#define BD70528_INT_OP_FAIL_MASK BIT(BD70528_INT_OP_FAIL) +}; + +/* IRQs */ +enum { + /* Shutdown register IRQs */ + BD70528_INT_LONGPUSH, + BD70528_INT_WDT, + BD70528_INT_HWRESET, + BD70528_INT_RSTB_FAULT, + BD70528_INT_VBAT_UVLO, + BD70528_INT_TSD, + BD70528_INT_RSTIN, + /* Power failure register IRQs */ + BD70528_INT_BUCK1_FAULT, + BD70528_INT_BUCK2_FAULT, + BD70528_INT_BUCK3_FAULT, + BD70528_INT_LDO1_FAULT, + BD70528_INT_LDO2_FAULT, + BD70528_INT_LDO3_FAULT, + BD70528_INT_LED1_FAULT, + BD70528_INT_LED2_FAULT, + /* VR FAULT register IRQs */ + BD70528_INT_BUCK1_OCP, + BD70528_INT_BUCK2_OCP, + BD70528_INT_BUCK3_OCP, + BD70528_INT_LED1_OCP, + BD70528_INT_LED2_OCP, + BD70528_INT_BUCK1_FULLON, + BD70528_INT_BUCK2_FULLON, + /* PMU register interrupts */ + BD70528_INT_SHORTPUSH, + BD70528_INT_AUTO_WAKEUP, + BD70528_INT_STATE_CHANGE, + /* Charger 1 register IRQs */ + BD70528_INT_BAT_OV_RES, + BD70528_INT_BAT_OV_DET, + BD70528_INT_DBAT_DET, + BD70528_INT_BATTSD_COLD_RES, + BD70528_INT_BATTSD_COLD_DET, + BD70528_INT_BATTSD_HOT_RES, + BD70528_INT_BATTSD_HOT_DET, + BD70528_INT_CHG_TSD, + /* Charger 2 register IRQs */ + BD70528_INT_BAT_RMV, + BD70528_INT_BAT_DET, + BD70528_INT_DCIN2_OV_RES, + BD70528_INT_DCIN2_OV_DET, + BD70528_INT_DCIN2_RMV, + BD70528_INT_DCIN2_DET, + BD70528_INT_DCIN1_RMV, + BD70528_INT_DCIN1_DET, + /* RTC register IRQs */ + BD70528_INT_RTC_ALARM, + 
BD70528_INT_ELPS_TIM, + /* GPIO register IRQs */ + BD70528_INT_GPIO0, + BD70528_INT_GPIO1, + BD70528_INT_GPIO2, + BD70528_INT_GPIO3, + /* Invalid operation register IRQs */ + BD70528_INT_BUCK1_DVS_OPFAIL, + BD70528_INT_BUCK2_DVS_OPFAIL, + BD70528_INT_BUCK3_DVS_OPFAIL, + BD70528_INT_LED1_VOLT_OPFAIL, + BD70528_INT_LED2_VOLT_OPFAIL, +}; + +/* Masks */ +#define BD70528_INT_LONGPUSH_MASK 0x1 +#define BD70528_INT_WDT_MASK 0x2 +#define BD70528_INT_HWRESET_MASK 0x4 +#define BD70528_INT_RSTB_FAULT_MASK 0x8 +#define BD70528_INT_VBAT_UVLO_MASK 0x10 +#define BD70528_INT_TSD_MASK 0x20 +#define BD70528_INT_RSTIN_MASK 0x40 + +#define BD70528_INT_BUCK1_FAULT_MASK 0x1 +#define BD70528_INT_BUCK2_FAULT_MASK 0x2 +#define BD70528_INT_BUCK3_FAULT_MASK 0x4 +#define BD70528_INT_LDO1_FAULT_MASK 0x8 +#define BD70528_INT_LDO2_FAULT_MASK 0x10 +#define BD70528_INT_LDO3_FAULT_MASK 0x20 +#define BD70528_INT_LED1_FAULT_MASK 0x40 +#define BD70528_INT_LED2_FAULT_MASK 0x80 + +#define BD70528_INT_BUCK1_OCP_MASK 0x1 +#define BD70528_INT_BUCK2_OCP_MASK 0x2 +#define BD70528_INT_BUCK3_OCP_MASK 0x4 +#define BD70528_INT_LED1_OCP_MASK 0x8 +#define BD70528_INT_LED2_OCP_MASK 0x10 +#define BD70528_INT_BUCK1_FULLON_MASK 0x20 +#define BD70528_INT_BUCK2_FULLON_MASK 0x40 + +#define BD70528_INT_SHORTPUSH_MASK 0x1 +#define BD70528_INT_AUTO_WAKEUP_MASK 0x2 +#define BD70528_INT_STATE_CHANGE_MASK 0x10 + +#define BD70528_INT_BAT_OV_RES_MASK 0x1 +#define BD70528_INT_BAT_OV_DET_MASK 0x2 +#define BD70528_INT_DBAT_DET_MASK 0x4 +#define BD70528_INT_BATTSD_COLD_RES_MASK 0x8 +#define BD70528_INT_BATTSD_COLD_DET_MASK 0x10 +#define BD70528_INT_BATTSD_HOT_RES_MASK 0x20 +#define BD70528_INT_BATTSD_HOT_DET_MASK 0x40 +#define BD70528_INT_CHG_TSD_MASK 0x80 + +#define BD70528_INT_BAT_RMV_MASK 0x1 +#define BD70528_INT_BAT_DET_MASK 0x2 +#define BD70528_INT_DCIN2_OV_RES_MASK 0x4 +#define BD70528_INT_DCIN2_OV_DET_MASK 0x8 +#define BD70528_INT_DCIN2_RMV_MASK 0x10 +#define BD70528_INT_DCIN2_DET_MASK 0x20 +#define BD70528_INT_DCIN1_RMV_MASK 0x40 +#define BD70528_INT_DCIN1_DET_MASK 0x80 + +#define BD70528_INT_RTC_ALARM_MASK 0x1 +#define BD70528_INT_ELPS_TIM_MASK 0x2 + +#define BD70528_INT_GPIO0_MASK 0x1 +#define BD70528_INT_GPIO1_MASK 0x2 +#define BD70528_INT_GPIO2_MASK 0x4 +#define BD70528_INT_GPIO3_MASK 0x8 + +#define BD70528_INT_BUCK1_DVS_OPFAIL_MASK 0x1 +#define BD70528_INT_BUCK2_DVS_OPFAIL_MASK 0x2 +#define BD70528_INT_BUCK3_DVS_OPFAIL_MASK 0x4 +#define BD70528_INT_LED1_VOLT_OPFAIL_MASK 0x10 +#define BD70528_INT_LED2_VOLT_OPFAIL_MASK 0x20 + +#define BD70528_DEBOUNCE_MASK 0x3 + +#define BD70528_DEBOUNCE_DISABLE 0 +#define BD70528_DEBOUNCE_15MS 1 +#define BD70528_DEBOUNCE_30MS 2 +#define BD70528_DEBOUNCE_50MS 3 + +#define BD70528_GPIO_DRIVE_MASK 0x2 +#define BD70528_GPIO_PUSH_PULL 0x0 +#define BD70528_GPIO_OPEN_DRAIN 0x2 + +#define BD70528_GPIO_OUT_EN_MASK 0x80 +#define BD70528_GPIO_OUT_ENABLE 0x80 +#define BD70528_GPIO_OUT_DISABLE 0x0 + +#define BD70528_GPIO_OUT_HI 0x1 +#define BD70528_GPIO_OUT_LO 0x0 +#define BD70528_GPIO_OUT_MASK 0x1 + +#define BD70528_GPIO_IN_STATE_BASE 1 + +#define BD70528_CLK_OUT_EN_MASK 0x1 + +/* RTC masks to mask out reserved bits */ + +#define BD70528_MASK_RTC_SEC 0x7f +#define BD70528_MASK_RTC_MINUTE 0x7f +#define BD70528_MASK_RTC_HOUR_24H 0x80 +#define BD70528_MASK_RTC_HOUR_PM 0x20 +#define BD70528_MASK_RTC_HOUR 0x1f +#define BD70528_MASK_RTC_DAY 0x3f +#define BD70528_MASK_RTC_WEEK 0x07 +#define BD70528_MASK_RTC_MONTH 0x1f +#define BD70528_MASK_RTC_YEAR 0xff +#define BD70528_MASK_RTC_COUNT_L 0x7f + +#define 
BD70528_MASK_ELAPSED_TIMER_EN 0x1 +/* Mask second, min and hour fields + * HW would support ALM irq for over 24h + * (by setting day, month and year too) + * but as we wish to keep this same as for + * wake-up we limit ALM to 24H and only + * unmask sec, min and hour + */ +#define BD70528_MASK_ALM_EN 0x7 +#define BD70528_MASK_WAKE_EN 0x1 + +/* WDT masks */ +#define BD70528_MASK_WDT_EN 0x1 +#define BD70528_MASK_WDT_HOUR 0x1 +#define BD70528_MASK_WDT_MINUTE 0x7f +#define BD70528_MASK_WDT_SEC 0x7f + +#define BD70528_WDT_STATE_BIT 0x1 +#define BD70528_ELAPSED_STATE_BIT 0x2 +#define BD70528_WAKE_STATE_BIT 0x4 + +/* Charger masks */ +#define BD70528_MASK_CHG_STAT 0x7f +#define BD70528_MASK_CHG_BAT_TIMER 0x20 +#define BD70528_MASK_CHG_BAT_OVERVOLT 0x10 +#define BD70528_MASK_CHG_BAT_DETECT 0x1 +#define BD70528_MASK_CHG_DCIN1_UVLO 0x1 +#define BD70528_MASK_CHG_DCIN_ILIM 0x3f +#define BD70528_MASK_CHG_CHG_CURR 0x1f +#define BD70528_MASK_CHG_TRICKLE_CURR 0x10 + +/* + * Note, external battery register is the lonely rider at + * address 0xc5. See how to stuff that in the regmap + */ +#define BD70528_MAX_REGISTER 0x94 + +/* Buck control masks */ +#define BD70528_MASK_RUN_EN 0x4 +#define BD70528_MASK_STBY_EN 0x2 +#define BD70528_MASK_IDLE_EN 0x1 +#define BD70528_MASK_LED1_EN 0x1 +#define BD70528_MASK_LED2_EN 0x10 + +#define BD70528_MASK_BUCK_VOLT 0xf +#define BD70528_MASK_LDO_VOLT 0x1f +#define BD70528_MASK_LED1_VOLT 0x1 +#define BD70528_MASK_LED2_VOLT 0x10 + +/* Misc irq masks */ +#define BD70528_INT_MASK_SHORT_PUSH 1 +#define BD70528_INT_MASK_AUTO_WAKE 2 +#define BD70528_INT_MASK_POWER_STATE 4 + +#define BD70528_MASK_BUCK_RAMP 0x10 +#define BD70528_SIFT_BUCK_RAMP 4 + +#if IS_ENABLED(CONFIG_BD70528_WATCHDOG) + +int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, int *old_state); +void bd70528_wdt_lock(struct rohm_regmap_dev *data); +void bd70528_wdt_unlock(struct rohm_regmap_dev *data); + +#else /* CONFIG_BD70528_WATCHDOG */ + +static inline int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, + int *old_state) +{ + return 0; +} + +static inline void bd70528_wdt_lock(struct rohm_regmap_dev *data) +{ +} + +static inline void bd70528_wdt_unlock(struct rohm_regmap_dev *data) +{ +} + +#endif /* CONFIG_BD70528_WATCHDOG */ + +#endif /* __LINUX_MFD_BD70528_H__ */ -- cgit v1.2.3-59-g8ed1b From 6bbe6f5732faeabb4bb583726ec2d7f9739532bd Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 18 Jun 2019 18:05:28 -0300 Subject: docs: thermal: convert to ReST Rename the thermal documentation files to ReST, add an index for them and adjust in order to produce a nice html output via the Sphinx build system. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. 
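Before moving on to the documentation series, the rohm_regmap_dev layout that
both ROHM patches above depend on is worth one illustration. The sketch below
is editorial, not part of any patch: bd70528_child_probe() is a hypothetical
sub-device probe, and it assumes only what the code above establishes, namely
that bd70528_i2c_probe() stores &bd70528->chip as the parent drvdata and that
'chip' is the first member of struct bd70528_data::

    /* Hedged sketch of a BD70528 sub-device probe (hypothetical). */
    #include <linux/mfd/rohm-bd70528.h>
    #include <linux/mutex.h>
    #include <linux/platform_device.h>
    #include <linux/regmap.h>

    static int bd70528_child_probe(struct platform_device *pdev)
    {
            /* bd70528_i2c_probe() stored &bd70528->chip as parent drvdata */
            struct rohm_regmap_dev *chip = dev_get_drvdata(pdev->dev.parent);
            struct bd70528_data *bd70528;

            if (!chip || chip->chip_type != ROHM_CHIP_TYPE_BD70528)
                    return -ENODEV;

            /*
             * A chip-agnostic child (the shared bd718xx-clk cell, say)
             * stops here and uses only chip->regmap. A chip-specific
             * child may recover the full private data; this container_of()
             * is valid only because 'chip' is the first member of
             * struct bd70528_data.
             */
            bd70528 = container_of(chip, struct bd70528_data, chip);

            mutex_lock(&bd70528->rtc_timer_lock);
            /* ... RTC/WDT coordination would go here ... */
            mutex_unlock(&bd70528->rtc_timer_lock);

            return 0;
    }

This is exactly why the comment in rohm-bd718x7.h insists the generic struct
stay first: casting the parent's private data to struct rohm_regmap_dev must
remain valid for children that know nothing about the concrete chip.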
Signed-off-by: Mauro Carvalho Chehab Acked-by: Zhang Rui Signed-off-by: Zhang Rui --- Documentation/thermal/cpu-cooling-api.rst | 107 +++ Documentation/thermal/cpu-cooling-api.txt | 92 --- Documentation/thermal/exynos_thermal | 77 -- Documentation/thermal/exynos_thermal.rst | 90 +++ Documentation/thermal/exynos_thermal_emulation | 53 -- Documentation/thermal/exynos_thermal_emulation.rst | 61 ++ Documentation/thermal/index.rst | 18 + Documentation/thermal/intel_powerclamp.rst | 320 +++++++++ Documentation/thermal/intel_powerclamp.txt | 317 -------- Documentation/thermal/nouveau_thermal | 82 --- Documentation/thermal/nouveau_thermal.rst | 96 +++ Documentation/thermal/power_allocator.rst | 271 +++++++ Documentation/thermal/power_allocator.txt | 247 ------- Documentation/thermal/sysfs-api.rst | 798 +++++++++++++++++++++ Documentation/thermal/sysfs-api.txt | 636 ---------------- Documentation/thermal/x86_pkg_temperature_thermal | 47 -- .../thermal/x86_pkg_temperature_thermal.rst | 55 ++ MAINTAINERS | 2 +- include/linux/thermal.h | 4 +- 19 files changed, 1819 insertions(+), 1554 deletions(-) create mode 100644 Documentation/thermal/cpu-cooling-api.rst delete mode 100644 Documentation/thermal/cpu-cooling-api.txt delete mode 100644 Documentation/thermal/exynos_thermal create mode 100644 Documentation/thermal/exynos_thermal.rst delete mode 100644 Documentation/thermal/exynos_thermal_emulation create mode 100644 Documentation/thermal/exynos_thermal_emulation.rst create mode 100644 Documentation/thermal/index.rst create mode 100644 Documentation/thermal/intel_powerclamp.rst delete mode 100644 Documentation/thermal/intel_powerclamp.txt delete mode 100644 Documentation/thermal/nouveau_thermal create mode 100644 Documentation/thermal/nouveau_thermal.rst create mode 100644 Documentation/thermal/power_allocator.rst delete mode 100644 Documentation/thermal/power_allocator.txt create mode 100644 Documentation/thermal/sysfs-api.rst delete mode 100644 Documentation/thermal/sysfs-api.txt delete mode 100644 Documentation/thermal/x86_pkg_temperature_thermal create mode 100644 Documentation/thermal/x86_pkg_temperature_thermal.rst (limited to 'include/linux') diff --git a/Documentation/thermal/cpu-cooling-api.rst b/Documentation/thermal/cpu-cooling-api.rst new file mode 100644 index 000000000000..645d914c45a6 --- /dev/null +++ b/Documentation/thermal/cpu-cooling-api.rst @@ -0,0 +1,107 @@ +======================= +CPU cooling APIs How To +======================= + +Written by Amit Daniel Kachhap + +Updated: 6 Jan 2015 + +Copyright (c) 2012 Samsung Electronics Co., Ltd(http://www.samsung.com) + +0. Introduction +=============== + +The generic cpu cooling(freq clipping) provides registration/unregistration APIs +to the caller. The binding of the cooling devices to the trip point is left for +the user. The registration APIs returns the cooling device pointer. + +1. cpu cooling APIs +=================== + +1.1 cpufreq registration/unregistration APIs +-------------------------------------------- + + :: + + struct thermal_cooling_device + *cpufreq_cooling_register(struct cpumask *clip_cpus) + + This interface function registers the cpufreq cooling device with the name + "thermal-cpufreq-%x". This api can support multiple instances of cpufreq + cooling devices. + + clip_cpus: + cpumask of cpus where the frequency constraints will happen. 
+ + :: + + struct thermal_cooling_device + *of_cpufreq_cooling_register(struct cpufreq_policy *policy) + + This interface function registers the cpufreq cooling device with + the name "thermal-cpufreq-%x" linking it with a device tree node, in + order to bind it via the thermal DT code. This api can support multiple + instances of cpufreq cooling devices. + + policy: + CPUFreq policy. + + + :: + + void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) + + This interface function unregisters the "thermal-cpufreq-%x" cooling device. + + cdev: Cooling device pointer which has to be unregistered. + +2. Power models +=============== + +The power API registration functions provide a simple power model for +CPUs. The current power is calculated as dynamic power (static power isn't +supported currently). This power model requires that the operating-points of +the CPUs are registered using the kernel's opp library and the +`cpufreq_frequency_table` is assigned to the `struct device` of the +cpu. If you are using CONFIG_CPUFREQ_DT then the +`cpufreq_frequency_table` should already be assigned to the cpu +device. + +The dynamic power consumption of a processor depends on many factors. +For a given processor implementation the primary factors are: + +- The time the processor spends running, consuming dynamic power, as + compared to the time in idle states where dynamic consumption is + negligible. Herein we refer to this as 'utilisation'. +- The voltage and frequency levels as a result of DVFS. The DVFS + level is a dominant factor governing power consumption. +- In running time the 'execution' behaviour (instruction types, memory + access patterns and so forth) causes, in most cases, a second order + variation. In pathological cases this variation can be significant, + but typically it is of a much lesser impact than the factors above. + +A high level dynamic power consumption model may then be represented as:: + + Pdyn = f(run) * Voltage^2 * Frequency * Utilisation + +f(run) here represents the described execution behaviour and its +result has a units of Watts/Hz/Volt^2 (this often expressed in +mW/MHz/uVolt^2) + +The detailed behaviour for f(run) could be modelled on-line. However, +in practice, such an on-line model has dependencies on a number of +implementation specific processor support and characterisation +factors. Therefore, in initial implementation that contribution is +represented as a constant coefficient. This is a simplification +consistent with the relative contribution to overall power variation. + +In this simplified representation our model becomes:: + + Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation + +Where `capacitance` is a constant that represents an indicative +running time dynamic power coefficient in fundamental units of +mW/MHz/uVolt^2. Typical values for mobile CPUs might lie in range +from 100 to 500. For reference, the approximate values for the SoC in +ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and +140 for the Cortex-A53 cluster. diff --git a/Documentation/thermal/cpu-cooling-api.txt b/Documentation/thermal/cpu-cooling-api.txt deleted file mode 100644 index 7df567eaea1a..000000000000 --- a/Documentation/thermal/cpu-cooling-api.txt +++ /dev/null @@ -1,92 +0,0 @@ -CPU cooling APIs How To -=================================== - -Written by Amit Daniel Kachhap - -Updated: 6 Jan 2015 - -Copyright (c) 2012 Samsung Electronics Co., Ltd(http://www.samsung.com) - -0. 
Introduction - -The generic cpu cooling(freq clipping) provides registration/unregistration APIs -to the caller. The binding of the cooling devices to the trip point is left for -the user. The registration APIs returns the cooling device pointer. - -1. cpu cooling APIs - -1.1 cpufreq registration/unregistration APIs -1.1.1 struct thermal_cooling_device *cpufreq_cooling_register( - struct cpumask *clip_cpus) - - This interface function registers the cpufreq cooling device with the name - "thermal-cpufreq-%x". This api can support multiple instances of cpufreq - cooling devices. - - clip_cpus: cpumask of cpus where the frequency constraints will happen. - -1.1.2 struct thermal_cooling_device *of_cpufreq_cooling_register( - struct cpufreq_policy *policy) - - This interface function registers the cpufreq cooling device with - the name "thermal-cpufreq-%x" linking it with a device tree node, in - order to bind it via the thermal DT code. This api can support multiple - instances of cpufreq cooling devices. - - policy: CPUFreq policy. - -1.1.3 void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) - - This interface function unregisters the "thermal-cpufreq-%x" cooling device. - - cdev: Cooling device pointer which has to be unregistered. - -2. Power models - -The power API registration functions provide a simple power model for -CPUs. The current power is calculated as dynamic power (static power isn't -supported currently). This power model requires that the operating-points of -the CPUs are registered using the kernel's opp library and the -`cpufreq_frequency_table` is assigned to the `struct device` of the -cpu. If you are using CONFIG_CPUFREQ_DT then the -`cpufreq_frequency_table` should already be assigned to the cpu -device. - -The dynamic power consumption of a processor depends on many factors. -For a given processor implementation the primary factors are: - -- The time the processor spends running, consuming dynamic power, as - compared to the time in idle states where dynamic consumption is - negligible. Herein we refer to this as 'utilisation'. -- The voltage and frequency levels as a result of DVFS. The DVFS - level is a dominant factor governing power consumption. -- In running time the 'execution' behaviour (instruction types, memory - access patterns and so forth) causes, in most cases, a second order - variation. In pathological cases this variation can be significant, - but typically it is of a much lesser impact than the factors above. - -A high level dynamic power consumption model may then be represented as: - -Pdyn = f(run) * Voltage^2 * Frequency * Utilisation - -f(run) here represents the described execution behaviour and its -result has a units of Watts/Hz/Volt^2 (this often expressed in -mW/MHz/uVolt^2) - -The detailed behaviour for f(run) could be modelled on-line. However, -in practice, such an on-line model has dependencies on a number of -implementation specific processor support and characterisation -factors. Therefore, in initial implementation that contribution is -represented as a constant coefficient. This is a simplification -consistent with the relative contribution to overall power variation. - -In this simplified representation our model becomes: - -Pdyn = Capacitance * Voltage^2 * Frequency * Utilisation - -Where `capacitance` is a constant that represents an indicative -running time dynamic power coefficient in fundamental units of -mW/MHz/uVolt^2. Typical values for mobile CPUs might lie in range -from 100 to 500. 
For reference, the approximate values for the SoC in -ARM's Juno Development Platform are 530 for the Cortex-A57 cluster and -140 for the Cortex-A53 cluster. diff --git a/Documentation/thermal/exynos_thermal b/Documentation/thermal/exynos_thermal deleted file mode 100644 index 9010c4416967..000000000000 --- a/Documentation/thermal/exynos_thermal +++ /dev/null @@ -1,77 +0,0 @@ -Kernel driver exynos_tmu -================= - -Supported chips: -* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC - Datasheet: Not publicly available - -Authors: Donggeun Kim -Authors: Amit Daniel - -TMU controller Description: ---------------------------- - -This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC. - -The chip only exposes the measured 8-bit temperature code value -through a register. -Temperature can be taken from the temperature code. -There are three equations converting from temperature to temperature code. - -The three equations are: - 1. Two point trimming - Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1 - - 2. One point trimming - Tc = T + TI1 - 25 - - 3. No trimming - Tc = T + 50 - - Tc: Temperature code, T: Temperature, - TI1: Trimming info for 25 degree Celsius (stored at TRIMINFO register) - Temperature code measured at 25 degree Celsius which is unchanged - TI2: Trimming info for 85 degree Celsius (stored at TRIMINFO register) - Temperature code measured at 85 degree Celsius which is unchanged - -TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt -when temperature exceeds pre-defined levels. -The maximum number of configurable threshold is five. -The threshold levels are defined as follows: - Level_0: current temperature > trigger_level_0 + threshold - Level_1: current temperature > trigger_level_1 + threshold - Level_2: current temperature > trigger_level_2 + threshold - Level_3: current temperature > trigger_level_3 + threshold - - The threshold and each trigger_level are set - through the corresponding registers. - -When an interrupt occurs, this driver notify kernel thermal framework -with the function exynos_report_trigger. -Although an interrupt condition for level_0 can be set, -it can be used to synchronize the cooling action. - -TMU driver description: ------------------------ - -The exynos thermal driver is structured as, - - Kernel Core thermal framework - (thermal_core.c, step_wise.c, cpu_cooling.c) - ^ - | - | -TMU configuration data -------> TMU Driver <------> Exynos Core thermal wrapper -(exynos_tmu_data.c) (exynos_tmu.c) (exynos_thermal_common.c) -(exynos_tmu_data.h) (exynos_tmu.h) (exynos_thermal_common.h) - -a) TMU configuration data: This consist of TMU register offsets/bitfields - described through structure exynos_tmu_registers. Also several - other platform data (struct exynos_tmu_platform_data) members - are used to configure the TMU. -b) TMU driver: This component initialises the TMU controller and sets different - thresholds. It invokes core thermal implementation with the call - exynos_report_trigger. -c) Exynos Core thermal wrapper: This provides 3 wrapper function to use the - Kernel core thermal framework. They are exynos_unregister_thermal, - exynos_register_thermal and exynos_report_trigger. 
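An aside on the dynamic power model carried by both versions of
cpu-cooling-api above: the final formula is simple enough to sanity-check in
user space. This is a hedged sketch; the frequency, voltage and utilisation
figures are invented, the 530/140 coefficients are the Juno values quoted
above, and the unit scaling (coefficient x MHz x mV^2 / 10^9 yielding mW)
follows the convention used by the kernel's cpufreq cooling code rather than
anything the document itself fixes::

    /* Hedged sanity check of Pdyn = Capacitance * V^2 * f * utilisation. */
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t pdyn_mw(uint32_t coeff, uint32_t freq_mhz,
                            uint32_t volt_mv, double utilisation)
    {
            uint64_t p = (uint64_t)coeff * freq_mhz * volt_mv * volt_mv;

            /* coeff * MHz * mV^2 / 1e9 yields milliwatts */
            return (uint64_t)(utilisation * (double)p / 1e9);
    }

    int main(void)
    {
            /* Cortex-A57 cluster on Juno: coefficient 530 (from above) */
            printf("A57, 1100 MHz, 900 mV, 100%% busy: ~%llu mW\n",
                   (unsigned long long)pdyn_mw(530, 1100, 900, 1.0));
            /* Cortex-A53 cluster: coefficient 140 */
            printf("A53,  850 MHz, 800 mV,  50%% busy: ~%llu mW\n",
                   (unsigned long long)pdyn_mw(140, 850, 800, 0.5));
            return 0;
    }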
diff --git a/Documentation/thermal/exynos_thermal.rst b/Documentation/thermal/exynos_thermal.rst new file mode 100644 index 000000000000..5bd556566c70 --- /dev/null +++ b/Documentation/thermal/exynos_thermal.rst @@ -0,0 +1,90 @@ +======================== +Kernel driver exynos_tmu +======================== + +Supported chips: + +* ARM SAMSUNG EXYNOS4, EXYNOS5 series of SoC + + Datasheet: Not publicly available + +Authors: Donggeun Kim +Authors: Amit Daniel + +TMU controller Description: +--------------------------- + +This driver allows to read temperature inside SAMSUNG EXYNOS4/5 series of SoC. + +The chip only exposes the measured 8-bit temperature code value +through a register. +Temperature can be taken from the temperature code. +There are three equations converting from temperature to temperature code. + +The three equations are: + 1. Two point trimming:: + + Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1 + + 2. One point trimming:: + + Tc = T + TI1 - 25 + + 3. No trimming:: + + Tc = T + 50 + + Tc: + Temperature code, T: Temperature, + TI1: + Trimming info for 25 degree Celsius (stored at TRIMINFO register) + Temperature code measured at 25 degree Celsius which is unchanged + TI2: + Trimming info for 85 degree Celsius (stored at TRIMINFO register) + Temperature code measured at 85 degree Celsius which is unchanged + +TMU(Thermal Management Unit) in EXYNOS4/5 generates interrupt +when temperature exceeds pre-defined levels. +The maximum number of configurable threshold is five. +The threshold levels are defined as follows:: + + Level_0: current temperature > trigger_level_0 + threshold + Level_1: current temperature > trigger_level_1 + threshold + Level_2: current temperature > trigger_level_2 + threshold + Level_3: current temperature > trigger_level_3 + threshold + +The threshold and each trigger_level are set +through the corresponding registers. + +When an interrupt occurs, this driver notify kernel thermal framework +with the function exynos_report_trigger. +Although an interrupt condition for level_0 can be set, +it can be used to synchronize the cooling action. + +TMU driver description: +----------------------- + +The exynos thermal driver is structured as:: + + Kernel Core thermal framework + (thermal_core.c, step_wise.c, cpu_cooling.c) + ^ + | + | + TMU configuration data -----> TMU Driver <----> Exynos Core thermal wrapper + (exynos_tmu_data.c) (exynos_tmu.c) (exynos_thermal_common.c) + (exynos_tmu_data.h) (exynos_tmu.h) (exynos_thermal_common.h) + +a) TMU configuration data: + This consist of TMU register offsets/bitfields + described through structure exynos_tmu_registers. Also several + other platform data (struct exynos_tmu_platform_data) members + are used to configure the TMU. +b) TMU driver: + This component initialises the TMU controller and sets different + thresholds. It invokes core thermal implementation with the call + exynos_report_trigger. +c) Exynos Core thermal wrapper: + This provides 3 wrapper function to use the + Kernel core thermal framework. They are exynos_unregister_thermal, + exynos_register_thermal and exynos_report_trigger. 
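The trimming equations quoted in exynos_thermal.rst above convert between
temperature and the 8-bit temperature code in both directions. A small
stand-alone sketch follows; the TI1/TI2 trim values are invented for
illustration (real ones are fused into the TRIMINFO register)::

    /* Sketch of the TMU trimming equations; TI1/TI2 values are assumed. */
    #include <stdio.h>

    /* Two point trimming: Tc = (T - 25) * (TI2 - TI1) / (85 - 25) + TI1 */
    static int temp_to_code_two_point(int temp, int ti1, int ti2)
    {
            return (temp - 25) * (ti2 - ti1) / (85 - 25) + ti1;
    }

    /* ... and the inverse, used when reading the sensor back */
    static int code_to_temp_two_point(int code, int ti1, int ti2)
    {
            return (code - ti1) * (85 - 25) / (ti2 - ti1) + 25;
    }

    /* One point trimming: Tc = T + TI1 - 25 */
    static int temp_to_code_one_point(int temp, int ti1)
    {
            return temp + ti1 - 25;
    }

    int main(void)
    {
            int ti1 = 40, ti2 = 105; /* assumed trim codes at 25C / 85C */

            printf("60C -> code %d (two point)\n",
                   temp_to_code_two_point(60, ti1, ti2));
            printf("code 78 -> %dC (two point)\n",
                   code_to_temp_two_point(78, ti1, ti2));
            printf("60C -> code %d (one point)\n",
                   temp_to_code_one_point(60, ti1));
            return 0;
    }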
diff --git a/Documentation/thermal/exynos_thermal_emulation b/Documentation/thermal/exynos_thermal_emulation deleted file mode 100644 index b15efec6ca28..000000000000 --- a/Documentation/thermal/exynos_thermal_emulation +++ /dev/null @@ -1,53 +0,0 @@ -EXYNOS EMULATION MODE -======================== - -Copyright (C) 2012 Samsung Electronics - -Written by Jonghwa Lee - -Description ------------ - -Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal management unit. -Thermal emulation mode supports software debug for TMU's operation. User can set temperature -manually with software code and TMU will read current temperature from user value not from -sensor's value. - -Enabling CONFIG_THERMAL_EMULATION option will make this support available. -When it's enabled, sysfs node will be created as -/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp. - -The sysfs node, 'emul_node', will contain value 0 for the initial state. When you input any -temperature you want to update to sysfs node, it automatically enable emulation mode and -current temperature will be changed into it. -(Exynos also supports user changeable delay time which would be used to delay of - changing temperature. However, this node only uses same delay of real sensing time, 938us.) - -Exynos emulation mode requires synchronous of value changing and enabling. It means when you -want to update the any value of delay or next temperature, then you have to enable emulation -mode at the same time. (Or you have to keep the mode enabling.) If you don't, it fails to -change the value to updated one and just use last succeessful value repeatedly. That's why -this node gives users the right to change termerpature only. Just one interface makes it more -simply to use. - -Disabling emulation mode only requires writing value 0 to sysfs node. - - -TEMP 120 | - | - 100 | - | - 80 | - | +----------- - 60 | | | - | +-------------| | - 40 | | | | - | | | | - 20 | | | +---------- - | | | | | - 0 |______________|_____________|__________|__________|_________ - A A A A TIME - |<----->| |<----->| |<----->| | - | 938us | | | | | | -emulation : 0 50 | 70 | 20 | 0 -current temp : sensor 50 70 20 sensor diff --git a/Documentation/thermal/exynos_thermal_emulation.rst b/Documentation/thermal/exynos_thermal_emulation.rst new file mode 100644 index 000000000000..c21d10838bc5 --- /dev/null +++ b/Documentation/thermal/exynos_thermal_emulation.rst @@ -0,0 +1,61 @@ +===================== +Exynos Emulation Mode +===================== + +Copyright (C) 2012 Samsung Electronics + +Written by Jonghwa Lee + +Description +----------- + +Exynos 4x12 (4212, 4412) and 5 series provide emulation mode for thermal +management unit. Thermal emulation mode supports software debug for +TMU's operation. User can set temperature manually with software code +and TMU will read current temperature from user value not from sensor's +value. + +Enabling CONFIG_THERMAL_EMULATION option will make this support +available. When it's enabled, sysfs node will be created as +/sys/devices/virtual/thermal/thermal_zone'zone id'/emul_temp. + +The sysfs node, 'emul_node', will contain value 0 for the initial state. +When you input any temperature you want to update to sysfs node, it +automatically enable emulation mode and current temperature will be +changed into it. + +(Exynos also supports user changeable delay time which would be used to +delay of changing temperature. However, this node only uses same delay +of real sensing time, 938us.) 
+
+Exynos emulation mode requires that value changes and mode enabling be
+synchronized. This means that when you want to update the delay or the
+next temperature, you have to enable emulation mode at the same time
+(or you have to keep the mode enabled). If you don't, the value fails
+to change to the updated one and the last successful value is used
+repeatedly. That's why this node only gives users the right to change
+the temperature; a single interface keeps it simple to use.
+
+Disabling emulation mode only requires writing value 0 to the sysfs node.
+
+::
+
+
+     TEMP 120 |
+              |
+          100 |
+              |
+           80 |
+              |                       +-----------
+           60 |                       |          |
+              |        +--------------|          |
+           40 |        |              |          |
+              |        |              |          |
+           20 |        |              |          +----------
+              |        |              |          |         |
+            0 |________|______________|__________|_________|_________
+                       A              A          A         A    TIME
+              |<----->|       |<----->|   |<----->|        |
+              | 938us |       |       |   |       |        |
+    emulation :   0       50  |  70       |  20   |    0
+    current temp: sensor    50          70      20       sensor
diff --git a/Documentation/thermal/index.rst b/Documentation/thermal/index.rst
new file mode 100644
index 000000000000..8c1c00146cad
--- /dev/null
+++ b/Documentation/thermal/index.rst
@@ -0,0 +1,18 @@
+:orphan:
+
+=======
+Thermal
+=======
+
+.. toctree::
+   :maxdepth: 1
+
+   cpu-cooling-api
+   sysfs-api
+   power_allocator
+
+   exynos_thermal
+   exynos_thermal_emulation
+   intel_powerclamp
+   nouveau_thermal
+   x86_pkg_temperature_thermal
diff --git a/Documentation/thermal/intel_powerclamp.rst b/Documentation/thermal/intel_powerclamp.rst
new file mode 100644
index 000000000000..3f6dfb0b3ea6
--- /dev/null
+++ b/Documentation/thermal/intel_powerclamp.rst
@@ -0,0 +1,320 @@
+=======================
+Intel Powerclamp Driver
+=======================
+
+By:
+  - Arjan van de Ven
+  - Jacob Pan
+
+.. Contents:
+
+   (*) Introduction
+       - Goals and Objectives
+
+   (*) Theory of Operation
+       - Idle Injection
+       - Calibration
+
+   (*) Performance Analysis
+       - Effectiveness and Limitations
+       - Power vs Performance
+       - Scalability
+       - Calibration
+       - Comparison with Alternative Techniques
+
+   (*) Usage and Interfaces
+       - Generic Thermal Layer (sysfs)
+       - Kernel APIs (TBD)
+
+INTRODUCTION
+============
+
+Consider the situation where a system’s power consumption must be
+reduced at runtime, due to power budget, thermal constraint, or noise
+level, and where active cooling is not preferred. Software-managed
+passive power reduction must be performed to prevent the hardware
+actions that are designed for catastrophic scenarios.
+
+Currently, P-states, T-states (clock modulation), and CPU offlining
+are used for CPU throttling.
+
+On Intel CPUs, C-states provide effective power reduction, but so far
+they’re only used opportunistically, based on workload. With the
+development of the intel_powerclamp driver, a method of synchronizing
+idle injection across all online CPU threads was introduced. The goal
+is to achieve forced and controllable C-state residency.
+
+Testing and analysis have been done in the areas of power, performance,
+scalability, and user experience. In many cases, clear advantage is
+shown over taking the CPU offline or modulating the CPU clock.
+
+
+THEORY OF OPERATION
+===================
+
+Idle Injection
+--------------
+
+On modern Intel processors (Nehalem or later), package level C-state
+residency is available in MSRs, thus also available to the kernel.
+ +These MSRs are:: + + #define MSR_PKG_C2_RESIDENCY 0x60D + #define MSR_PKG_C3_RESIDENCY 0x3F8 + #define MSR_PKG_C6_RESIDENCY 0x3F9 + #define MSR_PKG_C7_RESIDENCY 0x3FA + +If the kernel can also inject idle time to the system, then a +closed-loop control system can be established that manages package +level C-state. The intel_powerclamp driver is conceived as such a +control system, where the target set point is a user-selected idle +ratio (based on power reduction), and the error is the difference +between the actual package level C-state residency ratio and the target idle +ratio. + +Injection is controlled by high priority kernel threads, spawned for +each online CPU. + +These kernel threads, with SCHED_FIFO class, are created to perform +clamping actions of controlled duty ratio and duration. Each per-CPU +thread synchronizes its idle time and duration, based on the rounding +of jiffies, so accumulated errors can be prevented to avoid a jittery +effect. Threads are also bound to the CPU such that they cannot be +migrated, unless the CPU is taken offline. In this case, threads +belong to the offlined CPUs will be terminated immediately. + +Running as SCHED_FIFO and relatively high priority, also allows such +scheme to work for both preemptable and non-preemptable kernels. +Alignment of idle time around jiffies ensures scalability for HZ +values. This effect can be better visualized using a Perf timechart. +The following diagram shows the behavior of kernel thread +kidle_inject/cpu. During idle injection, it runs monitor/mwait idle +for a given "duration", then relinquishes the CPU to other tasks, +until the next time interval. + +The NOHZ schedule tick is disabled during idle time, but interrupts +are not masked. Tests show that the extra wakeups from scheduler tick +have a dramatic impact on the effectiveness of the powerclamp driver +on large scale systems (Westmere system with 80 processors). + +:: + + CPU0 + ____________ ____________ + kidle_inject/0 | sleep | mwait | sleep | + _________| |________| |_______ + duration + CPU1 + ____________ ____________ + kidle_inject/1 | sleep | mwait | sleep | + _________| |________| |_______ + ^ + | + | + roundup(jiffies, interval) + +Only one CPU is allowed to collect statistics and update global +control parameters. This CPU is referred to as the controlling CPU in +this document. The controlling CPU is elected at runtime, with a +policy that favors BSP, taking into account the possibility of a CPU +hot-plug. + +In terms of dynamics of the idle control system, package level idle +time is considered largely as a non-causal system where its behavior +cannot be based on the past or current input. Therefore, the +intel_powerclamp driver attempts to enforce the desired idle time +instantly as given input (target idle ratio). After injection, +powerclamp monitors the actual idle for a given time window and adjust +the next injection accordingly to avoid over/under correction. + +When used in a causal control system, such as a temperature control, +it is up to the user of this driver to implement algorithms where +past samples and outputs are included in the feedback. For example, a +PID-based thermal controller can use the powerclamp driver to +maintain a desired target temperature, based on integral and +derivative gains of the past samples. + + + +Calibration +----------- +During scalability testing, it is observed that synchronized actions +among CPUs become challenging as the number of cores grows. 
This is +also true for the ability of a system to enter package level C-states. + +To make sure the intel_powerclamp driver scales well, online +calibration is implemented. The goals for doing such a calibration +are: + +a) determine the effective range of idle injection ratio +b) determine the amount of compensation needed at each target ratio + +Compensation to each target ratio consists of two parts: + + a) steady state error compensation + This is to offset the error occurring when the system can + enter idle without extra wakeups (such as external interrupts). + + b) dynamic error compensation + When an excessive amount of wakeups occurs during idle, an + additional idle ratio can be added to quiet interrupts, by + slowing down CPU activities. + +A debugfs file is provided for the user to examine compensation +progress and results, such as on a Westmere system:: + + [jacob@nex01 ~]$ cat + /sys/kernel/debug/intel_powerclamp/powerclamp_calib + controlling cpu: 0 + pct confidence steady dynamic (compensation) + 0 0 0 0 + 1 1 0 0 + 2 1 1 0 + 3 3 1 0 + 4 3 1 0 + 5 3 1 0 + 6 3 1 0 + 7 3 1 0 + 8 3 1 0 + ... + 30 3 2 0 + 31 3 2 0 + 32 3 1 0 + 33 3 2 0 + 34 3 1 0 + 35 3 2 0 + 36 3 1 0 + 37 3 2 0 + 38 3 1 0 + 39 3 2 0 + 40 3 3 0 + 41 3 1 0 + 42 3 2 0 + 43 3 1 0 + 44 3 1 0 + 45 3 2 0 + 46 3 3 0 + 47 3 0 0 + 48 3 2 0 + 49 3 3 0 + +Calibration occurs during runtime. No offline method is available. +Steady state compensation is used only when confidence levels of all +adjacent ratios have reached satisfactory level. A confidence level +is accumulated based on clean data collected at runtime. Data +collected during a period without extra interrupts is considered +clean. + +To compensate for excessive amounts of wakeup during idle, additional +idle time is injected when such a condition is detected. Currently, +we have a simple algorithm to double the injection ratio. A possible +enhancement might be to throttle the offending IRQ, such as delaying +EOI for level triggered interrupts. But it is a challenge to be +non-intrusive to the scheduler or the IRQ core code. + + +CPU Online/Offline +------------------ +Per-CPU kernel threads are started/stopped upon receiving +notifications of CPU hotplug activities. The intel_powerclamp driver +keeps track of clamping kernel threads, even after they are migrated +to other CPUs, after a CPU offline event. + + +Performance Analysis +==================== +This section describes the general performance data collected on +multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P). + +Effectiveness and Limitations +----------------------------- +The maximum range that idle injection is allowed is capped at 50 +percent. As mentioned earlier, since interrupts are allowed during +forced idle time, excessive interrupts could result in less +effectiveness. The extreme case would be doing a ping -f to generated +flooded network interrupts without much CPU acknowledgement. In this +case, little can be done from the idle injection threads. In most +normal cases, such as scp a large file, applications can be throttled +by the powerclamp driver, since slowing down the CPU also slows down +network protocol processing, which in turn reduces interrupts. + +When control parameters change at runtime by the controlling CPU, it +may take an additional period for the rest of the CPUs to catch up +with the changes. During this time, idle injection is out of sync, +thus not able to enter package C- states at the expected ratio. 
But +this effect is minor, in that in most cases change to the target +ratio is updated much less frequently than the idle injection +frequency. + +Scalability +----------- +Tests also show a minor, but measurable, difference between the 4P/8P +Ivy Bridge system and the 80P Westmere server under 50% idle ratio. +More compensation is needed on Westmere for the same amount of +target idle ratio. The compensation also increases as the idle ratio +gets larger. The above reason constitutes the need for the +calibration code. + +On the IVB 8P system, compared to an offline CPU, powerclamp can +achieve up to 40% better performance per watt. (measured by a spin +counter summed over per CPU counting threads spawned for all running +CPUs). + +Usage and Interfaces +==================== +The powerclamp driver is registered to the generic thermal layer as a +cooling device. Currently, it’s not bound to any thermal zones:: + + jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . * + cur_state:0 + max_state:50 + type:intel_powerclamp + +cur_state allows user to set the desired idle percentage. Writing 0 to +cur_state will stop idle injection. Writing a value between 1 and +max_state will start the idle injection. Reading cur_state returns the +actual and current idle percentage. This may not be the same value +set by the user in that current idle percentage depends on workload +and includes natural idle. When idle injection is disabled, reading +cur_state returns value -1 instead of 0 which is to avoid confusing +100% busy state with the disabled state. + +Example usage: +- To inject 25% idle time:: + + $ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state + +If the system is not busy and has more than 25% idle time already, +then the powerclamp driver will not start idle injection. Using Top +will not show idle injection kernel threads. + +If the system is busy (spin test below) and has less than 25% natural +idle time, powerclamp kernel threads will do idle injection. Forced +idle time is accounted as normal idle in that common code path is +taken as the idle task. + +In this example, 24.1% idle is shown. This helps the system admin or +user determine the cause of slowdown, when a powerclamp driver is in action:: + + + Tasks: 197 total, 1 running, 196 sleeping, 0 stopped, 0 zombie + Cpu(s): 71.2%us, 4.7%sy, 0.0%ni, 24.1%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st + Mem: 3943228k total, 1689632k used, 2253596k free, 74960k buffers + Swap: 4087804k total, 0k used, 4087804k free, 945336k cached + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 3352 jacob 20 0 262m 644 428 S 286 0.0 0:17.16 spin + 3341 root -51 0 0 0 0 D 25 0.0 0:01.62 kidle_inject/0 + 3344 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/3 + 3342 root -51 0 0 0 0 D 25 0.0 0:01.61 kidle_inject/1 + 3343 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/2 + 2935 jacob 20 0 696m 125m 35m S 5 3.3 0:31.11 firefox + 1546 root 20 0 158m 20m 6640 S 3 0.5 0:26.97 Xorg + 2100 jacob 20 0 1223m 88m 30m S 3 2.3 0:23.68 compiz + +Tests have shown that by using the powerclamp driver as a cooling +device, a PID based userspace thermal controller can manage to +control CPU temperature effectively, when no other thermal influence +is added. For example, a UltraBook user can compile the kernel under +certain temperature (below most active trip points). 
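The closed-loop control described in intel_powerclamp.rst above feeds on the
package C-state residency MSRs it lists. A hedged user-space sketch of the
same measurement follows; it assumes the msr module is loaded (providing
/dev/cpu/0/msr, root only) and that the residency counters tick at the TSC
rate, as they do on the Nehalem-and-later parts the document targets::

    /* Hedged sketch: package C6 residency ratio over a one second window. */
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    static int rdmsr(int fd, uint32_t reg, uint64_t *val)
    {
            /* the msr device maps the register index to the file offset */
            return pread(fd, val, sizeof(*val), reg) == sizeof(*val) ? 0 : -1;
    }

    int main(void)
    {
            const uint32_t MSR_TSC = 0x10;
            const uint32_t MSR_PKG_C6_RESIDENCY = 0x3F9; /* from the list above */
            uint64_t c6a, c6b, tsca, tscb;
            int fd = open("/dev/cpu/0/msr", O_RDONLY);

            if (fd < 0)
                    return 1;
            if (rdmsr(fd, MSR_PKG_C6_RESIDENCY, &c6a) || rdmsr(fd, MSR_TSC, &tsca))
                    return 1;
            sleep(1);
            if (rdmsr(fd, MSR_PKG_C6_RESIDENCY, &c6b) || rdmsr(fd, MSR_TSC, &tscb))
                    return 1;
            close(fd);

            printf("pkg C6 residency: %.1f%%\n",
                   100.0 * (double)(c6b - c6a) / (double)(tscb - tsca));
            return 0;
    }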
diff --git a/Documentation/thermal/intel_powerclamp.txt b/Documentation/thermal/intel_powerclamp.txt deleted file mode 100644 index b5df21168fbc..000000000000 --- a/Documentation/thermal/intel_powerclamp.txt +++ /dev/null @@ -1,317 +0,0 @@ - ======================= - INTEL POWERCLAMP DRIVER - ======================= -By: Arjan van de Ven - Jacob Pan - -Contents: - (*) Introduction - - Goals and Objectives - - (*) Theory of Operation - - Idle Injection - - Calibration - - (*) Performance Analysis - - Effectiveness and Limitations - - Power vs Performance - - Scalability - - Calibration - - Comparison with Alternative Techniques - - (*) Usage and Interfaces - - Generic Thermal Layer (sysfs) - - Kernel APIs (TBD) - -============ -INTRODUCTION -============ - -Consider the situation where a system’s power consumption must be -reduced at runtime, due to power budget, thermal constraint, or noise -level, and where active cooling is not preferred. Software managed -passive power reduction must be performed to prevent the hardware -actions that are designed for catastrophic scenarios. - -Currently, P-states, T-states (clock modulation), and CPU offlining -are used for CPU throttling. - -On Intel CPUs, C-states provide effective power reduction, but so far -they’re only used opportunistically, based on workload. With the -development of intel_powerclamp driver, the method of synchronizing -idle injection across all online CPU threads was introduced. The goal -is to achieve forced and controllable C-state residency. - -Test/Analysis has been made in the areas of power, performance, -scalability, and user experience. In many cases, clear advantage is -shown over taking the CPU offline or modulating the CPU clock. - - -=================== -THEORY OF OPERATION -=================== - -Idle Injection --------------- - -On modern Intel processors (Nehalem or later), package level C-state -residency is available in MSRs, thus also available to the kernel. - -These MSRs are: - #define MSR_PKG_C2_RESIDENCY 0x60D - #define MSR_PKG_C3_RESIDENCY 0x3F8 - #define MSR_PKG_C6_RESIDENCY 0x3F9 - #define MSR_PKG_C7_RESIDENCY 0x3FA - -If the kernel can also inject idle time to the system, then a -closed-loop control system can be established that manages package -level C-state. The intel_powerclamp driver is conceived as such a -control system, where the target set point is a user-selected idle -ratio (based on power reduction), and the error is the difference -between the actual package level C-state residency ratio and the target idle -ratio. - -Injection is controlled by high priority kernel threads, spawned for -each online CPU. - -These kernel threads, with SCHED_FIFO class, are created to perform -clamping actions of controlled duty ratio and duration. Each per-CPU -thread synchronizes its idle time and duration, based on the rounding -of jiffies, so accumulated errors can be prevented to avoid a jittery -effect. Threads are also bound to the CPU such that they cannot be -migrated, unless the CPU is taken offline. In this case, threads -belong to the offlined CPUs will be terminated immediately. - -Running as SCHED_FIFO and relatively high priority, also allows such -scheme to work for both preemptable and non-preemptable kernels. -Alignment of idle time around jiffies ensures scalability for HZ -values. This effect can be better visualized using a Perf timechart. -The following diagram shows the behavior of kernel thread -kidle_inject/cpu. 
During idle injection, it runs monitor/mwait idle -for a given "duration", then relinquishes the CPU to other tasks, -until the next time interval. - -The NOHZ schedule tick is disabled during idle time, but interrupts -are not masked. Tests show that the extra wakeups from scheduler tick -have a dramatic impact on the effectiveness of the powerclamp driver -on large scale systems (Westmere system with 80 processors). - -CPU0 - ____________ ____________ -kidle_inject/0 | sleep | mwait | sleep | - _________| |________| |_______ - duration -CPU1 - ____________ ____________ -kidle_inject/1 | sleep | mwait | sleep | - _________| |________| |_______ - ^ - | - | - roundup(jiffies, interval) - -Only one CPU is allowed to collect statistics and update global -control parameters. This CPU is referred to as the controlling CPU in -this document. The controlling CPU is elected at runtime, with a -policy that favors BSP, taking into account the possibility of a CPU -hot-plug. - -In terms of dynamics of the idle control system, package level idle -time is considered largely as a non-causal system where its behavior -cannot be based on the past or current input. Therefore, the -intel_powerclamp driver attempts to enforce the desired idle time -instantly as given input (target idle ratio). After injection, -powerclamp monitors the actual idle for a given time window and adjust -the next injection accordingly to avoid over/under correction. - -When used in a causal control system, such as a temperature control, -it is up to the user of this driver to implement algorithms where -past samples and outputs are included in the feedback. For example, a -PID-based thermal controller can use the powerclamp driver to -maintain a desired target temperature, based on integral and -derivative gains of the past samples. - - - -Calibration ------------ -During scalability testing, it is observed that synchronized actions -among CPUs become challenging as the number of cores grows. This is -also true for the ability of a system to enter package level C-states. - -To make sure the intel_powerclamp driver scales well, online -calibration is implemented. The goals for doing such a calibration -are: - -a) determine the effective range of idle injection ratio -b) determine the amount of compensation needed at each target ratio - -Compensation to each target ratio consists of two parts: - - a) steady state error compensation - This is to offset the error occurring when the system can - enter idle without extra wakeups (such as external interrupts). - - b) dynamic error compensation - When an excessive amount of wakeups occurs during idle, an - additional idle ratio can be added to quiet interrupts, by - slowing down CPU activities. - -A debugfs file is provided for the user to examine compensation -progress and results, such as on a Westmere system. -[jacob@nex01 ~]$ cat -/sys/kernel/debug/intel_powerclamp/powerclamp_calib -controlling cpu: 0 -pct confidence steady dynamic (compensation) -0 0 0 0 -1 1 0 0 -2 1 1 0 -3 3 1 0 -4 3 1 0 -5 3 1 0 -6 3 1 0 -7 3 1 0 -8 3 1 0 -... -30 3 2 0 -31 3 2 0 -32 3 1 0 -33 3 2 0 -34 3 1 0 -35 3 2 0 -36 3 1 0 -37 3 2 0 -38 3 1 0 -39 3 2 0 -40 3 3 0 -41 3 1 0 -42 3 2 0 -43 3 1 0 -44 3 1 0 -45 3 2 0 -46 3 3 0 -47 3 0 0 -48 3 2 0 -49 3 3 0 - -Calibration occurs during runtime. No offline method is available. -Steady state compensation is used only when confidence levels of all -adjacent ratios have reached satisfactory level. 
A confidence level -is accumulated based on clean data collected at runtime. Data -collected during a period without extra interrupts is considered -clean. - -To compensate for excessive amounts of wakeup during idle, additional -idle time is injected when such a condition is detected. Currently, -we have a simple algorithm to double the injection ratio. A possible -enhancement might be to throttle the offending IRQ, such as delaying -EOI for level triggered interrupts. But it is a challenge to be -non-intrusive to the scheduler or the IRQ core code. - - -CPU Online/Offline ------------------- -Per-CPU kernel threads are started/stopped upon receiving -notifications of CPU hotplug activities. The intel_powerclamp driver -keeps track of clamping kernel threads, even after they are migrated -to other CPUs, after a CPU offline event. - - -===================== -Performance Analysis -===================== -This section describes the general performance data collected on -multiple systems, including Westmere (80P) and Ivy Bridge (4P, 8P). - -Effectiveness and Limitations ------------------------------ -The maximum range that idle injection is allowed is capped at 50 -percent. As mentioned earlier, since interrupts are allowed during -forced idle time, excessive interrupts could result in less -effectiveness. The extreme case would be doing a ping -f to generated -flooded network interrupts without much CPU acknowledgement. In this -case, little can be done from the idle injection threads. In most -normal cases, such as scp a large file, applications can be throttled -by the powerclamp driver, since slowing down the CPU also slows down -network protocol processing, which in turn reduces interrupts. - -When control parameters change at runtime by the controlling CPU, it -may take an additional period for the rest of the CPUs to catch up -with the changes. During this time, idle injection is out of sync, -thus not able to enter package C- states at the expected ratio. But -this effect is minor, in that in most cases change to the target -ratio is updated much less frequently than the idle injection -frequency. - -Scalability ------------ -Tests also show a minor, but measurable, difference between the 4P/8P -Ivy Bridge system and the 80P Westmere server under 50% idle ratio. -More compensation is needed on Westmere for the same amount of -target idle ratio. The compensation also increases as the idle ratio -gets larger. The above reason constitutes the need for the -calibration code. - -On the IVB 8P system, compared to an offline CPU, powerclamp can -achieve up to 40% better performance per watt. (measured by a spin -counter summed over per CPU counting threads spawned for all running -CPUs). - -==================== -Usage and Interfaces -==================== -The powerclamp driver is registered to the generic thermal layer as a -cooling device. Currently, it’s not bound to any thermal zones. - -jacob@chromoly:/sys/class/thermal/cooling_device14$ grep . * -cur_state:0 -max_state:50 -type:intel_powerclamp - -cur_state allows user to set the desired idle percentage. Writing 0 to -cur_state will stop idle injection. Writing a value between 1 and -max_state will start the idle injection. Reading cur_state returns the -actual and current idle percentage. This may not be the same value -set by the user in that current idle percentage depends on workload -and includes natural idle. 
When idle injection is disabled, reading -cur_state returns value -1 instead of 0 which is to avoid confusing -100% busy state with the disabled state. - -Example usage: -- To inject 25% idle time -$ sudo sh -c "echo 25 > /sys/class/thermal/cooling_device80/cur_state -" - -If the system is not busy and has more than 25% idle time already, -then the powerclamp driver will not start idle injection. Using Top -will not show idle injection kernel threads. - -If the system is busy (spin test below) and has less than 25% natural -idle time, powerclamp kernel threads will do idle injection. Forced -idle time is accounted as normal idle in that common code path is -taken as the idle task. - -In this example, 24.1% idle is shown. This helps the system admin or -user determine the cause of slowdown, when a powerclamp driver is in action. - - -Tasks: 197 total, 1 running, 196 sleeping, 0 stopped, 0 zombie -Cpu(s): 71.2%us, 4.7%sy, 0.0%ni, 24.1%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st -Mem: 3943228k total, 1689632k used, 2253596k free, 74960k buffers -Swap: 4087804k total, 0k used, 4087804k free, 945336k cached - - PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND - 3352 jacob 20 0 262m 644 428 S 286 0.0 0:17.16 spin - 3341 root -51 0 0 0 0 D 25 0.0 0:01.62 kidle_inject/0 - 3344 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/3 - 3342 root -51 0 0 0 0 D 25 0.0 0:01.61 kidle_inject/1 - 3343 root -51 0 0 0 0 D 25 0.0 0:01.60 kidle_inject/2 - 2935 jacob 20 0 696m 125m 35m S 5 3.3 0:31.11 firefox - 1546 root 20 0 158m 20m 6640 S 3 0.5 0:26.97 Xorg - 2100 jacob 20 0 1223m 88m 30m S 3 2.3 0:23.68 compiz - -Tests have shown that by using the powerclamp driver as a cooling -device, a PID based userspace thermal controller can manage to -control CPU temperature effectively, when no other thermal influence -is added. For example, a UltraBook user can compile the kernel under -certain temperature (below most active trip points). diff --git a/Documentation/thermal/nouveau_thermal b/Documentation/thermal/nouveau_thermal deleted file mode 100644 index 6e17a11efcb0..000000000000 --- a/Documentation/thermal/nouveau_thermal +++ /dev/null @@ -1,82 +0,0 @@ -Kernel driver nouveau -=================== - -Supported chips: -* NV43+ - -Authors: Martin Peres (mupuf) - -Description ---------- - -This driver allows to read the GPU core temperature, drive the GPU fan and -set temperature alarms. - -Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau -cannot access any of the i2c external monitoring chips it may find. If you -have one of those, temperature and/or fan management through Nouveau's HWMON -interface is likely not to work. This document may then not cover your situation -entirely. - -Temperature management --------------------- - -Temperature is exposed under as a read-only HWMON attribute temp1_input. - -In order to protect the GPU from overheating, Nouveau supports 4 configurable -temperature thresholds: - - * Fan_boost: Fan speed is set to 100% when reaching this temperature; - * Downclock: The GPU will be downclocked to reduce its power dissipation; - * Critical: The GPU is put on hold to further lower power dissipation; - * Shutdown: Shut the computer down to protect your GPU. - -WARNING: Some of these thresholds may not be used by Nouveau depending -on your chipset. - -The default value for these thresholds comes from the GPU's vbios. 
These -thresholds can be configured thanks to the following HWMON attributes: - - * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst; - * Downclock: temp1_max and temp1_max_hyst; - * Critical: temp1_crit and temp1_crit_hyst; - * Shutdown: temp1_emergency and temp1_emergency_hyst. - -NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget -to multiply! - -Fan management ------------- - -Not all cards have a drivable fan. If you do, then the following HWMON -attributes should be available: - - * pwm1_enable: Current fan management mode (NONE, MANUAL or AUTO); - * pwm1: Current PWM value (power percentage); - * pwm1_min: The minimum PWM speed allowed; - * pwm1_max: The maximum PWM speed allowed (bypassed when hitting Fan_boost); - -You may also have the following attribute: - - * fan1_input: Speed in RPM of your fan. - -Your fan can be driven in different modes: - - * 0: The fan is left untouched; - * 1: The fan can be driven in manual (use pwm1 to change the speed); - * 2; The fan is driven automatically depending on the temperature. - -NOTE: Be sure to use the manual mode if you want to drive the fan speed manually - -NOTE2: When operating in manual mode outside the vbios-defined -[PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate -depending on your hardware. - -Bug reports ---------- - -Thermal management on Nouveau is new and may not work on all cards. If you have -inquiries, please ping mupuf on IRC (#nouveau, freenode). - -Bug reports should be filled on Freedesktop's bug tracker. Please follow -http://nouveau.freedesktop.org/wiki/Bugs diff --git a/Documentation/thermal/nouveau_thermal.rst b/Documentation/thermal/nouveau_thermal.rst new file mode 100644 index 000000000000..37255fd6735d --- /dev/null +++ b/Documentation/thermal/nouveau_thermal.rst @@ -0,0 +1,96 @@ +===================== +Kernel driver nouveau +===================== + +Supported chips: + +* NV43+ + +Authors: Martin Peres (mupuf) + +Description +----------- + +This driver allows to read the GPU core temperature, drive the GPU fan and +set temperature alarms. + +Currently, due to the absence of in-kernel API to access HWMON drivers, Nouveau +cannot access any of the i2c external monitoring chips it may find. If you +have one of those, temperature and/or fan management through Nouveau's HWMON +interface is likely not to work. This document may then not cover your situation +entirely. + +Temperature management +---------------------- + +Temperature is exposed under as a read-only HWMON attribute temp1_input. + +In order to protect the GPU from overheating, Nouveau supports 4 configurable +temperature thresholds: + + * Fan_boost: + Fan speed is set to 100% when reaching this temperature; + * Downclock: + The GPU will be downclocked to reduce its power dissipation; + * Critical: + The GPU is put on hold to further lower power dissipation; + * Shutdown: + Shut the computer down to protect your GPU. + +WARNING: + Some of these thresholds may not be used by Nouveau depending + on your chipset. + +The default value for these thresholds comes from the GPU's vbios. These +thresholds can be configured thanks to the following HWMON attributes: + + * Fan_boost: temp1_auto_point1_temp and temp1_auto_point1_temp_hyst; + * Downclock: temp1_max and temp1_max_hyst; + * Critical: temp1_crit and temp1_crit_hyst; + * Shutdown: temp1_emergency and temp1_emergency_hyst. + +NOTE: Remember that the values are stored as milli degrees Celsius. Don't forget +to multiply! 
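The NOTE above about millidegree scaling is easy to get wrong when
scripting against these attributes. As a small illustration (the hwmon
device number is an assumption; locate the nouveau entry under
/sys/class/hwmon on your system), a C snippet that sets the Critical
threshold to 95 °C might look like::

  /*
   * Sketch: set the nouveau "Critical" threshold to 95 C, remembering
   * the millidegree scaling called out in the NOTE above. The hwmon
   * device number is an assumption.
   */
  #include <stdio.h>

  int main(void)
  {
          const int crit_celsius = 95;
          FILE *f = fopen("/sys/class/hwmon/hwmon1/temp1_crit", "w"); /* assumed node */

          if (!f) {
                  perror("temp1_crit");
                  return 1;
          }
          /* HWMON attributes are in millidegrees Celsius: multiply by 1000. */
          fprintf(f, "%d\n", crit_celsius * 1000);
          fclose(f);
          return 0;
  }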
+ +Fan management +-------------- + +Not all cards have a drivable fan. If you do, then the following HWMON +attributes should be available: + + * pwm1_enable: + Current fan management mode (NONE, MANUAL or AUTO); + * pwm1: + Current PWM value (power percentage); + * pwm1_min: + The minimum PWM speed allowed; + * pwm1_max: + The maximum PWM speed allowed (bypassed when hitting Fan_boost); + +You may also have the following attribute: + + * fan1_input: + Speed in RPM of your fan. + +Your fan can be driven in different modes: + + * 0: The fan is left untouched; + * 1: The fan can be driven in manual (use pwm1 to change the speed); + * 2; The fan is driven automatically depending on the temperature. + +NOTE: + Be sure to use the manual mode if you want to drive the fan speed manually + +NOTE2: + When operating in manual mode outside the vbios-defined + [PWM_min, PWM_max] range, the reported fan speed (RPM) may not be accurate + depending on your hardware. + +Bug reports +----------- + +Thermal management on Nouveau is new and may not work on all cards. If you have +inquiries, please ping mupuf on IRC (#nouveau, freenode). + +Bug reports should be filled on Freedesktop's bug tracker. Please follow +http://nouveau.freedesktop.org/wiki/Bugs diff --git a/Documentation/thermal/power_allocator.rst b/Documentation/thermal/power_allocator.rst new file mode 100644 index 000000000000..67b6a3297238 --- /dev/null +++ b/Documentation/thermal/power_allocator.rst @@ -0,0 +1,271 @@ +================================= +Power allocator governor tunables +================================= + +Trip points +----------- + +The governor works optimally with the following two passive trip points: + +1. "switch on" trip point: temperature above which the governor + control loop starts operating. This is the first passive trip + point of the thermal zone. + +2. "desired temperature" trip point: it should be higher than the + "switch on" trip point. This the target temperature the governor + is controlling for. This is the last passive trip point of the + thermal zone. + +PID Controller +-------------- + +The power allocator governor implements a +Proportional-Integral-Derivative controller (PID controller) with +temperature as the control input and power as the controlled output: + + P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power + +where + - e = desired_temperature - current_temperature + - err_integral is the sum of previous errors + - diff_err = e - previous_error + +It is similar to the one depicted below:: + + k_d + | + current_temp | + | v + | +----------+ +---+ + | +----->| diff_err |-->| X |------+ + | | +----------+ +---+ | + | | | tdp actor + | | k_i | | get_requested_power() + | | | | | | | + | | | | | | | ... + v | v v v v v + +---+ | +-------+ +---+ +---+ +---+ +----------+ + | S |-----+----->| sum e |----->| X |--->| S |-->| S |-->|power | + +---+ | +-------+ +---+ +---+ +---+ |allocation| + ^ | ^ +----------+ + | | | | | + | | +---+ | | | + | +------->| X |-------------------+ v v + | +---+ granted performance + desired_temperature ^ + | + | + k_po/k_pu + +Sustainable power +----------------- + +An estimate of the sustainable dissipatable power (in mW) should be +provided while registering the thermal zone. This estimates the +sustained power that can be dissipated at the desired control +temperature. This is the maximum sustained power for allocation at +the desired maximum temperature. The actual sustained power can vary +for a number of reasons. 
The closed loop controller will take care of +variations such as environmental conditions, and some factors related +to the speed-grade of the silicon. `sustainable_power` is therefore +simply an estimate, and may be tuned to affect the aggressiveness of +the thermal ramp. For reference, the sustainable power of a 4" phone +is typically 2000mW, while on a 10" tablet is around 4500mW (may vary +depending on screen size). + +If you are using device tree, do add it as a property of the +thermal-zone. For example:: + + thermal-zones { + soc_thermal { + polling-delay = <1000>; + polling-delay-passive = <100>; + sustainable-power = <2500>; + ... + +Instead, if the thermal zone is registered from the platform code, pass a +`thermal_zone_params` that has a `sustainable_power`. If no +`thermal_zone_params` were being passed, then something like below +will suffice:: + + static const struct thermal_zone_params tz_params = { + .sustainable_power = 3500, + }; + +and then pass `tz_params` as the 5th parameter to +`thermal_zone_device_register()` + +k_po and k_pu +------------- + +The implementation of the PID controller in the power allocator +thermal governor allows the configuration of two proportional term +constants: `k_po` and `k_pu`. `k_po` is the proportional term +constant during temperature overshoot periods (current temperature is +above "desired temperature" trip point). Conversely, `k_pu` is the +proportional term constant during temperature undershoot periods +(current temperature below "desired temperature" trip point). + +These controls are intended as the primary mechanism for configuring +the permitted thermal "ramp" of the system. For instance, a lower +`k_pu` value will provide a slower ramp, at the cost of capping +available capacity at a low temperature. On the other hand, a high +value of `k_pu` will result in the governor granting very high power +while temperature is low, and may lead to temperature overshooting. + +The default value for `k_pu` is:: + + 2 * sustainable_power / (desired_temperature - switch_on_temp) + +This means that at `switch_on_temp` the output of the controller's +proportional term will be 2 * `sustainable_power`. The default value +for `k_po` is:: + + sustainable_power / (desired_temperature - switch_on_temp) + +Focusing on the proportional and feed forward values of the PID +controller equation we have:: + + P_max = k_p * e + sustainable_power + +The proportional term is proportional to the difference between the +desired temperature and the current one. When the current temperature +is the desired one, then the proportional component is zero and +`P_max` = `sustainable_power`. That is, the system should operate in +thermal equilibrium under constant load. `sustainable_power` is only +an estimate, which is the reason for closed-loop control such as this. + +Expanding `k_pu` we get:: + + P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) + + sustainable_power + +where: + + - T_set is the desired temperature + - T is the current temperature + - T_on is the switch on temperature + +When the current temperature is the switch_on temperature, the above +formula becomes:: + + P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) + + sustainable_power = 2 * sustainable_power + sustainable_power = + 3 * sustainable_power + +Therefore, the proportional term alone linearly decreases power from +3 * `sustainable_power` to `sustainable_power` as the temperature +rises from the switch on temperature to the desired temperature. 
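To make the arithmetic above concrete, here is a small standalone
sketch that evaluates the default `k_pu`/`k_po` and the
proportional-plus-feed-forward output for an assumed zone; the
sustainable power and trip temperatures are invented for
illustration::

  /*
   * Worked sketch of the default k_pu/k_po values and the proportional
   * term derived above. All numbers are illustrative assumptions.
   */
  #include <stdio.h>

  int main(void)
  {
          const int sustainable_power = 2500;  /* mW, as in the DT example */
          const int t_set = 75000;             /* desired temperature, millidegrees */
          const int t_on = 60000;              /* switch on temperature, millidegrees */

          /* Defaults from the text above, with temperatures in degrees. */
          int k_pu = 2 * sustainable_power / ((t_set - t_on) / 1000);
          int k_po = sustainable_power / ((t_set - t_on) / 1000);

          /* Proportional + feed-forward output for a current temperature. */
          int t_cur = 60000;                   /* at the switch-on point */
          int e = (t_set - t_cur) / 1000;      /* error in degrees */
          int k_p = e > 0 ? k_pu : k_po;       /* undershoot vs. overshoot */
          int p_max = k_p * e + sustainable_power;

          /* At T == T_on this prints roughly 3 * sustainable_power,
           * matching the expansion above (integer rounding aside). */
          printf("k_pu=%d k_po=%d P_max=%d mW\n", k_pu, k_po, p_max);
          return 0;
  }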
+
+k_i and integral_cutoff
+-----------------------
+
+`k_i` configures the PID loop's integral term constant. This term
+allows the PID controller to compensate for long term drift and for
+the quantized nature of the output control: cooling devices can't set
+the exact power that the governor requests. When the temperature
+error is below `integral_cutoff`, errors are accumulated in the
+integral term. This term is then multiplied by `k_i` and the result
+added to the output of the controller. Typically `k_i` is set low (1
+or 2) and `integral_cutoff` is 0.
+
+k_d
+---
+
+`k_d` configures the PID loop's derivative term constant. It's
+recommended to leave it as the default: 0.
+
+Cooling device power API
+========================
+
+Cooling devices controlled by this governor must supply the additional
+"power" API in their `cooling_device_ops`. It consists of three ops:
+
+1. ::
+
+    int get_requested_power(struct thermal_cooling_device *cdev,
+                            struct thermal_zone_device *tz, u32 *power);
+
+@cdev:
+    The `struct thermal_cooling_device` pointer
+@tz:
+    thermal zone in which we are currently operating
+@power:
+    pointer in which to store the calculated power
+
+`get_requested_power()` calculates the power requested by the device
+in milliwatts and stores it in @power. It should return 0 on
+success, -E* on failure. This is currently used by the power
+allocator governor to calculate how much power to give to each cooling
+device.
+
+2. ::
+
+    int state2power(struct thermal_cooling_device *cdev, struct
+                    thermal_zone_device *tz, unsigned long state,
+                    u32 *power);
+
+@cdev:
+    The `struct thermal_cooling_device` pointer
+@tz:
+    thermal zone in which we are currently operating
+@state:
+    A cooling device state
+@power:
+    pointer in which to store the equivalent power
+
+Convert cooling device state @state into power consumption in
+milliwatts and store it in @power. It should return 0 on success, -E*
+on failure. This is currently used by the thermal core to calculate the
+maximum power that an actor can consume.
+
+3. ::
+
+    int power2state(struct thermal_cooling_device *cdev, u32 power,
+                    unsigned long *state);
+
+@cdev:
+    The `struct thermal_cooling_device` pointer
+@power:
+    power in milliwatts
+@state:
+    pointer in which to store the resulting state
+
+Calculate a cooling device state that would make the device consume at
+most @power mW and store it in @state. It should return 0 on success,
+-E* on failure. This is currently used by the thermal core to convert
+a given power set by the power allocator governor to a state that the
+cooling device can set. It is a function because this conversion may
+depend on external factors that can change, so this function should
+return the best conversion given "current circumstances". (A sketch of
+these three ops follows the weights discussion below.)
+
+Cooling device weights
+----------------------
+
+Weights are a mechanism to bias the allocation among cooling
+devices. They express the relative power efficiency of different
+cooling devices. Higher weight can be used to express higher power
+efficiency. Weighting is relative: if every cooling device has a
+weight of one, they are all considered equal. This is particularly
+useful in heterogeneous systems where two cooling devices may perform
+the same kind of compute, but with different efficiency, for example
+a system with two different types of processors.
+
+If the thermal zone is registered using
+`thermal_zone_device_register()` (i.e., platform code), then weights
+are passed as part of the thermal zone's `thermal_bind_parameters`.
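As referenced above, here is a hedged sketch of the three power ops
for a made-up cooling device whose states map to a fixed power table;
the table values and the `demo_` names are assumptions, while the
callback signatures follow this documentation::

  /*
   * Sketch of the three "power" ops for a hypothetical cooling device
   * whose states map to a small power table (state 0 = no cooling).
   * The table values and the demo_ names are invented.
   */
  #include <linux/errno.h>
  #include <linux/kernel.h>
  #include <linux/thermal.h>

  static const u32 state_power_mw[] = { 4000, 3000, 2000, 1000 };
  #define NR_STATES ARRAY_SIZE(state_power_mw)

  static int demo_get_requested_power(struct thermal_cooling_device *cdev,
                                      struct thermal_zone_device *tz, u32 *power)
  {
          /* A real driver derives this from utilization/frequency. */
          *power = state_power_mw[0];
          return 0;
  }

  static int demo_state2power(struct thermal_cooling_device *cdev,
                              struct thermal_zone_device *tz,
                              unsigned long state, u32 *power)
  {
          if (state >= NR_STATES)
                  return -EINVAL;
          *power = state_power_mw[state];
          return 0;
  }

  static int demo_power2state(struct thermal_cooling_device *cdev,
                              u32 power, unsigned long *state)
  {
          unsigned long i;

          /* Pick the shallowest state that consumes at most @power mW. */
          for (i = 0; i < NR_STATES; i++) {
                  if (state_power_mw[i] <= power) {
                          *state = i;
                          return 0;
                  }
          }
          *state = NR_STATES - 1;  /* deepest state as a fallback */
          return 0;
  }

Picking the shallowest state that fits the granted budget is one
reasonable policy for `power2state()`; a real driver might instead
interpolate between frequency/voltage operating points.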
+If the platform is registered using device tree, then they are passed +as the `contribution` property of each map in the `cooling-maps` node. + +Limitations of the power allocator governor +=========================================== + +The power allocator governor's PID controller works best if there is a +periodic tick. If you have a driver that calls +`thermal_zone_device_update()` (or anything that ends up calling the +governor's `throttle()` function) repetitively, the governor response +won't be very good. Note that this is not particular to this +governor, step-wise will also misbehave if you call its throttle() +faster than the normal thermal framework tick (due to interrupts for +example) as it will overreact. diff --git a/Documentation/thermal/power_allocator.txt b/Documentation/thermal/power_allocator.txt deleted file mode 100644 index 9fb0ff06dca9..000000000000 --- a/Documentation/thermal/power_allocator.txt +++ /dev/null @@ -1,247 +0,0 @@ -Power allocator governor tunables -================================= - -Trip points ------------ - -The governor works optimally with the following two passive trip points: - -1. "switch on" trip point: temperature above which the governor - control loop starts operating. This is the first passive trip - point of the thermal zone. - -2. "desired temperature" trip point: it should be higher than the - "switch on" trip point. This the target temperature the governor - is controlling for. This is the last passive trip point of the - thermal zone. - -PID Controller --------------- - -The power allocator governor implements a -Proportional-Integral-Derivative controller (PID controller) with -temperature as the control input and power as the controlled output: - - P_max = k_p * e + k_i * err_integral + k_d * diff_err + sustainable_power - -where - e = desired_temperature - current_temperature - err_integral is the sum of previous errors - diff_err = e - previous_error - -It is similar to the one depicted below: - - k_d - | -current_temp | - | v - | +----------+ +---+ - | +----->| diff_err |-->| X |------+ - | | +----------+ +---+ | - | | | tdp actor - | | k_i | | get_requested_power() - | | | | | | | - | | | | | | | ... - v | v v v v v - +---+ | +-------+ +---+ +---+ +---+ +----------+ - | S |-------+----->| sum e |----->| X |--->| S |-->| S |-->|power | - +---+ | +-------+ +---+ +---+ +---+ |allocation| - ^ | ^ +----------+ - | | | | | - | | +---+ | | | - | +------->| X |-------------------+ v v - | +---+ granted performance -desired_temperature ^ - | - | - k_po/k_pu - -Sustainable power ------------------ - -An estimate of the sustainable dissipatable power (in mW) should be -provided while registering the thermal zone. This estimates the -sustained power that can be dissipated at the desired control -temperature. This is the maximum sustained power for allocation at -the desired maximum temperature. The actual sustained power can vary -for a number of reasons. The closed loop controller will take care of -variations such as environmental conditions, and some factors related -to the speed-grade of the silicon. `sustainable_power` is therefore -simply an estimate, and may be tuned to affect the aggressiveness of -the thermal ramp. For reference, the sustainable power of a 4" phone -is typically 2000mW, while on a 10" tablet is around 4500mW (may vary -depending on screen size). - -If you are using device tree, do add it as a property of the -thermal-zone. 
For example: - - thermal-zones { - soc_thermal { - polling-delay = <1000>; - polling-delay-passive = <100>; - sustainable-power = <2500>; - ... - -Instead, if the thermal zone is registered from the platform code, pass a -`thermal_zone_params` that has a `sustainable_power`. If no -`thermal_zone_params` were being passed, then something like below -will suffice: - - static const struct thermal_zone_params tz_params = { - .sustainable_power = 3500, - }; - -and then pass `tz_params` as the 5th parameter to -`thermal_zone_device_register()` - -k_po and k_pu -------------- - -The implementation of the PID controller in the power allocator -thermal governor allows the configuration of two proportional term -constants: `k_po` and `k_pu`. `k_po` is the proportional term -constant during temperature overshoot periods (current temperature is -above "desired temperature" trip point). Conversely, `k_pu` is the -proportional term constant during temperature undershoot periods -(current temperature below "desired temperature" trip point). - -These controls are intended as the primary mechanism for configuring -the permitted thermal "ramp" of the system. For instance, a lower -`k_pu` value will provide a slower ramp, at the cost of capping -available capacity at a low temperature. On the other hand, a high -value of `k_pu` will result in the governor granting very high power -while temperature is low, and may lead to temperature overshooting. - -The default value for `k_pu` is: - - 2 * sustainable_power / (desired_temperature - switch_on_temp) - -This means that at `switch_on_temp` the output of the controller's -proportional term will be 2 * `sustainable_power`. The default value -for `k_po` is: - - sustainable_power / (desired_temperature - switch_on_temp) - -Focusing on the proportional and feed forward values of the PID -controller equation we have: - - P_max = k_p * e + sustainable_power - -The proportional term is proportional to the difference between the -desired temperature and the current one. When the current temperature -is the desired one, then the proportional component is zero and -`P_max` = `sustainable_power`. That is, the system should operate in -thermal equilibrium under constant load. `sustainable_power` is only -an estimate, which is the reason for closed-loop control such as this. - -Expanding `k_pu` we get: - P_max = 2 * sustainable_power * (T_set - T) / (T_set - T_on) + - sustainable_power - -where - T_set is the desired temperature - T is the current temperature - T_on is the switch on temperature - -When the current temperature is the switch_on temperature, the above -formula becomes: - - P_max = 2 * sustainable_power * (T_set - T_on) / (T_set - T_on) + - sustainable_power = 2 * sustainable_power + sustainable_power = - 3 * sustainable_power - -Therefore, the proportional term alone linearly decreases power from -3 * `sustainable_power` to `sustainable_power` as the temperature -rises from the switch on temperature to the desired temperature. - -k_i and integral_cutoff ------------------------ - -`k_i` configures the PID loop's integral term constant. This term -allows the PID controller to compensate for long term drift and for -the quantized nature of the output control: cooling devices can't set -the exact power that the governor requests. When the temperature -error is below `integral_cutoff`, errors are accumulated in the -integral term. This term is then multiplied by `k_i` and the result -added to the output of the controller. 
Typically `k_i` is set low (1 -or 2) and `integral_cutoff` is 0. - -k_d ---- - -`k_d` configures the PID loop's derivative term constant. It's -recommended to leave it as the default: 0. - -Cooling device power API -======================== - -Cooling devices controlled by this governor must supply the additional -"power" API in their `cooling_device_ops`. It consists on three ops: - -1. int get_requested_power(struct thermal_cooling_device *cdev, - struct thermal_zone_device *tz, u32 *power); -@cdev: The `struct thermal_cooling_device` pointer -@tz: thermal zone in which we are currently operating -@power: pointer in which to store the calculated power - -`get_requested_power()` calculates the power requested by the device -in milliwatts and stores it in @power . It should return 0 on -success, -E* on failure. This is currently used by the power -allocator governor to calculate how much power to give to each cooling -device. - -2. int state2power(struct thermal_cooling_device *cdev, struct - thermal_zone_device *tz, unsigned long state, u32 *power); -@cdev: The `struct thermal_cooling_device` pointer -@tz: thermal zone in which we are currently operating -@state: A cooling device state -@power: pointer in which to store the equivalent power - -Convert cooling device state @state into power consumption in -milliwatts and store it in @power. It should return 0 on success, -E* -on failure. This is currently used by thermal core to calculate the -maximum power that an actor can consume. - -3. int power2state(struct thermal_cooling_device *cdev, u32 power, - unsigned long *state); -@cdev: The `struct thermal_cooling_device` pointer -@power: power in milliwatts -@state: pointer in which to store the resulting state - -Calculate a cooling device state that would make the device consume at -most @power mW and store it in @state. It should return 0 on success, --E* on failure. This is currently used by the thermal core to convert -a given power set by the power allocator governor to a state that the -cooling device can set. It is a function because this conversion may -depend on external factors that may change so this function should the -best conversion given "current circumstances". - -Cooling device weights ----------------------- - -Weights are a mechanism to bias the allocation among cooling -devices. They express the relative power efficiency of different -cooling devices. Higher weight can be used to express higher power -efficiency. Weighting is relative such that if each cooling device -has a weight of one they are considered equal. This is particularly -useful in heterogeneous systems where two cooling devices may perform -the same kind of compute, but with different efficiency. For example, -a system with two different types of processors. - -If the thermal zone is registered using -`thermal_zone_device_register()` (i.e., platform code), then weights -are passed as part of the thermal zone's `thermal_bind_parameters`. -If the platform is registered using device tree, then they are passed -as the `contribution` property of each map in the `cooling-maps` node. - -Limitations of the power allocator governor -=========================================== - -The power allocator governor's PID controller works best if there is a -periodic tick. If you have a driver that calls -`thermal_zone_device_update()` (or anything that ends up calling the -governor's `throttle()` function) repetitively, the governor response -won't be very good. 
Note that this is not particular to this -governor, step-wise will also misbehave if you call its throttle() -faster than the normal thermal framework tick (due to interrupts for -example) as it will overreact. diff --git a/Documentation/thermal/sysfs-api.rst b/Documentation/thermal/sysfs-api.rst new file mode 100644 index 000000000000..e4930761d3e5 --- /dev/null +++ b/Documentation/thermal/sysfs-api.rst @@ -0,0 +1,798 @@ +=================================== +Generic Thermal Sysfs driver How To +=================================== + +Written by Sujith Thomas , Zhang Rui + +Updated: 2 January 2008 + +Copyright (c) 2008 Intel Corporation + + +0. Introduction +=============== + +The generic thermal sysfs provides a set of interfaces for thermal zone +devices (sensors) and thermal cooling devices (fan, processor...) to register +with the thermal management solution and to be a part of it. + +This how-to focuses on enabling new thermal zone and cooling devices to +participate in thermal management. +This solution is platform independent and any type of thermal zone devices +and cooling devices should be able to make use of the infrastructure. + +The main task of the thermal sysfs driver is to expose thermal zone attributes +as well as cooling device attributes to the user space. +An intelligent thermal management application can make decisions based on +inputs from thermal zone attributes (the current temperature and trip point +temperature) and throttle appropriate devices. + +- `[0-*]` denotes any positive number starting from 0 +- `[1-*]` denotes any positive number starting from 1 + +1. thermal sysfs driver interface functions +=========================================== + +1.1 thermal zone device interface +--------------------------------- + + :: + + struct thermal_zone_device + *thermal_zone_device_register(char *type, + int trips, int mask, void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + int passive_delay, int polling_delay)) + + This interface function adds a new thermal zone device (sensor) to + /sys/class/thermal folder as `thermal_zone[0-*]`. It tries to bind all the + thermal cooling devices registered at the same time. + + type: + the thermal zone type. + trips: + the total number of trip points this thermal zone supports. + mask: + Bit string: If 'n'th bit is set, then trip point 'n' is writeable. + devdata: + device private data + ops: + thermal zone device call-backs. + + .bind: + bind the thermal zone device with a thermal cooling device. + .unbind: + unbind the thermal zone device with a thermal cooling device. + .get_temp: + get the current temperature of the thermal zone. + .set_trips: + set the trip points window. Whenever the current temperature + is updated, the trip points immediately below and above the + current temperature are found. + .get_mode: + get the current mode (enabled/disabled) of the thermal zone. + + - "enabled" means the kernel thermal management is + enabled. + - "disabled" will prevent kernel thermal driver action + upon trip points so that user applications can take + charge of thermal management. + .set_mode: + set the mode (enabled/disabled) of the thermal zone. + .get_trip_type: + get the type of certain trip point. + .get_trip_temp: + get the temperature above which the certain trip point + will be fired. + .set_emul_temp: + set the emulation temperature which helps in debugging + different threshold temperature points. + tzp: + thermal zone platform parameters. 
+
+    passive_delay:
+	number of milliseconds to wait between polls when
+	performing passive cooling.
+    polling_delay:
+	number of milliseconds to wait between polls when checking
+	whether trip points have been crossed (0 for interrupt driven systems).
+
+    ::
+
+	void thermal_zone_device_unregister(struct thermal_zone_device *tz)
+
+    This interface function removes the thermal zone device.
+    It deletes the corresponding entry from the /sys/class/thermal folder
+    and unbinds all the thermal cooling devices it uses.
+
+    ::
+
+	struct thermal_zone_device
+	*thermal_zone_of_sensor_register(struct device *dev, int sensor_id,
+			void *data,
+			const struct thermal_zone_of_device_ops *ops)
+
+    This interface adds a new sensor to a DT thermal zone.
+    This function will search the list of thermal zones described in
+    the device tree and look for the zone that refers to the sensor
+    device pointed to by dev->of_node as a temperature provider. For the
+    zone pointing to the sensor node, the sensor will be added to the DT
+    thermal zone device.
+
+    The parameters for this interface are:
+
+    dev:
+	Device node of sensor containing valid node pointer in
+	dev->of_node.
+    sensor_id:
+	a sensor identifier, in case the sensor IP has more
+	than one sensor
+    data:
+	a private pointer (owned by the caller) that will be
+	passed back when a temperature reading is needed.
+    ops:
+	`struct thermal_zone_of_device_ops *`.
+
+	============== =======================================
+	get_temp       a pointer to a function that reads the
+	               sensor temperature. This is a mandatory
+	               callback provided by the sensor driver.
+	set_trips      a pointer to a function that sets a
+	               temperature window. When this window is
+	               left, the driver must inform the thermal
+	               core via thermal_zone_device_update.
+	get_trend      a pointer to a function that reads the
+	               sensor temperature trend.
+	set_emul_temp  a pointer to a function that sets the
+	               sensor emulated temperature.
+	============== =======================================
+
+    The thermal zone temperature is provided by the get_temp() function
+    pointer of thermal_zone_of_device_ops. When called, it is passed
+    back the private pointer @data.
+
+    It returns an error pointer on failure, otherwise a valid thermal
+    zone device handle. The caller should check the returned handle
+    with IS_ERR() to determine success or failure.
+
+    ::
+
+	void thermal_zone_of_sensor_unregister(struct device *dev,
+					       struct thermal_zone_device *tzd)
+
+    This interface unregisters a sensor from a DT thermal zone which was
+    successfully added by the interface thermal_zone_of_sensor_register().
+    This function removes the sensor callbacks and private data from the
+    thermal zone device registered with the
+    thermal_zone_of_sensor_register() interface. It also silences the
+    zone by removing the .get_temp() and get_trend() thermal zone device
+    callbacks.
+
+    ::
+
+	struct thermal_zone_device
+	*devm_thermal_zone_of_sensor_register(struct device *dev,
+			int sensor_id,
+			void *data,
+			const struct thermal_zone_of_device_ops *ops)
+
+    This interface is the resource-managed version of
+    thermal_zone_of_sensor_register().
+
+    All details of thermal_zone_of_sensor_register() described in
+    section 1.1.3 are applicable here.
+
+    The benefit of using this interface to register a sensor is that it
+    is not required to explicitly call thermal_zone_of_sensor_unregister()
+    in the error path or during driver unbinding, as this is done by the
+    driver resource manager.
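Putting the resource-managed registration above into context, a sketch
of a sensor driver's probe path might look as follows; the `demo_`
driver, its private structure, and the constant temperature read are
invented, while the calls and the `thermal_zone_of_device_ops` layout
follow this section::

  /*
   * Sketch of a sensor driver probe using the resource-managed
   * registration described above. The demo_ names are invented.
   */
  #include <linux/module.h>
  #include <linux/platform_device.h>
  #include <linux/slab.h>
  #include <linux/thermal.h>

  struct demo_sensor {
          struct device *dev;
          /* ... MMIO base, clocks, etc. (hypothetical) ... */
  };

  static int demo_get_temp(void *data, int *temp)
  {
          struct demo_sensor *s = data;  /* the private pointer passed back */

          dev_dbg(s->dev, "temperature read\n");
          *temp = 42000;  /* a real driver reads hardware; millidegrees Celsius */
          return 0;
  }

  static const struct thermal_zone_of_device_ops demo_ops = {
          .get_temp = demo_get_temp,  /* mandatory callback */
  };

  static int demo_probe(struct platform_device *pdev)
  {
          struct demo_sensor *s;
          struct thermal_zone_device *tzd;

          s = devm_kzalloc(&pdev->dev, sizeof(*s), GFP_KERNEL);
          if (!s)
                  return -ENOMEM;
          s->dev = &pdev->dev;

          /* sensor_id 0: this hypothetical IP exposes a single sensor. */
          tzd = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, s,
                                                     &demo_ops);
          if (IS_ERR(tzd))
                  return PTR_ERR(tzd);  /* no explicit unregister needed */

          return 0;
  }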
+ + :: + + void devm_thermal_zone_of_sensor_unregister(struct device *dev, + struct thermal_zone_device *tzd) + + This interface is resource managed version of + thermal_zone_of_sensor_unregister(). + All details of thermal_zone_of_sensor_unregister() described in + section 1.1.4 is applicable here. + Normally this function will not need to be called and the resource + management code will ensure that the resource is freed. + + :: + + int thermal_zone_get_slope(struct thermal_zone_device *tz) + + This interface is used to read the slope attribute value + for the thermal zone device, which might be useful for platform + drivers for temperature calculations. + + :: + + int thermal_zone_get_offset(struct thermal_zone_device *tz) + + This interface is used to read the offset attribute value + for the thermal zone device, which might be useful for platform + drivers for temperature calculations. + +1.2 thermal cooling device interface +------------------------------------ + + + :: + + struct thermal_cooling_device + *thermal_cooling_device_register(char *name, + void *devdata, struct thermal_cooling_device_ops *) + + This interface function adds a new thermal cooling device (fan/processor/...) + to /sys/class/thermal/ folder as `cooling_device[0-*]`. It tries to bind itself + to all the thermal zone devices registered at the same time. + + name: + the cooling device name. + devdata: + device private data. + ops: + thermal cooling devices call-backs. + + .get_max_state: + get the Maximum throttle state of the cooling device. + .get_cur_state: + get the Currently requested throttle state of the + cooling device. + .set_cur_state: + set the Current throttle state of the cooling device. + + :: + + void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) + + This interface function removes the thermal cooling device. + It deletes the corresponding entry from /sys/class/thermal folder and + unbinds itself from all the thermal zone devices using it. + +1.3 interface for binding a thermal zone device with a thermal cooling device +----------------------------------------------------------------------------- + + :: + + int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, + int trip, struct thermal_cooling_device *cdev, + unsigned long upper, unsigned long lower, unsigned int weight); + + This interface function binds a thermal cooling device to a particular trip + point of a thermal zone device. + + This function is usually called in the thermal zone device .bind callback. + + tz: + the thermal zone device + cdev: + thermal cooling device + trip: + indicates which trip point in this thermal zone the cooling device + is associated with. + upper: + the Maximum cooling state for this trip point. + THERMAL_NO_LIMIT means no upper limit, + and the cooling device can be in max_state. + lower: + the Minimum cooling state can be used for this trip point. + THERMAL_NO_LIMIT means no lower limit, + and the cooling device can be in cooling state 0. + weight: + the influence of this cooling device in this thermal + zone. See 1.4.1 below for more information. + + :: + + int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz, + int trip, struct thermal_cooling_device *cdev); + + This interface function unbinds a thermal cooling device from a particular + trip point of a thermal zone device. This function is usually called in + the thermal zone device .unbind callback. 
+
+    tz:
+	the thermal zone device
+    cdev:
+	thermal cooling device
+    trip:
+	indicates which trip point in this thermal zone the cooling device
+	is associated with.
+
+1.4 Thermal Zone Parameters
+---------------------------
+
+    ::
+
+	struct thermal_bind_params
+
+    This structure defines the following parameters that are used to bind
+    a zone with a cooling device for a particular trip point.
+
+    .cdev:
+	The cooling device pointer
+    .weight:
+	The 'influence' of a particular cooling device on this
+	zone. This is relative to the rest of the cooling
+	devices. For example, if all cooling devices have a
+	weight of 1, then they all contribute the same. You can
+	use percentages if you want, but it's not mandatory. A
+	weight of 0 means that this cooling device doesn't
+	contribute to the cooling of this zone unless all cooling
+	devices have a weight of 0. If all weights are 0, then
+	they all contribute the same.
+    .trip_mask:
+	This is a bit mask that gives the binding relation between
+	this thermal zone and cdev, for a particular trip point.
+	If the nth bit is set, then the cdev and thermal zone are
+	bound for trip point n.
+    .binding_limits:
+	This is an array of cooling state limits. It must have
+	exactly 2 * thermal_zone.number_of_trip_points entries. It
+	is an array consisting of tuples <lower-state upper-state>
+	of state limits. Each trip will be associated with one
+	state limit tuple when binding. A NULL pointer means
+	<THERMAL_NO_LIMIT THERMAL_NO_LIMIT> on all trips.
+	These limits are used when binding a cdev to a trip point.
+    .match:
+	This callback returns success (0) if the 'tz and cdev' need to
+	be bound, as per platform data.
+
+    ::
+
+	struct thermal_zone_params
+
+    This structure defines the platform level parameters for a thermal zone.
+    This data for each thermal zone should come from the platform layer.
+    This is an optional feature where some platforms can choose not to
+    provide this data.
+
+    .governor_name:
+	Name of the thermal governor used for this zone
+    .no_hwmon:
+	a boolean to indicate if the thermal-to-hwmon sysfs interface
+	is required. When no_hwmon == false, a hwmon sysfs interface
+	will be created. When no_hwmon == true, nothing will be done.
+	In case the thermal_zone_params is NULL, the hwmon interface
+	will be created (for backward compatibility).
+    .num_tbps:
+	Number of thermal_bind_params entries for this zone
+    .tbp:
+	thermal_bind_params entries
+
+2. sysfs attributes structure
+=============================
+
+== ================
+RO read only value
+WO write only value
+RW read/write value
+== ================
+
+Thermal sysfs attributes will be represented under /sys/class/thermal.
+The hwmon sysfs I/F extension is also available under /sys/class/hwmon
+if hwmon is compiled in or built as a module.
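To tie sections 1.2 and 1.3 together, below is a hedged sketch of a
cooling device registration plus a `.bind` callback that attaches it
to trip point 0; the fan device and its state handling are invented,
while the ops and the binding call follow the signatures documented
above::

  /*
   * Sketch: register a cooling device and bind it to a trip point,
   * per sections 1.2 and 1.3 above. The demo_ fan is invented.
   */
  #include <linux/err.h>
  #include <linux/init.h>
  #include <linux/thermal.h>

  static unsigned long demo_fan_state;

  static int demo_get_max_state(struct thermal_cooling_device *cdev,
                                unsigned long *state)
  {
          *state = 3;  /* hypothetical fan: off, low, medium, high */
          return 0;
  }

  static int demo_get_cur_state(struct thermal_cooling_device *cdev,
                                unsigned long *state)
  {
          *state = demo_fan_state;
          return 0;
  }

  static int demo_set_cur_state(struct thermal_cooling_device *cdev,
                                unsigned long state)
  {
          demo_fan_state = state;  /* a real driver programs the fan here */
          return 0;
  }

  static const struct thermal_cooling_device_ops demo_fan_ops = {
          .get_max_state = demo_get_max_state,
          .get_cur_state = demo_get_cur_state,
          .set_cur_state = demo_set_cur_state,
  };

  /* Typically called from the thermal zone's .bind callback. */
  static int demo_bind(struct thermal_zone_device *tz,
                       struct thermal_cooling_device *cdev)
  {
          /* Trip 0, no state clamping, weight 1. */
          return thermal_zone_bind_cooling_device(tz, 0, cdev,
                                                  THERMAL_NO_LIMIT,
                                                  THERMAL_NO_LIMIT, 1);
  }

  static int __init demo_init(void)
  {
          struct thermal_cooling_device *cdev;

          cdev = thermal_cooling_device_register("demo-fan", NULL,
                                                 &demo_fan_ops);
          return PTR_ERR_OR_ZERO(cdev);
  }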
+
+Thermal zone device sys I/F, created once it's registered::
+
+  /sys/class/thermal/thermal_zone[0-*]:
+    |---type:                   Type of the thermal zone
+    |---temp:                   Current temperature
+    |---mode:                   Working mode of the thermal zone
+    |---policy:                 Thermal governor used for this zone
+    |---available_policies:     Available thermal governors for this zone
+    |---trip_point_[0-*]_temp:  Trip point temperature
+    |---trip_point_[0-*]_type:  Trip point type
+    |---trip_point_[0-*]_hyst:  Hysteresis value for this trip point
+    |---emul_temp:              Emulated temperature set node
+    |---sustainable_power:      Sustainable dissipatable power
+    |---k_po:                   Proportional term during temperature overshoot
+    |---k_pu:                   Proportional term during temperature undershoot
+    |---k_i:                    PID's integral term in the power allocator gov
+    |---k_d:                    PID's derivative term in the power allocator
+    |---integral_cutoff:        Offset above which errors are accumulated
+    |---slope:                  Slope constant applied as linear extrapolation
+    |---offset:                 Offset constant applied as linear extrapolation
+
+Thermal cooling device sys I/F, created once it's registered::
+
+  /sys/class/thermal/cooling_device[0-*]:
+    |---type:                   Type of the cooling device (processor/fan/...)
+    |---max_state:              Maximum cooling state of the cooling device
+    |---cur_state:              Current cooling state of the cooling device
+    |---stats:                  Directory containing cooling device's statistics
+    |---stats/reset:            Writing any value resets the statistics
+    |---stats/time_in_state_ms: Time (msec) spent in various cooling states
+    |---stats/total_trans:      Total number of times cooling state is changed
+    |---stats/trans_table:      Cooling state transition table
+
+The next two dynamic attributes are created/removed in pairs. They represent
+the relationship between a thermal zone and its associated cooling device.
+They are created/removed for each successful execution of
+thermal_zone_bind_cooling_device/thermal_zone_unbind_cooling_device.
+
+::
+
+  /sys/class/thermal/thermal_zone[0-*]:
+    |---cdev[0-*]:              [0-*]th cooling device in current thermal zone
+    |---cdev[0-*]_trip_point:   Trip point that cdev[0-*] is associated with
+    |---cdev[0-*]_weight:       Influence of the cooling device in
+                                this thermal zone
+
+Besides the thermal zone device sysfs I/F and cooling device sysfs I/F,
+the generic thermal driver also creates a hwmon sysfs I/F for each _type_
+of thermal zone device. E.g. the generic thermal driver registers one hwmon
+class device and builds the associated hwmon sysfs I/F for all the registered
+ACPI thermal zones.
+
+::
+
+  /sys/class/hwmon/hwmon[0-*]:
+    |---name:                   The type of the thermal zone devices
+    |---temp[1-*]_input:        The current temperature of thermal zone [1-*]
+    |---temp[1-*]_critical:     The critical trip point of thermal zone [1-*]
+
+Please read Documentation/hwmon/sysfs-interface.rst for additional information.
+
+Thermal zone attributes
+-----------------------
+
+type
+	String which represents the thermal zone type.
+	This is given by the thermal zone driver as part of registration.
+	E.g: "acpitz" indicates it's an ACPI thermal device.
+	In order to keep it consistent with the hwmon sysfs attribute, this
+	should be a short, lowercase string, containing neither spaces nor
+	dashes.
+	RO, Required
+
+temp
+	Current temperature as reported by the thermal zone (sensor).
+	Unit: millidegree Celsius
+	RO, Required
+
+mode
+	One of the predefined values in [enabled, disabled].
+	This file gives information about the algorithm that is currently
+	managing the thermal zone. It can be either the default kernel-based
+	algorithm or a user space application.
+ + enabled + enable Kernel Thermal management. + disabled + Preventing kernel thermal zone driver actions upon + trip points so that user application can take full + charge of the thermal management. + + RW, Optional + +policy + One of the various thermal governors used for a particular zone. + + RW, Required + +available_policies + Available thermal governors which can be used for a particular zone. + + RO, Required + +`trip_point_[0-*]_temp` + The temperature above which trip point will be fired. + + Unit: millidegree Celsius + + RO, Optional + +`trip_point_[0-*]_type` + Strings which indicate the type of the trip point. + + E.g. it can be one of critical, hot, passive, `active[0-*]` for ACPI + thermal zone. + + RO, Optional + +`trip_point_[0-*]_hyst` + The hysteresis value for a trip point, represented as an integer + Unit: Celsius + RW, Optional + +`cdev[0-*]` + Sysfs link to the thermal cooling device node where the sys I/F + for cooling device throttling control represents. + + RO, Optional + +`cdev[0-*]_trip_point` + The trip point in this thermal zone which `cdev[0-*]` is associated + with; -1 means the cooling device is not associated with any trip + point. + + RO, Optional + +`cdev[0-*]_weight` + The influence of `cdev[0-*]` in this thermal zone. This value + is relative to the rest of cooling devices in the thermal + zone. For example, if a cooling device has a weight double + than that of other, it's twice as effective in cooling the + thermal zone. + + RW, Optional + +passive + Attribute is only present for zones in which the passive cooling + policy is not supported by native thermal driver. Default is zero + and can be set to a temperature (in millidegrees) to enable a + passive trip point for the zone. Activation is done by polling with + an interval of 1 second. + + Unit: millidegrees Celsius + + Valid values: 0 (disabled) or greater than 1000 + + RW, Optional + +emul_temp + Interface to set the emulated temperature method in thermal zone + (sensor). After setting this temperature, the thermal zone may pass + this temperature to platform emulation function if registered or + cache it locally. This is useful in debugging different temperature + threshold and its associated cooling action. This is write only node + and writing 0 on this node should disable emulation. + Unit: millidegree Celsius + + WO, Optional + + WARNING: + Be careful while enabling this option on production systems, + because userland can easily disable the thermal policy by simply + flooding this sysfs node with low temperature values. + +sustainable_power + An estimate of the sustained power that can be dissipated by + the thermal zone. Used by the power allocator governor. For + more information see Documentation/thermal/power_allocator.rst + + Unit: milliwatts + + RW, Optional + +k_po + The proportional term of the power allocator governor's PID + controller during temperature overshoot. Temperature overshoot + is when the current temperature is above the "desired + temperature" trip point. For more information see + Documentation/thermal/power_allocator.rst + + RW, Optional + +k_pu + The proportional term of the power allocator governor's PID + controller during temperature undershoot. Temperature undershoot + is when the current temperature is below the "desired + temperature" trip point. For more information see + Documentation/thermal/power_allocator.rst + + RW, Optional + +k_i + The integral term of the power allocator governor's PID + controller. 
This term allows the PID controller to compensate + for long term drift. For more information see + Documentation/thermal/power_allocator.rst + + RW, Optional + +k_d + The derivative term of the power allocator governor's PID + controller. For more information see + Documentation/thermal/power_allocator.rst + + RW, Optional + +integral_cutoff + Temperature offset from the desired temperature trip point + above which the integral term of the power allocator + governor's PID controller starts accumulating errors. For + example, if integral_cutoff is 0, then the integral term only + accumulates error when temperature is above the desired + temperature trip point. For more information see + Documentation/thermal/power_allocator.rst + + Unit: millidegree Celsius + + RW, Optional + +slope + The slope constant used in a linear extrapolation model + to determine a hotspot temperature based off the sensor's + raw readings. It is up to the device driver to determine + the usage of these values. + + RW, Optional + +offset + The offset constant used in a linear extrapolation model + to determine a hotspot temperature based off the sensor's + raw readings. It is up to the device driver to determine + the usage of these values. + + RW, Optional + +Cooling device attributes +------------------------- + +type + String which represents the type of device, e.g: + + - for generic ACPI: should be "Fan", "Processor" or "LCD" + - for memory controller device on intel_menlow platform: + should be "Memory controller". + + RO, Required + +max_state + The maximum permissible cooling state of this cooling device. + + RO, Required + +cur_state + The current cooling state of this cooling device. + The value can any integer numbers between 0 and max_state: + + - cur_state == 0 means no cooling + - cur_state == max_state means the maximum cooling. + + RW, Required + +stats/reset + Writing any value resets the cooling device's statistics. + WO, Required + +stats/time_in_state_ms: + The amount of time spent by the cooling device in various cooling + states. The output will have "