From 3b5e6454aaf6b4439b19400d8365e2ec2d24e411 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 4 Sep 2014 22:04:42 -0400 Subject: fs/buffer.c: support buffer cache allocations with gfp modifiers A buffer cache is allocated from movable area because it is referred for a while and released soon. But some filesystems are taking buffer cache for a long time and it can disturb page migration. New APIs are introduced to allocate buffer cache with user specific flag. *_gfp APIs are for user want to set page allocation flag for page cache allocation. And *_unmovable APIs are for the user wants to allocate page cache from non-movable area. Signed-off-by: Gioh Kim Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- include/linux/buffer_head.h | 47 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 324329ceea1e..73b45225a7ca 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -175,12 +175,13 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); -struct buffer_head *__getblk(struct block_device *bdev, sector_t block, - unsigned size); +struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); +struct buffer_head *__bread_gfp(struct block_device *, + sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); @@ -295,7 +296,13 @@ static inline void bforget(struct buffer_head *bh) static inline struct buffer_head * sb_bread(struct super_block *sb, sector_t block) { - return __bread(sb->s_bdev, block, sb->s_blocksize); + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); +} + +static inline struct buffer_head * +sb_bread_unmovable(struct super_block *sb, sector_t block) +{ + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0); } static inline void @@ -307,7 +314,7 @@ sb_breadahead(struct super_block *sb, sector_t block) static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { - return __getblk(sb->s_bdev, block, sb->s_blocksize); + return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } static inline struct buffer_head * @@ -344,6 +351,36 @@ static inline void lock_buffer(struct buffer_head *bh) __lock_buffer(bh); } +static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, + sector_t block, + unsigned size) +{ + return __getblk_gfp(bdev, block, size, 0); +} + +static inline struct buffer_head *__getblk(struct block_device *bdev, + sector_t block, + unsigned size) +{ + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); +} + +/** + * __bread() - reads a specified block and returns the bh + * @bdev: the block_device to read from + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * The page cache is allocated from movable area so that it can be migrated. + * It returns NULL if the block was unreadable. + */ +static inline struct buffer_head * +__bread(struct block_device *bdev, sector_t block, unsigned size) +{ + return __bread_gfp(bdev, block, size, __GFP_MOVABLE); +} + extern int __set_page_dirty_buffers(struct page *page); #else /* CONFIG_BLOCK */ -- cgit v1.2.3-59-g8ed1b From 047133066e6c2549403fe5a2d619f47ba4212ef5 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Thu, 7 Aug 2014 05:10:22 -0700 Subject: leds: Reorder include directives Reorder include directives so that they are arranged in alphabetical order. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- drivers/leds/led-class.c | 13 +++++++------ drivers/leds/led-core.c | 3 ++- include/linux/leds.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index aa29198fca3e..4bd4572efe6e 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -9,16 +9,17 @@ * published by the Free Software Foundation. */ -#include -#include +#include +#include +#include #include +#include +#include #include +#include +#include #include -#include #include -#include -#include -#include #include "leds.h" static struct class *leds_class; diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 71b40d3bf776..50b579ad948e 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -12,10 +12,11 @@ */ #include +#include #include #include +#include #include -#include #include "leds.h" DECLARE_RWSEM(leds_list_lock); diff --git a/include/linux/leds.h b/include/linux/leds.h index e43686472197..4be2d7623d9e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -13,8 +13,8 @@ #define __LINUX_LEDS_H_INCLUDED #include -#include #include +#include #include #include -- cgit v1.2.3-59-g8ed1b From d8082827d8a214343b761f2c4554d2a7d1573d63 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Thu, 7 Aug 2014 05:10:23 -0700 Subject: leds: make brightness type consistent across whole subsystem Documentations states that brightness units type is enum led_brightness and this is the type used by the led API functions. Adjust the type of brightness variables in the struct led_classdev accordingly. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- include/linux/leds.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 4be2d7623d9e..f2e1cbc25705 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -31,8 +31,8 @@ enum led_brightness { struct led_classdev { const char *name; - int brightness; - int max_brightness; + enum led_brightness brightness; + enum led_brightness max_brightness; int flags; /* Lower 16 bits reflect status */ -- cgit v1.2.3-59-g8ed1b From 3ef7de5304edf60d0b8674dd7cdacc104e15a93c Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Wed, 20 Aug 2014 06:41:55 -0700 Subject: leds: Improve and export led_update_brightness led_update_brightness helper function used to be exploited only locally in the led-class.c module, where its result was being passed to the brightness_show sysfs callback. With the introduction of v4l2-flash subdevice the same functionality becomes required for reading current brightness from a LED device. This patch adds checking of return value of the brightness_get callback and moves the led_update_brightness() function to the LED subsystem public API. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- drivers/leds/led-class.c | 6 ------ drivers/leds/led-core.c | 16 ++++++++++++++++ include/linux/leds.h | 10 ++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 71666a40b79a..7440c58b8e6f 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -24,12 +24,6 @@ static struct class *leds_class; -static void led_update_brightness(struct led_classdev *led_cdev) -{ - if (led_cdev->brightness_get) - led_cdev->brightness = led_cdev->brightness_get(led_cdev); -} - static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 50b579ad948e..aaa8eba9099f 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -127,3 +127,19 @@ void led_set_brightness(struct led_classdev *led_cdev, __led_set_brightness(led_cdev, brightness); } EXPORT_SYMBOL(led_set_brightness); + +int led_update_brightness(struct led_classdev *led_cdev) +{ + int ret = 0; + + if (led_cdev->brightness_get) { + ret = led_cdev->brightness_get(led_cdev); + if (ret >= 0) { + led_cdev->brightness = ret; + return 0; + } + } + + return ret; +} +EXPORT_SYMBOL(led_update_brightness); diff --git a/include/linux/leds.h b/include/linux/leds.h index f2e1cbc25705..a57611d0c94e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -140,6 +140,16 @@ extern void led_blink_set_oneshot(struct led_classdev *led_cdev, */ extern void led_set_brightness(struct led_classdev *led_cdev, enum led_brightness brightness); +/** + * led_update_brightness - update LED brightness + * @led_cdev: the LED to query + * + * Get an LED's current brightness and update led_cdev->brightness + * member with the obtained value. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_update_brightness(struct led_classdev *led_cdev); /* * LED Triggers -- cgit v1.2.3-59-g8ed1b From 50849db32a9f529235a84bcc84a6b8e631b1d0ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 Sep 2014 00:58:12 -0400 Subject: jbd2: simplify calling convention around __jbd2_journal_clean_checkpoint_list __jbd2_journal_clean_checkpoint_list() returns number of buffers it freed but noone was using the value so just stop doing that. This also allows for simplifying the calling convention for journal_clean_once_cp_list(). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 56 ++++++++++++++++++++++------------------------------ include/linux/jbd2.h | 2 +- 2 files changed, 25 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 3ab4c5ee12ce..988b32ed4c87 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -421,16 +421,15 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * release them. * * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns 1 if we freed the transaction, 0 otherwise. */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +static int journal_clean_one_cp_list(struct journal_head *jh) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret, freed = 0; + int ret; + int freed = 0; - *released = 0; if (!jh) return 0; @@ -441,11 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) ret = __try_to_free_cp_buf(jh); if (!ret) return freed; - freed++; - if (ret == 2) { - *released = 1; - return freed; - } + if (ret == 2) + return 1; + freed = 1; /* * This function only frees up some memory * if possible so we dont have an obligation @@ -465,53 +462,48 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * Find all the written-back checkpoint buffers in the journal and release them. * * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) */ - -int __jbd2_journal_clean_checkpoint_list(journal_t *journal) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; int ret; - int freed = 0; - int released; transaction = journal->j_checkpoint_transactions; if (!transaction) - goto out; + return; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if * preemption requested: */ - if (need_resched()) { - freed += ret; - goto out; - } - if (released) { - freed += ret; + if (need_resched()) + return; + if (ret) continue; - } /* * It is essential that we are as careful as in the case of * t_checkpoint_list with removing the buffer from the list as * we can possibly see not yet submitted buffers on io_list */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - freed += ret; - if (need_resched() || !ret) - goto out; + ret = journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list); + if (need_resched()) + return; + /* + * Stop scanning if we couldn't free the transaction. This + * avoids pointless scanning of transactions which still + * weren't checkpointed. + */ + if (!ret) + return; } while (transaction != last_transaction); -out: - return freed; } /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0dae71e9971c..704b9a599b26 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1042,7 +1042,7 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ -int __jbd2_journal_clean_checkpoint_list(journal_t *journal); +void __jbd2_journal_clean_checkpoint_list(journal_t *journal); int __jbd2_journal_remove_checkpoint(struct journal_head *); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); -- cgit v1.2.3-59-g8ed1b From 7990da71ebfa887ae6fe4464ab0d99ddeb8efacc Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Wed, 3 Sep 2014 17:49:32 +0200 Subject: PM / QoS: Add PM_QOS_MEMORY_BANDWIDTH class Also adds a class type PM_QOS_SUM that aggregates the values by summing them. It can be used by memory controllers to calculate the optimum clock frequency based on the bandwidth needs of the different memory clients. Signed-off-by: Tomeu Vizoso Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- Documentation/power/pm_qos_interface.txt | 4 +++- include/linux/pm_qos.h | 5 ++++- kernel/power/qos.c | 27 ++++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index a5da5c7e7128..129f7c0e1483 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -5,7 +5,8 @@ performance expectations by drivers, subsystems and user space applications on one of the parameters. Two different PM QoS frameworks are available: -1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput. +1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, +memory_bandwidth. 2. the per-device PM QoS framework provides the API to manage the per-device latency constraints and PM QoS flags. @@ -13,6 +14,7 @@ Each parameters have defined units: * latency: usec * timeout: usec * throughput: kbs (kilo bit / sec) + * memory bandwidth: mbs (mega bit / sec) 1. PM QoS framework diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 9ab4bf7c4646..636e82834506 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -15,6 +15,7 @@ enum { PM_QOS_CPU_DMA_LATENCY, PM_QOS_NETWORK_LATENCY, PM_QOS_NETWORK_THROUGHPUT, + PM_QOS_MEMORY_BANDWIDTH, /* insert new class ID */ PM_QOS_NUM_CLASSES, @@ -32,6 +33,7 @@ enum pm_qos_flags_status { #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 +#define PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE 0 #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) @@ -69,7 +71,8 @@ struct dev_pm_qos_request { enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ + PM_QOS_MIN, /* return the smallest value */ + PM_QOS_SUM /* return the sum */ }; /* diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; + case PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); -- cgit v1.2.3-59-g8ed1b From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Oct 2014 21:49:18 -0400 Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data ->page_mkwrite() is used by filesystems to allocate blocks under a page which is becoming writeably mmapped in some process' address space. This allows a filesystem to return a page fault if there is not enough space available, user exceeds quota or similar problem happens, rather than silently discarding data later when writepage is called. However VFS fails to call ->page_mkwrite() in all the cases where filesystems need it when blocksize < pagesize. For example when blocksize = 1024, pagesize = 4096 the following is problematic: ftruncate(fd, 0); pwrite(fd, buf, 1024, 0); map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0); map[0] = 'a'; ----> page_mkwrite() for index 0 is called ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */ mremap(map, 1024, 10000, 0); map[4095] = 'a'; ----> no page_mkwrite() called At the moment ->page_mkwrite() is called, filesystem can allocate only one block for the page because i_size == 1024. Otherwise it would create blocks beyond i_size which is generally undesirable. But later at ->writepage() time, we also need to store data at offset 4095 but we don't have block allocated for it. This patch introduces a helper function filesystems can use to have ->page_mkwrite() called at all the necessary moments. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/buffer.c | 3 +++ include/linux/mm.h | 1 + mm/truncate.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 9a6029e0dd71..6dc1475dcb2d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock diff --git a/include/linux/mm.h b/include/linux/mm.h index 8981cc882ed2..5005464fe012 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1155,6 +1155,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); diff --git a/mm/truncate.c b/mm/truncate.c index 96d167372d89..261eaf6e5a19 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include /* grr. try_to_release_page, do_invalidatepage */ #include +#include #include "internal.h" static void clear_exceptional_entry(struct address_space *mapping, @@ -719,11 +720,67 @@ EXPORT_SYMBOL(truncate_pagecache); */ void truncate_setsize(struct inode *inode, loff_t newsize) { + loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); +/** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? */ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. + */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode -- cgit v1.2.3-59-g8ed1b From f14bb039a4e8206439d3e9abd92bc76bd142f243 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Oct 2014 16:07:03 -0700 Subject: uio: Export definition of struct uio_device In order to prevent a O(n) search of the filesystem to link up its uio node with its target configuration, TCMU needs to know the minor number that UIO assigned. Expose the definition of this struct so TCMU can access this field. Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/uio/uio.c | 12 ------------ include/linux/uio_driver.h | 12 +++++++++++- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a673e5b6a2e0..60fa6278fbce 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -28,18 +28,6 @@ #define UIO_MAX_DEVICES (1U << MINORBITS) -struct uio_device { - struct module *owner; - struct device *dev; - int minor; - atomic_t event; - struct fasync_struct *async_queue; - wait_queue_head_t wait; - struct uio_info *info; - struct kobject *map_dir; - struct kobject *portio_dir; -}; - static int uio_major; static struct cdev *uio_cdev; static DEFINE_IDR(uio_idr); diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 1ad4724458de..baa81718d985 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -63,7 +63,17 @@ struct uio_port { #define MAX_UIO_PORT_REGIONS 5 -struct uio_device; +struct uio_device { + struct module *owner; + struct device *dev; + int minor; + atomic_t event; + struct fasync_struct *async_queue; + wait_queue_head_t wait; + struct uio_info *info; + struct kobject *map_dir; + struct kobject *portio_dir; +}; /** * struct uio_info - UIO device capabilities -- cgit v1.2.3-59-g8ed1b From 5a17dae422d7de4b776a9753cd4673a343a25b4b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 5 Aug 2014 11:52:11 +0100 Subject: efi: Add efi= parameter parsing to the EFI boot stub We need a way to customize the behaviour of the EFI boot stub, in particular, we need a way to disable the "chunking" workaround, used when reading files from the EFI System Partition. One of my machines doesn't cope well when reading files in 1MB chunks to a buffer above the 4GB mark - it appears that the "chunking" bug workaround triggers another firmware bug. This was only discovered with commit 4bf7111f5016 ("x86/efi: Support initrd loaded above 4G"), and that commit is perfectly valid. The symptom I observed was a corrupt initrd rather than any kind of crash. efi= is now used to specify EFI parameters in two very different execution environments, the EFI boot stub and during kernel boot. There is also a slight performance optimization by enabling efi=nochunk, but that's offset by the fact that you're more likely to run into firmware issues, at least on x86. This is the rationale behind leaving the workaround enabled by default. Also provide some documentation for EFI_READ_CHUNK_SIZE and why we're using the current value of 1MB. Tested-by: Ard Biesheuvel Cc: Roy Franz Cc: Maarten Lankhorst Cc: Leif Lindholm Cc: Borislav Petkov Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 5 ++- arch/x86/boot/compressed/eboot.c | 4 ++ arch/x86/platform/efi/efi.c | 19 +++++++- drivers/firmware/efi/libstub/arm-stub.c | 4 ++ drivers/firmware/efi/libstub/efi-stub-helper.c | 62 +++++++++++++++++++++++++- include/linux/efi.h | 2 + 6 files changed, 91 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 5ae8608ca9f5..67d79597d9d6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -992,10 +992,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map" } + Format: { "old_map", "nochunk" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. + nochunk: disable reading files in "chunks" in the EFI + boot stub, as chunking can cause problems with some + firmware implementations. efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index f277184e2ac1..f4bdab1dbf66 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1100,6 +1100,10 @@ struct boot_params *make_boot_params(struct efi_config *c) else initrd_addr_max = hdr->initrd_addr_max; + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + goto fail2; + status = handle_cmdline_files(sys_table, image, (char *)(unsigned long)hdr->cmd_line_ptr, "initrd=", initrd_addr_max, diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 850da94fef30..a1f745b0bf1d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -943,8 +943,23 @@ static int __init parse_efi_cmdline(char *str) if (*str == '=') str++; - if (!strncmp(str, "old_map", 7)) - set_bit(EFI_OLD_MEMMAP, &efi.flags); + while (*str) { + if (!strncmp(str, "old_map", 7)) { + set_bit(EFI_OLD_MEMMAP, &efi.flags); + str += strlen("old_map"); + } + + /* + * Skip any options we don't understand. Presumably + * they apply to the EFI boot stub. + */ + while (*str && *str != ',') + str++; + + /* If we hit a delimiter, skip it */ + if (*str == ',') + str++; + } return 0; } diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 480339b6b110..75ee05964cbc 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -226,6 +226,10 @@ unsigned long __init efi_entry(void *handle, efi_system_table_t *sys_table, goto fail_free_image; } + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 32d5cca30f49..a920fec8fe88 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -15,8 +15,23 @@ #include "efistub.h" +/* + * Some firmware implementations have problems reading files in one go. + * A read chunk size of 1MB seems to work for most platforms. + * + * Unfortunately, reading files in chunks triggers *other* bugs on some + * platforms, so we provide a way to disable this workaround, which can + * be done by passing "efi=nochunk" on the EFI boot stub command line. + * + * If you experience issues with initrd images being corrupt it's worth + * trying efi=nochunk, but chunking is enabled by default because there + * are far more machines that require the workaround than those that + * break with it enabled. + */ #define EFI_READ_CHUNK_SIZE (1024 * 1024) +static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; + struct file_info { efi_file_handle_t *handle; u64 size; @@ -281,6 +296,49 @@ void efi_free(efi_system_table_t *sys_table_arg, unsigned long size, efi_call_early(free_pages, addr, nr_pages); } +/* + * Parse the ASCII string 'cmdline' for EFI options, denoted by the efi= + * option, e.g. efi=nochunk. + * + * It should be noted that efi= is parsed in two very different + * environments, first in the early boot environment of the EFI boot + * stub, and subsequently during the kernel boot. + */ +efi_status_t efi_parse_options(char *cmdline) +{ + char *str; + + /* + * If no EFI parameters were specified on the cmdline we've got + * nothing to do. + */ + str = strstr(cmdline, "efi="); + if (!str) + return EFI_SUCCESS; + + /* Skip ahead to first argument */ + str += strlen("efi="); + + /* + * Remember, because efi= is also used by the kernel we need to + * skip over arguments we don't understand. + */ + while (*str) { + if (!strncmp(str, "nochunk", 7)) { + str += strlen("nochunk"); + __chunk_size = -1UL; + } + + /* Group words together, delimited by "," */ + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return EFI_SUCCESS; +} /* * Check the cmdline for a LILO-style file= arguments. @@ -423,8 +481,8 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, size = files[j].size; while (size) { unsigned long chunksize; - if (size > EFI_READ_CHUNK_SIZE) - chunksize = EFI_READ_CHUNK_SIZE; + if (size > __chunk_size) + chunksize = __chunk_size; else chunksize = size; diff --git a/include/linux/efi.h b/include/linux/efi.h index 45cb4ffdea62..518779fb5e90 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1227,4 +1227,6 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, unsigned long *load_addr, unsigned long *load_size); +efi_status_t efi_parse_options(char *cmdline); + #endif /* _LINUX_EFI_H */ -- cgit v1.2.3-59-g8ed1b From b2e0a54a1296a91b800f316df7bef7d1905e4fd0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:26 +0800 Subject: efi: Move noefi early param code out of x86 arch code noefi param can be used for arches other than X86 later, thus move it out of x86 platform code. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 2 +- arch/x86/platform/efi/efi.c | 10 +--------- drivers/firmware/efi/efi.c | 13 +++++++++++++ include/linux/efi.h | 1 + 4 files changed, 16 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 67d79597d9d6..08df275eee2b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2169,7 +2169,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nodsp [SH] Disable hardware DSP at boot time. - noefi [X86] Disable EFI runtime services support. + noefi Disable EFI runtime services support. noexec [IA-64] diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a1f745b0bf1d..c90d3cd2728c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,14 +70,6 @@ static efi_config_table_type_t arch_tables[] __initdata = { u64 efi_setup; /* efi setup_data physical address */ -static bool disable_runtime __initdata = false; -static int __init setup_noefi(char *arg) -{ - disable_runtime = true; - return 0; -} -early_param("noefi", setup_noefi); - int add_efi_memmap; EXPORT_SYMBOL(add_efi_memmap); @@ -492,7 +484,7 @@ void __init efi_init(void) if (!efi_runtime_supported()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else { - if (disable_runtime || efi_runtime_init()) + if (efi_runtime_disabled() || efi_runtime_init()) return; } if (efi_memmap_init()) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 64ecbb501c50..c8f01a73edb5 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -41,6 +41,19 @@ struct efi __read_mostly efi = { }; EXPORT_SYMBOL(efi); +static bool disable_runtime; +static int __init setup_noefi(char *arg) +{ + disable_runtime = true; + return 0; +} +early_param("noefi", setup_noefi); + +bool efi_runtime_disabled(void) +{ + return disable_runtime; +} + static struct kobject *efi_kobj; static struct kobject *efivars_kobj; diff --git a/include/linux/efi.h b/include/linux/efi.h index 518779fb5e90..4812ed0b0374 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1229,4 +1229,5 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, efi_status_t efi_parse_options(char *cmdline); +bool efi_runtime_disabled(void); #endif /* _LINUX_EFI_H */ -- cgit v1.2.3-59-g8ed1b From 6ccc72b87b83ece31c2a75bbe07f440b0378f7a9 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:27 +0800 Subject: lib: Add a generic cmdline parse function parse_option_str There should be a generic function to parse params like a=b,c Adding parse_option_str in lib/cmdline.c which will return true if there's specified option set in the params. Also updated efi=old_map parsing code to use the new function Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 22 ++-------------------- include/linux/kernel.h | 1 + lib/cmdline.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c90d3cd2728c..c73a7df5d37f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -932,26 +932,8 @@ u64 efi_mem_attributes(unsigned long phys_addr) static int __init parse_efi_cmdline(char *str) { - if (*str == '=') - str++; - - while (*str) { - if (!strncmp(str, "old_map", 7)) { - set_bit(EFI_OLD_MEMMAP, &efi.flags); - str += strlen("old_map"); - } - - /* - * Skip any options we don't understand. Presumably - * they apply to the EFI boot stub. - */ - while (*str && *str != ',') - str++; - - /* If we hit a delimiter, skip it */ - if (*str == ',') - str++; - } + if (parse_option_str(str, "old_map")) + set_bit(EFI_OLD_MEMMAP, &efi.flags); return 0; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 95624bed87ef..f66427ef0628 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -407,6 +407,7 @@ int vsscanf(const char *, const char *, va_list); extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); +extern bool parse_option_str(const char *str, const char *option); extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); diff --git a/lib/cmdline.c b/lib/cmdline.c index 76a712e6e20e..8f13cf73c2ec 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -160,3 +160,32 @@ unsigned long long memparse(const char *ptr, char **retptr) return ret; } EXPORT_SYMBOL(memparse); + +/** + * parse_option_str - Parse a string and check an option is set or not + * @str: String to be parsed + * @option: option name + * + * This function parses a string containing a comma-separated list of + * strings like a=b,c. + * + * Return true if there's such option in the string, or return false. + */ +bool parse_option_str(const char *str, const char *option) +{ + while (*str) { + if (!strncmp(str, option, strlen(option))) { + str += strlen(option); + if (!*str || *str == ',') + return true; + } + + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return false; +} -- cgit v1.2.3-59-g8ed1b From 9c97e0bdd4b4ae44577a1b1ec949e782084e9a78 Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:19 +0200 Subject: efi: Add macro for EFI_MEMORY_UCE memory attribute Add the following macro from the UEFI spec, for completeness: EFI_MEMORY_UCE Memory cacheability attribute: The memory region supports being configured as not cacheable, exported, and supports the "fetch and add" semaphore mechanism. Signed-off-by: Laszlo Ersek Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/efi.h b/include/linux/efi.h index 4812ed0b0374..7464032ae00a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -92,6 +92,7 @@ typedef struct { #define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ #define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ #define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ +#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ -- cgit v1.2.3-59-g8ed1b From 98d2a6ca14520904a47c46258d3bad02ffcd3f96 Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:20 +0200 Subject: efi: Introduce efi_md_typeattr_format() At the moment, there are three architectures debug-printing the EFI memory map at initialization: x86, ia64, and arm64. They all use different format strings, plus the EFI memory type and the EFI memory attributes are similarly hard to decode for a human reader. Introduce a helper __init function that formats the memory type and the memory attributes in a unified way, to a user-provided character buffer. The array "memory_type_name" is copied from the arm64 code, temporarily duplicating it. The (otherwise optional) braces around each string literal in the initializer list are dropped in order to match the kernel coding style more closely. The element size is tightened from 32 to 20 bytes (maximum actual string length + 1) so that we can derive the field width from the element size. Signed-off-by: Laszlo Ersek Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel [ Dropped useless 'register' keyword, which compiler will ignore ] Signed-off-by: Matt Fleming --- drivers/firmware/efi/efi.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/efi.h | 7 ++++++ 2 files changed, 64 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index cebfa36a27ae..8590099ac148 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -445,3 +445,60 @@ int __init efi_get_fdt_params(struct efi_fdt_params *params, int verbose) return ret; } #endif /* CONFIG_EFI_PARAMS_FROM_FDT */ + +static __initdata char memory_type_name[][20] = { + "Reserved", + "Loader Code", + "Loader Data", + "Boot Code", + "Boot Data", + "Runtime Code", + "Runtime Data", + "Conventional Memory", + "Unusable Memory", + "ACPI Reclaim Memory", + "ACPI Memory NVS", + "Memory Mapped I/O", + "MMIO Port Space", + "PAL Code" +}; + +char * __init efi_md_typeattr_format(char *buf, size_t size, + const efi_memory_desc_t *md) +{ + char *pos; + int type_len; + u64 attr; + + pos = buf; + if (md->type >= ARRAY_SIZE(memory_type_name)) + type_len = snprintf(pos, size, "[type=%u", md->type); + else + type_len = snprintf(pos, size, "[%-*s", + (int)(sizeof(memory_type_name[0]) - 1), + memory_type_name[md->type]); + if (type_len >= size) + return buf; + + pos += type_len; + size -= type_len; + + attr = md->attribute; + if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT | + EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_WP | + EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME)) + snprintf(pos, size, "|attr=0x%016llx]", + (unsigned long long)attr); + else + snprintf(pos, size, "|%3s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + attr & EFI_MEMORY_RUNTIME ? "RUN" : "", + attr & EFI_MEMORY_XP ? "XP" : "", + attr & EFI_MEMORY_RP ? "RP" : "", + attr & EFI_MEMORY_WP ? "WP" : "", + attr & EFI_MEMORY_UCE ? "UCE" : "", + attr & EFI_MEMORY_WB ? "WB" : "", + attr & EFI_MEMORY_WT ? "WT" : "", + attr & EFI_MEMORY_WC ? "WC" : "", + attr & EFI_MEMORY_UC ? "UC" : ""); + return buf; +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 7464032ae00a..78b29b133e14 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -887,6 +887,13 @@ extern bool efi_poweroff_required(void); (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ (md) = (void *)(md) + (m)->desc_size) +/* + * Format an EFI memory descriptor's type and attributes to a user-provided + * character buffer, as per snprintf(), and return the buffer. + */ +char * __init efi_md_typeattr_format(char *buf, size_t size, + const efi_memory_desc_t *md); + /** * efi_range_is_wc - check the WC bit on an address range * @start: starting kvirt address -- cgit v1.2.3-59-g8ed1b From 6d80dba1c9fe4316ef626980102b92fa30c7845a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 30 Sep 2014 21:58:52 +0100 Subject: efi: Provide a non-blocking SetVariable() operation There are some circumstances that call for trying to write an EFI variable in a non-blocking way. One such scenario is when writing pstore data in efi_pstore_write() via the pstore_dump() kdump callback. Now that we have an EFI runtime spinlock we need a way of aborting if there is contention instead of spinning, since when writing pstore data from the kdump callback, the runtime lock may already be held by the CPU that's running the callback if we crashed in the middle of an EFI variable operation. The situation is sufficiently special that a new EFI variable operation is warranted. Introduce ->set_variable_nonblocking() for this use case. It is an optional EFI backend operation, and need only be implemented by those backends that usually acquire locks to serialize access to EFI variables, as is the case for virt_efi_set_variable() where we now grab the EFI runtime spinlock. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Ard Biesheuvel Cc: Matthew Garrett Signed-off-by: Matt Fleming --- drivers/firmware/efi/runtime-wrappers.c | 19 +++++++++++++ drivers/firmware/efi/vars.c | 47 +++++++++++++++++++++++++++++++++ include/linux/efi.h | 6 +++++ 3 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 9694cba665c4..4349206198b2 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -200,6 +200,24 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, return status; } +static efi_status_t +virt_efi_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) +{ + unsigned long flags; + efi_status_t status; + + if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) + return EFI_NOT_READY; + + status = efi_call_virt(set_variable, name, vendor, attr, data_size, + data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; +} + + static efi_status_t virt_efi_query_variable_info(u32 attr, u64 *storage_space, u64 *remaining_space, @@ -287,6 +305,7 @@ void efi_native_runtime_setup(void) efi.get_variable = virt_efi_get_variable; efi.get_next_variable = virt_efi_get_next_variable; efi.set_variable = virt_efi_set_variable; + efi.set_variable_nonblocking = virt_efi_set_variable_nonblocking; efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; efi.query_variable_info = virt_efi_query_variable_info; diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 1fa724f31b0e..fa3c66bdc1e5 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -595,6 +595,39 @@ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, } EXPORT_SYMBOL_GPL(efivar_entry_set); +/* + * efivar_entry_set_nonblocking - call set_variable_nonblocking() + * + * This function is guaranteed to not block and is suitable for calling + * from crash/panic handlers. + * + * Crucially, this function will not block if it cannot acquire + * __efivars->lock. Instead, it returns -EBUSY. + */ +static int +efivar_entry_set_nonblocking(efi_char16_t *name, efi_guid_t vendor, + u32 attributes, unsigned long size, void *data) +{ + const struct efivar_operations *ops = __efivars->ops; + unsigned long flags; + efi_status_t status; + + if (!spin_trylock_irqsave(&__efivars->lock, flags)) + return -EBUSY; + + status = check_var_size(attributes, size + ucs2_strsize(name, 1024)); + if (status != EFI_SUCCESS) { + spin_unlock_irqrestore(&__efivars->lock, flags); + return -ENOSPC; + } + + status = ops->set_variable_nonblocking(name, &vendor, attributes, + size, data); + + spin_unlock_irqrestore(&__efivars->lock, flags); + return efi_status_to_err(status); +} + /** * efivar_entry_set_safe - call set_variable() if enough space in firmware * @name: buffer containing the variable name @@ -622,6 +655,20 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, if (!ops->query_variable_store) return -ENOSYS; + /* + * If the EFI variable backend provides a non-blocking + * ->set_variable() operation and we're in a context where we + * cannot block, then we need to use it to avoid live-locks, + * since the implication is that the regular ->set_variable() + * will block. + * + * If no ->set_variable_nonblocking() is provided then + * ->set_variable() is assumed to be non-blocking. + */ + if (!block && ops->set_variable_nonblocking) + return efivar_entry_set_nonblocking(name, vendor, attributes, + size, data); + if (!block) { if (!spin_trylock_irqsave(&__efivars->lock, flags)) return -EBUSY; diff --git a/include/linux/efi.h b/include/linux/efi.h index 78b29b133e14..0949f9c7e872 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -503,6 +503,10 @@ typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); +typedef efi_status_t +efi_set_variable_nonblocking_t(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, void *data); + typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count); typedef void efi_reset_system_t (int reset_type, efi_status_t status, unsigned long data_size, efi_char16_t *data); @@ -822,6 +826,7 @@ extern struct efi { efi_get_variable_t *get_variable; efi_get_next_variable_t *get_next_variable; efi_set_variable_t *set_variable; + efi_set_variable_nonblocking_t *set_variable_nonblocking; efi_query_variable_info_t *query_variable_info; efi_update_capsule_t *update_capsule; efi_query_capsule_caps_t *query_capsule_caps; @@ -1042,6 +1047,7 @@ struct efivar_operations { efi_get_variable_t *get_variable; efi_get_next_variable_t *get_next_variable; efi_set_variable_t *set_variable; + efi_set_variable_nonblocking_t *set_variable_nonblocking; efi_query_variable_store_t *query_variable_store; }; -- cgit v1.2.3-59-g8ed1b From f2fc42b6ac31f4d808da7a9da460dd433a71e976 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Thu, 12 Jun 2014 22:30:34 +0530 Subject: mailbox: rename pl320-ipc specific mailbox.h The patch 30058677 "ARM / highbank: add support for pl320 IPC" added a pl320 IPC specific header file as a generic mailbox.h. This file has been renamed appropriately to allow the introduction of the generic mailbox API framework. Acked-by: Mark Langsdorf Cc: Rafael J. Wysocki Signed-off-by: Suman Anna Reviewed-by: Mark Brown Acked-by: Arnd Bergmann --- arch/arm/mach-highbank/highbank.c | 2 +- drivers/cpufreq/highbank-cpufreq.c | 2 +- drivers/mailbox/pl320-ipc.c | 2 +- include/linux/mailbox.h | 17 ----------------- include/linux/pl320-ipc.h | 17 +++++++++++++++++ 5 files changed, 20 insertions(+), 20 deletions(-) delete mode 100644 include/linux/mailbox.h create mode 100644 include/linux/pl320-ipc.h (limited to 'include/linux') diff --git a/arch/arm/mach-highbank/highbank.c b/arch/arm/mach-highbank/highbank.c index 8c35ae4ff176..07a09570175d 100644 --- a/arch/arm/mach-highbank/highbank.c +++ b/arch/arm/mach-highbank/highbank.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/cpufreq/highbank-cpufreq.c b/drivers/cpufreq/highbank-cpufreq.c index bf8902a0866d..b464f29d8d54 100644 --- a/drivers/cpufreq/highbank-cpufreq.c +++ b/drivers/cpufreq/highbank-cpufreq.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define HB_CPUFREQ_CHANGE_NOTE 0x80000001 diff --git a/drivers/mailbox/pl320-ipc.c b/drivers/mailbox/pl320-ipc.c index d873cbae2fbb..f3755e0aa935 100644 --- a/drivers/mailbox/pl320-ipc.c +++ b/drivers/mailbox/pl320-ipc.c @@ -26,7 +26,7 @@ #include #include -#include +#include #define IPCMxSOURCE(m) ((m) * 0x40) #define IPCMxDSET(m) (((m) * 0x40) + 0x004) diff --git a/include/linux/mailbox.h b/include/linux/mailbox.h deleted file mode 100644 index 5161f63ec1c8..000000000000 --- a/include/linux/mailbox.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -int pl320_ipc_transmit(u32 *data); -int pl320_ipc_register_notifier(struct notifier_block *nb); -int pl320_ipc_unregister_notifier(struct notifier_block *nb); diff --git a/include/linux/pl320-ipc.h b/include/linux/pl320-ipc.h new file mode 100644 index 000000000000..5161f63ec1c8 --- /dev/null +++ b/include/linux/pl320-ipc.h @@ -0,0 +1,17 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +int pl320_ipc_transmit(u32 *data); +int pl320_ipc_register_notifier(struct notifier_block *nb); +int pl320_ipc_unregister_notifier(struct notifier_block *nb); -- cgit v1.2.3-59-g8ed1b From 2b6d83e2b8b7de82331a6a1dcd64b51020a6031c Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Thu, 12 Jun 2014 22:31:19 +0530 Subject: mailbox: Introduce framework for mailbox Introduce common framework for client/protocol drivers and controller drivers of Inter-Processor-Communication (IPC). Client driver developers should have a look at include/linux/mailbox_client.h to understand the part of the API exposed to client drivers. Similarly controller driver developers should have a look at include/linux/mailbox_controller.h Reviewed-by: Mark Brown Signed-off-by: Jassi Brar --- MAINTAINERS | 8 + drivers/mailbox/Makefile | 4 + drivers/mailbox/mailbox.c | 465 +++++++++++++++++++++++++++++++++++++ include/linux/mailbox_client.h | 46 ++++ include/linux/mailbox_controller.h | 133 +++++++++++ 5 files changed, 656 insertions(+) create mode 100644 drivers/mailbox/mailbox.c create mode 100644 include/linux/mailbox_client.h create mode 100644 include/linux/mailbox_controller.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index a12edf2624e5..c5f7c854b6d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5740,6 +5740,14 @@ S: Maintained F: drivers/net/macvlan.c F: include/linux/if_macvlan.h +MAILBOX API +M: Jassi Brar +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/mailbox/ +F: include/linux/mailbox_client.h +F: include/linux/mailbox_controller.h + MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7 M: Michael Kerrisk W: http://www.kernel.org/doc/man-pages diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 6d184dbcaca8..94ed7cefb14d 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -1,3 +1,7 @@ +# Generic MAILBOX API + +obj-$(CONFIG_MAILBOX) += mailbox.o + obj-$(CONFIG_PL320_MBOX) += pl320-ipc.o obj-$(CONFIG_OMAP2PLUS_MBOX) += omap-mailbox.o diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c new file mode 100644 index 000000000000..afcb430508ec --- /dev/null +++ b/drivers/mailbox/mailbox.c @@ -0,0 +1,465 @@ +/* + * Mailbox: Common code for Mailbox controllers and users + * + * Copyright (C) 2013-2014 Linaro Ltd. + * Author: Jassi Brar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define TXDONE_BY_ACK BIT(2) /* S/W ACK recevied by Client ticks the TX */ + +static LIST_HEAD(mbox_cons); +static DEFINE_MUTEX(con_mutex); + +static int add_to_rbuf(struct mbox_chan *chan, void *mssg) +{ + int idx; + unsigned long flags; + + spin_lock_irqsave(&chan->lock, flags); + + /* See if there is any space left */ + if (chan->msg_count == MBOX_TX_QUEUE_LEN) { + spin_unlock_irqrestore(&chan->lock, flags); + return -ENOBUFS; + } + + idx = chan->msg_free; + chan->msg_data[idx] = mssg; + chan->msg_count++; + + if (idx == MBOX_TX_QUEUE_LEN - 1) + chan->msg_free = 0; + else + chan->msg_free++; + + spin_unlock_irqrestore(&chan->lock, flags); + + return idx; +} + +static void msg_submit(struct mbox_chan *chan) +{ + unsigned count, idx; + unsigned long flags; + void *data; + int err; + + spin_lock_irqsave(&chan->lock, flags); + + if (!chan->msg_count || chan->active_req) + goto exit; + + count = chan->msg_count; + idx = chan->msg_free; + if (idx >= count) + idx -= count; + else + idx += MBOX_TX_QUEUE_LEN - count; + + data = chan->msg_data[idx]; + + /* Try to submit a message to the MBOX controller */ + err = chan->mbox->ops->send_data(chan, data); + if (!err) { + chan->active_req = data; + chan->msg_count--; + } +exit: + spin_unlock_irqrestore(&chan->lock, flags); +} + +static void tx_tick(struct mbox_chan *chan, int r) +{ + unsigned long flags; + void *mssg; + + spin_lock_irqsave(&chan->lock, flags); + mssg = chan->active_req; + chan->active_req = NULL; + spin_unlock_irqrestore(&chan->lock, flags); + + /* Submit next message */ + msg_submit(chan); + + /* Notify the client */ + if (mssg && chan->cl->tx_done) + chan->cl->tx_done(chan->cl, mssg, r); + + if (chan->cl->tx_block) + complete(&chan->tx_complete); +} + +static void poll_txdone(unsigned long data) +{ + struct mbox_controller *mbox = (struct mbox_controller *)data; + bool txdone, resched = false; + int i; + + for (i = 0; i < mbox->num_chans; i++) { + struct mbox_chan *chan = &mbox->chans[i]; + + if (chan->active_req && chan->cl) { + resched = true; + txdone = chan->mbox->ops->last_tx_done(chan); + if (txdone) + tx_tick(chan, 0); + } + } + + if (resched) + mod_timer(&mbox->poll, jiffies + + msecs_to_jiffies(mbox->txpoll_period)); +} + +/** + * mbox_chan_received_data - A way for controller driver to push data + * received from remote to the upper layer. + * @chan: Pointer to the mailbox channel on which RX happened. + * @mssg: Client specific message typecasted as void * + * + * After startup and before shutdown any data received on the chan + * is passed on to the API via atomic mbox_chan_received_data(). + * The controller should ACK the RX only after this call returns. + */ +void mbox_chan_received_data(struct mbox_chan *chan, void *mssg) +{ + /* No buffering the received data */ + if (chan->cl->rx_callback) + chan->cl->rx_callback(chan->cl, mssg); +} +EXPORT_SYMBOL_GPL(mbox_chan_received_data); + +/** + * mbox_chan_txdone - A way for controller driver to notify the + * framework that the last TX has completed. + * @chan: Pointer to the mailbox chan on which TX happened. + * @r: Status of last TX - OK or ERROR + * + * The controller that has IRQ for TX ACK calls this atomic API + * to tick the TX state machine. It works only if txdone_irq + * is set by the controller. + */ +void mbox_chan_txdone(struct mbox_chan *chan, int r) +{ + if (unlikely(!(chan->txdone_method & TXDONE_BY_IRQ))) { + dev_err(chan->mbox->dev, + "Controller can't run the TX ticker\n"); + return; + } + + tx_tick(chan, r); +} +EXPORT_SYMBOL_GPL(mbox_chan_txdone); + +/** + * mbox_client_txdone - The way for a client to run the TX state machine. + * @chan: Mailbox channel assigned to this client. + * @r: Success status of last transmission. + * + * The client/protocol had received some 'ACK' packet and it notifies + * the API that the last packet was sent successfully. This only works + * if the controller can't sense TX-Done. + */ +void mbox_client_txdone(struct mbox_chan *chan, int r) +{ + if (unlikely(!(chan->txdone_method & TXDONE_BY_ACK))) { + dev_err(chan->mbox->dev, "Client can't run the TX ticker\n"); + return; + } + + tx_tick(chan, r); +} +EXPORT_SYMBOL_GPL(mbox_client_txdone); + +/** + * mbox_client_peek_data - A way for client driver to pull data + * received from remote by the controller. + * @chan: Mailbox channel assigned to this client. + * + * A poke to controller driver for any received data. + * The data is actually passed onto client via the + * mbox_chan_received_data() + * The call can be made from atomic context, so the controller's + * implementation of peek_data() must not sleep. + * + * Return: True, if controller has, and is going to push after this, + * some data. + * False, if controller doesn't have any data to be read. + */ +bool mbox_client_peek_data(struct mbox_chan *chan) +{ + if (chan->mbox->ops->peek_data) + return chan->mbox->ops->peek_data(chan); + + return false; +} +EXPORT_SYMBOL_GPL(mbox_client_peek_data); + +/** + * mbox_send_message - For client to submit a message to be + * sent to the remote. + * @chan: Mailbox channel assigned to this client. + * @mssg: Client specific message typecasted. + * + * For client to submit data to the controller destined for a remote + * processor. If the client had set 'tx_block', the call will return + * either when the remote receives the data or when 'tx_tout' millisecs + * run out. + * In non-blocking mode, the requests are buffered by the API and a + * non-negative token is returned for each queued request. If the request + * is not queued, a negative token is returned. Upon failure or successful + * TX, the API calls 'tx_done' from atomic context, from which the client + * could submit yet another request. + * The pointer to message should be preserved until it is sent + * over the chan, i.e, tx_done() is made. + * This function could be called from atomic context as it simply + * queues the data and returns a token against the request. + * + * Return: Non-negative integer for successful submission (non-blocking mode) + * or transmission over chan (blocking mode). + * Negative value denotes failure. + */ +int mbox_send_message(struct mbox_chan *chan, void *mssg) +{ + int t; + + if (!chan || !chan->cl) + return -EINVAL; + + t = add_to_rbuf(chan, mssg); + if (t < 0) { + dev_err(chan->mbox->dev, "Try increasing MBOX_TX_QUEUE_LEN\n"); + return t; + } + + msg_submit(chan); + + if (chan->txdone_method == TXDONE_BY_POLL) + poll_txdone((unsigned long)chan->mbox); + + if (chan->cl->tx_block && chan->active_req) { + unsigned long wait; + int ret; + + if (!chan->cl->tx_tout) /* wait forever */ + wait = msecs_to_jiffies(3600000); + else + wait = msecs_to_jiffies(chan->cl->tx_tout); + + ret = wait_for_completion_timeout(&chan->tx_complete, wait); + if (ret == 0) { + t = -EIO; + tx_tick(chan, -EIO); + } + } + + return t; +} +EXPORT_SYMBOL_GPL(mbox_send_message); + +/** + * mbox_request_channel - Request a mailbox channel. + * @cl: Identity of the client requesting the channel. + * @index: Index of mailbox specifier in 'mboxes' property. + * + * The Client specifies its requirements and capabilities while asking for + * a mailbox channel. It can't be called from atomic context. + * The channel is exclusively allocated and can't be used by another + * client before the owner calls mbox_free_channel. + * After assignment, any packet received on this channel will be + * handed over to the client via the 'rx_callback'. + * The framework holds reference to the client, so the mbox_client + * structure shouldn't be modified until the mbox_free_channel returns. + * + * Return: Pointer to the channel assigned to the client if successful. + * ERR_PTR for request failure. + */ +struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index) +{ + struct device *dev = cl->dev; + struct mbox_controller *mbox; + struct of_phandle_args spec; + struct mbox_chan *chan; + unsigned long flags; + int ret; + + if (!dev || !dev->of_node) { + pr_debug("%s: No owner device node\n", __func__); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&con_mutex); + + if (of_parse_phandle_with_args(dev->of_node, "mboxes", + "#mbox-cells", index, &spec)) { + dev_dbg(dev, "%s: can't parse \"mboxes\" property\n", __func__); + mutex_unlock(&con_mutex); + return ERR_PTR(-ENODEV); + } + + chan = NULL; + list_for_each_entry(mbox, &mbox_cons, node) + if (mbox->dev->of_node == spec.np) { + chan = mbox->of_xlate(mbox, &spec); + break; + } + + of_node_put(spec.np); + + if (!chan || chan->cl || !try_module_get(mbox->dev->driver->owner)) { + dev_dbg(dev, "%s: mailbox not free\n", __func__); + mutex_unlock(&con_mutex); + return ERR_PTR(-EBUSY); + } + + spin_lock_irqsave(&chan->lock, flags); + chan->msg_free = 0; + chan->msg_count = 0; + chan->active_req = NULL; + chan->cl = cl; + init_completion(&chan->tx_complete); + + if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone) + chan->txdone_method |= TXDONE_BY_ACK; + + spin_unlock_irqrestore(&chan->lock, flags); + + ret = chan->mbox->ops->startup(chan); + if (ret) { + dev_err(dev, "Unable to startup the chan (%d)\n", ret); + mbox_free_channel(chan); + chan = ERR_PTR(ret); + } + + mutex_unlock(&con_mutex); + return chan; +} +EXPORT_SYMBOL_GPL(mbox_request_channel); + +/** + * mbox_free_channel - The client relinquishes control of a mailbox + * channel by this call. + * @chan: The mailbox channel to be freed. + */ +void mbox_free_channel(struct mbox_chan *chan) +{ + unsigned long flags; + + if (!chan || !chan->cl) + return; + + chan->mbox->ops->shutdown(chan); + + /* The queued TX requests are simply aborted, no callbacks are made */ + spin_lock_irqsave(&chan->lock, flags); + chan->cl = NULL; + chan->active_req = NULL; + if (chan->txdone_method == (TXDONE_BY_POLL | TXDONE_BY_ACK)) + chan->txdone_method = TXDONE_BY_POLL; + + module_put(chan->mbox->dev->driver->owner); + spin_unlock_irqrestore(&chan->lock, flags); +} +EXPORT_SYMBOL_GPL(mbox_free_channel); + +static struct mbox_chan * +of_mbox_index_xlate(struct mbox_controller *mbox, + const struct of_phandle_args *sp) +{ + int ind = sp->args[0]; + + if (ind >= mbox->num_chans) + return NULL; + + return &mbox->chans[ind]; +} + +/** + * mbox_controller_register - Register the mailbox controller + * @mbox: Pointer to the mailbox controller. + * + * The controller driver registers its communication channels + */ +int mbox_controller_register(struct mbox_controller *mbox) +{ + int i, txdone; + + /* Sanity check */ + if (!mbox || !mbox->dev || !mbox->ops || !mbox->num_chans) + return -EINVAL; + + if (mbox->txdone_irq) + txdone = TXDONE_BY_IRQ; + else if (mbox->txdone_poll) + txdone = TXDONE_BY_POLL; + else /* It has to be ACK then */ + txdone = TXDONE_BY_ACK; + + if (txdone == TXDONE_BY_POLL) { + mbox->poll.function = &poll_txdone; + mbox->poll.data = (unsigned long)mbox; + init_timer(&mbox->poll); + } + + for (i = 0; i < mbox->num_chans; i++) { + struct mbox_chan *chan = &mbox->chans[i]; + + chan->cl = NULL; + chan->mbox = mbox; + chan->txdone_method = txdone; + spin_lock_init(&chan->lock); + } + + if (!mbox->of_xlate) + mbox->of_xlate = of_mbox_index_xlate; + + mutex_lock(&con_mutex); + list_add_tail(&mbox->node, &mbox_cons); + mutex_unlock(&con_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(mbox_controller_register); + +/** + * mbox_controller_unregister - Unregister the mailbox controller + * @mbox: Pointer to the mailbox controller. + */ +void mbox_controller_unregister(struct mbox_controller *mbox) +{ + int i; + + if (!mbox) + return; + + mutex_lock(&con_mutex); + + list_del(&mbox->node); + + for (i = 0; i < mbox->num_chans; i++) + mbox_free_channel(&mbox->chans[i]); + + if (mbox->txdone_poll) + del_timer_sync(&mbox->poll); + + mutex_unlock(&con_mutex); +} +EXPORT_SYMBOL_GPL(mbox_controller_unregister); diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h new file mode 100644 index 000000000000..307d9cab2026 --- /dev/null +++ b/include/linux/mailbox_client.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2013-2014 Linaro Ltd. + * Author: Jassi Brar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAILBOX_CLIENT_H +#define __MAILBOX_CLIENT_H + +#include +#include + +struct mbox_chan; + +/** + * struct mbox_client - User of a mailbox + * @dev: The client device + * @tx_block: If the mbox_send_message should block until data is + * transmitted. + * @tx_tout: Max block period in ms before TX is assumed failure + * @knows_txdone: If the client could run the TX state machine. Usually + * if the client receives some ACK packet for transmission. + * Unused if the controller already has TX_Done/RTR IRQ. + * @rx_callback: Atomic callback to provide client the data received + * @tx_done: Atomic callback to tell client of data transmission + */ +struct mbox_client { + struct device *dev; + bool tx_block; + unsigned long tx_tout; + bool knows_txdone; + + void (*rx_callback)(struct mbox_client *cl, void *mssg); + void (*tx_done)(struct mbox_client *cl, void *mssg, int r); +}; + +struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index); +int mbox_send_message(struct mbox_chan *chan, void *mssg); +void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ +bool mbox_client_peek_data(struct mbox_chan *chan); /* atomic */ +void mbox_free_channel(struct mbox_chan *chan); /* may sleep */ + +#endif /* __MAILBOX_CLIENT_H */ diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h new file mode 100644 index 000000000000..d4cf96f07cfc --- /dev/null +++ b/include/linux/mailbox_controller.h @@ -0,0 +1,133 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAILBOX_CONTROLLER_H +#define __MAILBOX_CONTROLLER_H + +#include +#include +#include +#include +#include + +struct mbox_chan; + +/** + * struct mbox_chan_ops - methods to control mailbox channels + * @send_data: The API asks the MBOX controller driver, in atomic + * context try to transmit a message on the bus. Returns 0 if + * data is accepted for transmission, -EBUSY while rejecting + * if the remote hasn't yet read the last data sent. Actual + * transmission of data is reported by the controller via + * mbox_chan_txdone (if it has some TX ACK irq). It must not + * sleep. + * @startup: Called when a client requests the chan. The controller + * could ask clients for additional parameters of communication + * to be provided via client's chan_data. This call may + * block. After this call the Controller must forward any + * data received on the chan by calling mbox_chan_received_data. + * The controller may do stuff that need to sleep. + * @shutdown: Called when a client relinquishes control of a chan. + * This call may block too. The controller must not forward + * any received data anymore. + * The controller may do stuff that need to sleep. + * @last_tx_done: If the controller sets 'txdone_poll', the API calls + * this to poll status of last TX. The controller must + * give priority to IRQ method over polling and never + * set both txdone_poll and txdone_irq. Only in polling + * mode 'send_data' is expected to return -EBUSY. + * The controller may do stuff that need to sleep/block. + * Used only if txdone_poll:=true && txdone_irq:=false + * @peek_data: Atomic check for any received data. Return true if controller + * has some data to push to the client. False otherwise. + */ +struct mbox_chan_ops { + int (*send_data)(struct mbox_chan *chan, void *data); + int (*startup)(struct mbox_chan *chan); + void (*shutdown)(struct mbox_chan *chan); + bool (*last_tx_done)(struct mbox_chan *chan); + bool (*peek_data)(struct mbox_chan *chan); +}; + +/** + * struct mbox_controller - Controller of a class of communication channels + * @dev: Device backing this controller + * @ops: Operators that work on each communication chan + * @chans: Array of channels + * @num_chans: Number of channels in the 'chans' array. + * @txdone_irq: Indicates if the controller can report to API when + * the last transmitted data was read by the remote. + * Eg, if it has some TX ACK irq. + * @txdone_poll: If the controller can read but not report the TX + * done. Ex, some register shows the TX status but + * no interrupt rises. Ignored if 'txdone_irq' is set. + * @txpoll_period: If 'txdone_poll' is in effect, the API polls for + * last TX's status after these many millisecs + * @of_xlate: Controller driver specific mapping of channel via DT + * @poll: API private. Used to poll for TXDONE on all channels. + * @node: API private. To hook into list of controllers. + */ +struct mbox_controller { + struct device *dev; + struct mbox_chan_ops *ops; + struct mbox_chan *chans; + int num_chans; + bool txdone_irq; + bool txdone_poll; + unsigned txpoll_period; + struct mbox_chan *(*of_xlate)(struct mbox_controller *mbox, + const struct of_phandle_args *sp); + /* Internal to API */ + struct timer_list poll; + struct list_head node; +}; + +/* + * The length of circular buffer for queuing messages from a client. + * 'msg_count' tracks the number of buffered messages while 'msg_free' + * is the index where the next message would be buffered. + * We shouldn't need it too big because every transfer is interrupt + * triggered and if we have lots of data to transfer, the interrupt + * latencies are going to be the bottleneck, not the buffer length. + * Besides, mbox_send_message could be called from atomic context and + * the client could also queue another message from the notifier 'tx_done' + * of the last transfer done. + * REVISIT: If too many platforms see the "Try increasing MBOX_TX_QUEUE_LEN" + * print, it needs to be taken from config option or somesuch. + */ +#define MBOX_TX_QUEUE_LEN 20 + +/** + * struct mbox_chan - s/w representation of a communication chan + * @mbox: Pointer to the parent/provider of this channel + * @txdone_method: Way to detect TXDone chosen by the API + * @cl: Pointer to the current owner of this channel + * @tx_complete: Transmission completion + * @active_req: Currently active request hook + * @msg_count: No. of mssg currently queued + * @msg_free: Index of next available mssg slot + * @msg_data: Hook for data packet + * @lock: Serialise access to the channel + * @con_priv: Hook for controller driver to attach private data + */ +struct mbox_chan { + struct mbox_controller *mbox; + unsigned txdone_method; + struct mbox_client *cl; + struct completion tx_complete; + void *active_req; + unsigned msg_count, msg_free; + void *msg_data[MBOX_TX_QUEUE_LEN]; + spinlock_t lock; /* Serialise access to the channel */ + void *con_priv; +}; + +int mbox_controller_register(struct mbox_controller *mbox); /* can sleep */ +void mbox_controller_unregister(struct mbox_controller *mbox); /* can sleep */ +void mbox_chan_received_data(struct mbox_chan *chan, void *data); /* atomic */ +void mbox_chan_txdone(struct mbox_chan *chan, int r); /* atomic */ + +#endif /* __MAILBOX_CONTROLLER_H */ -- cgit v1.2.3-59-g8ed1b From 083bf668cb70e47b84db64856606e94beac87f01 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 14 Mar 2014 14:06:25 +0800 Subject: ACPI: make acpi_create_platform_device() an external API Signed-off-by: Zhang Rui --- drivers/acpi/acpi_platform.c | 1 + drivers/acpi/internal.h | 7 ------- include/linux/acpi.h | 1 + 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 2bf9082f7523..a3c89a1bcf54 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -113,3 +113,4 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) kfree(resources); return pdev; } +EXPORT_SYMBOL_GPL(acpi_create_platform_device); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index de47f9f746c9..f221d1eb594a 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -168,13 +168,6 @@ static inline int suspend_nvs_save(void) { return 0; } static inline void suspend_nvs_restore(void) {} #endif -/*-------------------------------------------------------------------------- - Platform bus support - -------------------------------------------------------------------------- */ -struct platform_device; - -struct platform_device *acpi_create_platform_device(struct acpi_device *adev); - /*-------------------------------------------------------------------------- Video -------------------------------------------------------------------------- */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 807cbc46d73e..2c24c2c1be45 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -432,6 +432,7 @@ static inline bool acpi_driver_match_device(struct device *dev, int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); +struct platform_device *acpi_create_platform_device(struct acpi_device *); #define ACPI_PTR(_ptr) (_ptr) #else /* !CONFIG_ACPI */ -- cgit v1.2.3-59-g8ed1b From 7b83fd9d91a411158f72d36958103c708c3b5a86 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 25 Mar 2014 10:40:09 +0800 Subject: Thermal: move the KELVIN_TO_MILLICELSIUS macro to thermal.h This macro can be used by other component so move it to a common header, but in a slightly different way: define two macros, one macro with an offset and the other doesn't. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/thermal.c | 18 +++++++++--------- include/linux/thermal.h | 2 ++ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 112817e963e0..d24fa1964eb8 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -528,7 +528,6 @@ static void acpi_thermal_check(void *data) } /* sys I/F for generic thermal sysfs support */ -#define KELVIN_TO_MILLICELSIUS(t, off) (((t) - (off)) * 100) static int thermal_get_temp(struct thermal_zone_device *thermal, unsigned long *temp) @@ -543,7 +542,8 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, if (result) return result; - *temp = KELVIN_TO_MILLICELSIUS(tz->temperature, tz->kelvin_offset); + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(tz->temperature, + tz->kelvin_offset); return 0; } @@ -647,7 +647,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.critical.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.critical.temperature, tz->kelvin_offset); return 0; @@ -657,7 +657,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.hot.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.hot.temperature, tz->kelvin_offset); return 0; @@ -667,7 +667,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.passive.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.passive.temperature, tz->kelvin_offset); return 0; @@ -678,7 +678,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.active[i].temperature, tz->kelvin_offset); return 0; @@ -694,7 +694,7 @@ static int thermal_get_crit_temp(struct thermal_zone_device *thermal, struct acpi_thermal *tz = thermal->devdata; if (tz->trips.critical.flags.valid) { - *temperature = KELVIN_TO_MILLICELSIUS( + *temperature = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.critical.temperature, tz->kelvin_offset); return 0; @@ -714,8 +714,8 @@ static int thermal_get_trend(struct thermal_zone_device *thermal, if (type == THERMAL_TRIP_ACTIVE) { unsigned long trip_temp; - unsigned long temp = KELVIN_TO_MILLICELSIUS(tz->temperature, - tz->kelvin_offset); + unsigned long temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( + tz->temperature, tz->kelvin_offset); if (thermal_get_trip_temp(thermal, trip, &trip_temp)) return -EINVAL; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0305cde21a74..79ce6b94884a 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -44,6 +44,8 @@ #define KELVIN_TO_CELSIUS(t) (long)(((long)t-2732 >= 0) ? \ ((long)t-2732+5)/10 : ((long)t-2732-5)/10) #define CELSIUS_TO_KELVIN(t) ((t)*10+2732) +#define DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, off) (((t) - (off)) * 100) +#define DECI_KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, 2732) /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" -- cgit v1.2.3-59-g8ed1b From 77e337c6e23e3b9d22e09ffec202a80f755a54c2 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 3 Sep 2014 15:13:02 +0800 Subject: Thermal: introduce INT3402 thermal driver ACPI INT3402 device object could report temperature for the memory module. To expose such information to user space, a thermal zone device is registered for it so that the thermal sysfs interface can expose such information for userspace to use. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/thermal/int340x_thermal/Makefile | 1 + drivers/thermal/int340x_thermal/int3402_thermal.c | 242 ++++++++++++++++++++++ include/linux/thermal.h | 2 + 3 files changed, 245 insertions(+) create mode 100644 drivers/thermal/int340x_thermal/int3402_thermal.c (limited to 'include/linux') diff --git a/drivers/thermal/int340x_thermal/Makefile b/drivers/thermal/int340x_thermal/Makefile index e10a53bcefe7..67c98fd24200 100644 --- a/drivers/thermal/int340x_thermal/Makefile +++ b/drivers/thermal/int340x_thermal/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_INT340X_THERMAL) += int3400_thermal.o +obj-$(CONFIG_INT340X_THERMAL) += int3402_thermal.o diff --git a/drivers/thermal/int340x_thermal/int3402_thermal.c b/drivers/thermal/int340x_thermal/int3402_thermal.c new file mode 100644 index 000000000000..a5d08c14ba24 --- /dev/null +++ b/drivers/thermal/int340x_thermal/int3402_thermal.c @@ -0,0 +1,242 @@ +/* + * INT3402 thermal driver for memory temperature reporting + * + * Copyright (C) 2014, Intel Corporation + * Authors: Aaron Lu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include + +#define ACPI_ACTIVE_COOLING_MAX_NR 10 + +struct active_trip { + unsigned long temp; + int id; + bool valid; +}; + +struct int3402_thermal_data { + unsigned long *aux_trips; + int aux_trip_nr; + unsigned long psv_temp; + int psv_trip_id; + unsigned long crt_temp; + int crt_trip_id; + unsigned long hot_temp; + int hot_trip_id; + struct active_trip act_trips[ACPI_ACTIVE_COOLING_MAX_NR]; + acpi_handle *handle; +}; + +static int int3402_thermal_get_zone_temp(struct thermal_zone_device *zone, + unsigned long *temp) +{ + struct int3402_thermal_data *d = zone->devdata; + unsigned long long tmp; + acpi_status status; + + status = acpi_evaluate_integer(d->handle, "_TMP", NULL, &tmp); + if (ACPI_FAILURE(status)) + return -ENODEV; + + /* _TMP returns the temperature in tenths of degrees Kelvin */ + *temp = DECI_KELVIN_TO_MILLICELSIUS(tmp); + + return 0; +} + +static int int3402_thermal_get_trip_temp(struct thermal_zone_device *zone, + int trip, unsigned long *temp) +{ + struct int3402_thermal_data *d = zone->devdata; + int i; + + if (trip < d->aux_trip_nr) + *temp = d->aux_trips[trip]; + else if (trip == d->crt_trip_id) + *temp = d->crt_temp; + else if (trip == d->psv_trip_id) + *temp = d->psv_temp; + else if (trip == d->hot_trip_id) + *temp = d->hot_temp; + else { + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { + *temp = d->act_trips[i].temp; + break; + } + } + if (i == ACPI_ACTIVE_COOLING_MAX_NR) + return -EINVAL; + } + return 0; +} + +static int int3402_thermal_get_trip_type(struct thermal_zone_device *zone, + int trip, enum thermal_trip_type *type) +{ + struct int3402_thermal_data *d = zone->devdata; + int i; + + if (trip < d->aux_trip_nr) + *type = THERMAL_TRIP_PASSIVE; + else if (trip == d->crt_trip_id) + *type = THERMAL_TRIP_CRITICAL; + else if (trip == d->hot_trip_id) + *type = THERMAL_TRIP_HOT; + else if (trip == d->psv_trip_id) + *type = THERMAL_TRIP_PASSIVE; + else { + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { + *type = THERMAL_TRIP_ACTIVE; + break; + } + } + if (i == ACPI_ACTIVE_COOLING_MAX_NR) + return -EINVAL; + } + return 0; +} + +static int int3402_thermal_set_trip_temp(struct thermal_zone_device *zone, int trip, + unsigned long temp) +{ + struct int3402_thermal_data *d = zone->devdata; + acpi_status status; + char name[10]; + + snprintf(name, sizeof(name), "PAT%d", trip); + status = acpi_execute_simple_method(d->handle, name, + MILLICELSIUS_TO_DECI_KELVIN(temp)); + if (ACPI_FAILURE(status)) + return -EIO; + + d->aux_trips[trip] = temp; + return 0; +} + +static struct thermal_zone_device_ops int3402_thermal_zone_ops = { + .get_temp = int3402_thermal_get_zone_temp, + .get_trip_temp = int3402_thermal_get_trip_temp, + .get_trip_type = int3402_thermal_get_trip_type, + .set_trip_temp = int3402_thermal_set_trip_temp, +}; + +static struct thermal_zone_params int3402_thermal_params = { + .governor_name = "user_space", + .no_hwmon = true, +}; + +static int int3402_thermal_get_temp(acpi_handle handle, char *name, + unsigned long *temp) +{ + unsigned long long r; + acpi_status status; + + status = acpi_evaluate_integer(handle, name, NULL, &r); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLICELSIUS(r); + return 0; +} + +static int int3402_thermal_probe(struct platform_device *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + struct int3402_thermal_data *d; + struct thermal_zone_device *zone; + acpi_status status; + unsigned long long trip_cnt; + int trip_mask = 0, i; + + if (!acpi_has_method(adev->handle, "_TMP")) + return -ENODEV; + + d = devm_kzalloc(&pdev->dev, sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + status = acpi_evaluate_integer(adev->handle, "PATC", NULL, &trip_cnt); + if (ACPI_FAILURE(status)) + trip_cnt = 0; + else { + d->aux_trips = devm_kzalloc(&pdev->dev, + sizeof(*d->aux_trips) * trip_cnt, GFP_KERNEL); + if (!d->aux_trips) + return -ENOMEM; + trip_mask = trip_cnt - 1; + d->handle = adev->handle; + d->aux_trip_nr = trip_cnt; + } + + d->crt_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_CRT", &d->crt_temp)) + d->crt_trip_id = trip_cnt++; + d->hot_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_HOT", &d->hot_temp)) + d->hot_trip_id = trip_cnt++; + d->psv_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_PSV", &d->psv_temp)) + d->psv_trip_id = trip_cnt++; + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + char name[5] = { '_', 'A', 'C', '0' + i, '\0' }; + if (int3402_thermal_get_temp(adev->handle, name, + &d->act_trips[i].temp)) + break; + d->act_trips[i].id = trip_cnt++; + d->act_trips[i].valid = true; + } + + zone = thermal_zone_device_register(acpi_device_bid(adev), trip_cnt, + trip_mask, d, + &int3402_thermal_zone_ops, + &int3402_thermal_params, + 0, 0); + if (IS_ERR(zone)) + return PTR_ERR(zone); + platform_set_drvdata(pdev, zone); + + return 0; +} + +static int int3402_thermal_remove(struct platform_device *pdev) +{ + struct thermal_zone_device *zone = platform_get_drvdata(pdev); + + thermal_zone_device_unregister(zone); + return 0; +} + +static const struct acpi_device_id int3402_thermal_match[] = { + {"INT3402", 0}, + {} +}; + +MODULE_DEVICE_TABLE(acpi, int3402_thermal_match); + +static struct platform_driver int3402_thermal_driver = { + .probe = int3402_thermal_probe, + .remove = int3402_thermal_remove, + .driver = { + .name = "int3402 thermal", + .owner = THIS_MODULE, + .acpi_match_table = int3402_thermal_match, + }, +}; + +module_platform_driver(int3402_thermal_driver); + +MODULE_DESCRIPTION("INT3402 Thermal driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 79ce6b94884a..ef90838b36a0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -46,6 +46,8 @@ #define CELSIUS_TO_KELVIN(t) ((t)*10+2732) #define DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, off) (((t) - (off)) * 100) #define DECI_KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, 2732) +#define MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, off) (((t) / 100) + (off)) +#define MILLICELSIUS_TO_DECI_KELVIN(t) MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, 2732) /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" -- cgit v1.2.3-59-g8ed1b From 174e964ec224c3c591b83a6b5f0984d905d3678f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 9 Oct 2014 12:43:27 -0700 Subject: regulator: Include err.h from consumer.h to fix build failure sh:sh2007_defconfig fails to build with the following error: In file included from include/linux/regulator/machine.h:18:0, from arch/sh/boards/board-sh2007.c:10: include/linux/regulator/consumer.h: In function 'regulator_get_optional': include/linux/regulator/consumer.h:271:2: error: implicit declaration of function 'ERR_PTR' include/linux/err.h: At top level: include/linux/err.h:23:35: error: conflicting types for 'ERR_PTR' include/linux/regulator/consumer.h:271:9: note: previous implicit declaration of 'ERR_PTR' was here Since consumer.h uses ERR_PTR, it should include err.h. Signed-off-by: Guenter Roeck Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index d347c805f923..f540b1496e2f 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -35,6 +35,8 @@ #ifndef __LINUX_REGULATOR_CONSUMER_H_ #define __LINUX_REGULATOR_CONSUMER_H_ +#include + struct device; struct notifier_block; struct regmap; -- cgit v1.2.3-59-g8ed1b From d4c5efdb97773f59a2b711754ca0953f24516739 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 26 Aug 2014 23:16:35 -0400 Subject: random: add and use memzero_explicit() for clearing data zatimend has reported that in his environment (3.16/gcc4.8.3/corei7) memset() calls which clear out sensitive data in extract_{buf,entropy, entropy_user}() in random driver are being optimized away by gcc. Add a helper memzero_explicit() (similarly as explicit_bzero() variants) that can be used in such cases where a variable with sensitive data is being cleared out in the end. Other use cases might also be in crypto code. [ I have put this into lib/string.c though, as it's always built-in and doesn't need any dependencies then. ] Fixes kernel bugzilla: 82041 Reported-by: zatimend@hotmail.co.uk Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Cc: Alexey Dobriyan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- drivers/char/random.c | 8 ++++---- include/linux/string.h | 5 +++-- lib/string.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/random.c b/drivers/char/random.c index c18d41db83d8..8c86a95203a0 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1106,7 +1106,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) __mix_pool_bytes(r, hash.w, sizeof(hash.w)); spin_unlock_irqrestore(&r->lock, flags); - memset(workspace, 0, sizeof(workspace)); + memzero_explicit(workspace, sizeof(workspace)); /* * In case the hash function has some recognizable output @@ -1118,7 +1118,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) hash.w[2] ^= rol32(hash.w[2], 16); memcpy(out, &hash, EXTRACT_SIZE); - memset(&hash, 0, sizeof(hash)); + memzero_explicit(&hash, sizeof(hash)); } /* @@ -1175,7 +1175,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } @@ -1218,7 +1218,7 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } diff --git a/include/linux/string.h b/include/linux/string.h index d36977e029af..3b42b3732da6 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -132,7 +132,7 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, - const void *from, size_t available); + const void *from, size_t available); /** * strstarts - does @str start with @prefix? @@ -144,7 +144,8 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } -extern size_t memweight(const void *ptr, size_t bytes); +size_t memweight(const void *ptr, size_t bytes); +void memzero_explicit(void *s, size_t count); /** * kbasename - return the last part of a pathname. diff --git a/lib/string.c b/lib/string.c index 992bf30af759..3a3120452a1d 100644 --- a/lib/string.c +++ b/lib/string.c @@ -604,6 +604,22 @@ void *memset(void *s, int c, size_t count) EXPORT_SYMBOL(memset); #endif +/** + * memzero_explicit - Fill a region of memory (e.g. sensitive + * keying data) with 0s. + * @s: Pointer to the start of the area. + * @count: The size of the area. + * + * memzero_explicit() doesn't need an arch-specific version as + * it just invokes the one of memset() implicitly. + */ +void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + OPTIMIZER_HIDE_VAR(s); +} +EXPORT_SYMBOL(memzero_explicit); + #ifndef __HAVE_ARCH_MEMCPY /** * memcpy - Copy one area of memory to another -- cgit v1.2.3-59-g8ed1b From 70f3ce0510afdad7cbaf27ab7ab961377205c782 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 29 Sep 2014 11:47:54 +0200 Subject: mtd: spi-nor: make spi_nor_scan() take a chip type name, not spi_device_id MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers currently call spi_nor_match_id() and then spi_nor_scan(). This adds a dependency on struct spi_device_id which we want to avoid. Make spi_nor_scan() do it for them. Signed-off-by: Ben Hutchings Signed-off-by: Rafał Miłecki Signed-off-by: Brian Norris --- drivers/mtd/devices/m25p80.c | 4 +--- drivers/mtd/spi-nor/fsl-quadspi.c | 7 +------ drivers/mtd/spi-nor/spi-nor.c | 13 +++++++++---- include/linux/mtd/spi-nor.h | 20 +++----------------- 4 files changed, 14 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 822209d10689..bd5e4c6edfd4 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -193,7 +193,6 @@ static int m25p_probe(struct spi_device *spi) { struct mtd_part_parser_data ppdata; struct flash_platform_data *data; - const struct spi_device_id *id = NULL; struct m25p *flash; struct spi_nor *nor; enum read_mode mode = SPI_NOR_NORMAL; @@ -241,8 +240,7 @@ static int m25p_probe(struct spi_device *spi) else flash_name = spi->modalias; - id = spi_nor_match_id(flash_name); - ret = spi_nor_scan(nor, id, mode); + ret = spi_nor_scan(nor, flash_name, mode); if (ret) return ret; diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c index 8d659a2888d5..d5269a26c839 100644 --- a/drivers/mtd/spi-nor/fsl-quadspi.c +++ b/drivers/mtd/spi-nor/fsl-quadspi.c @@ -881,7 +881,6 @@ static int fsl_qspi_probe(struct platform_device *pdev) /* iterate the subnodes. */ for_each_available_child_of_node(dev->of_node, np) { - const struct spi_device_id *id; char modalias[40]; /* skip the holes */ @@ -909,10 +908,6 @@ static int fsl_qspi_probe(struct platform_device *pdev) if (of_modalias_node(np, modalias, sizeof(modalias)) < 0) goto map_failed; - id = spi_nor_match_id(modalias); - if (!id) - goto map_failed; - ret = of_property_read_u32(np, "spi-max-frequency", &q->clk_rate); if (ret < 0) @@ -921,7 +916,7 @@ static int fsl_qspi_probe(struct platform_device *pdev) /* set the chip address for READID */ fsl_qspi_set_base_addr(q, nor); - ret = spi_nor_scan(nor, id, SPI_NOR_QUAD); + ret = spi_nor_scan(nor, modalias, SPI_NOR_QUAD); if (ret) goto map_failed; diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index ae16aa2f6885..5c8e39977bc5 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -28,6 +28,8 @@ #define JEDEC_MFR(_jedec_id) ((_jedec_id) >> 16) +static const struct spi_device_id *spi_nor_match_id(const char *name); + /* * Read the status register, returning its value in the location * Return the status register value. @@ -911,9 +913,9 @@ static int spi_nor_check(struct spi_nor *nor) return 0; } -int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, - enum read_mode mode) +int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode) { + const struct spi_device_id *id = NULL; struct flash_info *info; struct device *dev = nor->dev; struct mtd_info *mtd = nor->mtd; @@ -925,6 +927,10 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, if (ret) return ret; + id = spi_nor_match_id(name); + if (!id) + return -ENOENT; + info = (void *)id->driver_data; if (info->jedec_id) { @@ -1113,7 +1119,7 @@ int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, } EXPORT_SYMBOL_GPL(spi_nor_scan); -const struct spi_device_id *spi_nor_match_id(char *name) +static const struct spi_device_id *spi_nor_match_id(const char *name) { const struct spi_device_id *id = spi_nor_ids; @@ -1124,7 +1130,6 @@ const struct spi_device_id *spi_nor_match_id(char *name) } return NULL; } -EXPORT_SYMBOL_GPL(spi_nor_match_id); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Huang Shijie "); diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index 9e6294f32ba8..a5a7a086748d 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -187,32 +187,18 @@ struct spi_nor { /** * spi_nor_scan() - scan the SPI NOR * @nor: the spi_nor structure - * @id: the spi_device_id provided by the driver + * @name: the chip type name * @mode: the read mode supported by the driver * * The drivers can use this fuction to scan the SPI NOR. * In the scanning, it will try to get all the necessary information to * fill the mtd_info{} and the spi_nor{}. * - * The board may assigns a spi_device_id with @id which be used to compared with - * the spi_device_id detected by the scanning. + * The chip type name can be provided through the @name parameter. * * Return: 0 for success, others for failure. */ -int spi_nor_scan(struct spi_nor *nor, const struct spi_device_id *id, - enum read_mode mode); +int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode); extern const struct spi_device_id spi_nor_ids[]; -/** - * spi_nor_match_id() - find the spi_device_id by the name - * @name: the name of the spi_device_id - * - * The drivers use this function to find the spi_device_id - * specified by the @name. - * - * Return: returns the right spi_device_id pointer on success, - * and returns NULL on failure. - */ -const struct spi_device_id *spi_nor_match_id(char *name); - #endif -- cgit v1.2.3-59-g8ed1b From aa281ac631008b9c18c405c8880007789f659c7d Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 19 Oct 2014 19:38:58 +0300 Subject: Boaz Harrosh - Fix broken email address I no longer have access to the Panasas email. So change to an email that can always reach me. Signed-off-by: Boaz Harrosh --- drivers/scsi/osd/Kbuild | 2 +- drivers/scsi/osd/Kconfig | 2 +- drivers/scsi/osd/osd_debug.h | 2 +- drivers/scsi/osd/osd_initiator.c | 4 ++-- drivers/scsi/osd/osd_uld.c | 4 ++-- fs/exofs/Kbuild | 2 +- fs/exofs/common.h | 2 +- fs/exofs/dir.c | 2 +- fs/exofs/exofs.h | 2 +- fs/exofs/file.c | 2 +- fs/exofs/inode.c | 2 +- fs/exofs/namei.c | 2 +- fs/exofs/ore.c | 4 ++-- fs/exofs/ore_raid.c | 2 +- fs/exofs/ore_raid.h | 2 +- fs/exofs/super.c | 2 +- fs/exofs/symlink.c | 2 +- fs/exofs/sys.c | 2 +- fs/nfs/objlayout/objio_osd.c | 2 +- fs/nfs/objlayout/objlayout.c | 2 +- fs/nfs/objlayout/objlayout.h | 2 +- fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 2 +- include/linux/pnfs_osd_xdr.h | 2 +- include/scsi/osd_initiator.h | 2 +- include/scsi/osd_ore.h | 2 +- include/scsi/osd_protocol.h | 4 ++-- include/scsi/osd_sec.h | 2 +- include/scsi/osd_sense.h | 2 +- include/scsi/osd_types.h | 2 +- 29 files changed, 33 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/osd/Kbuild b/drivers/scsi/osd/Kbuild index 5fd73d77c3af..58cecd45b0f5 100644 --- a/drivers/scsi/osd/Kbuild +++ b/drivers/scsi/osd/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # Benny Halevy # # This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/Kconfig b/drivers/scsi/osd/Kconfig index a0703514eb0f..347cc5e33749 100644 --- a/drivers/scsi/osd/Kconfig +++ b/drivers/scsi/osd/Kconfig @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # Benny Halevy # # This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/osd_debug.h b/drivers/scsi/osd/osd_debug.h index 579e491f11df..26341261bb5c 100644 --- a/drivers/scsi/osd/osd_debug.h +++ b/drivers/scsi/osd/osd_debug.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 5f4cbf0c4759..52e243b77000 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -7,7 +7,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -57,7 +57,7 @@ enum { OSD_REQ_RETRIES = 1 }; -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("open-osd initiator library libosd.ko"); MODULE_LICENSE("GPL"); diff --git a/drivers/scsi/osd/osd_uld.c b/drivers/scsi/osd/osd_uld.c index e1d9a4c4c4b3..92cdd4b06526 100644 --- a/drivers/scsi/osd/osd_uld.c +++ b/drivers/scsi/osd/osd_uld.c @@ -10,7 +10,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -74,7 +74,7 @@ static const char osd_name[] = "osd"; static const char *osd_version_string = "open-osd 0.2.1"; -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("open-osd Upper-Layer-Driver osd.ko"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CHARDEV_MAJOR(SCSI_OSD_MAJOR); diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 389ba8312d5d..b47c7b8dc275 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 diff --git a/fs/exofs/common.h b/fs/exofs/common.h index 3bbd46956d77..7d88ef566213 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -4,7 +4,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 49f51ab4caac..d7defd557601 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index fffe86fd7a42..ad9cac670a47 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 71bf8e4fb5d4..1a376b42d305 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 3f9cafd73931..f1d3d4eb8c4f 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 4731fd991efe..28907460e8fa 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index cfc0205d62c4..7bd8ac8dfb28 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of exofs. * @@ -29,7 +29,7 @@ #include "ore_raid.h" -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 84529b8a331b..27cbdb697649 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index cf6375d82129..a6e746775570 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -1,6 +1,6 @@ /* * Copyright (C) from 2011 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/super.c b/fs/exofs/super.c index ed73ed8ebbee..95965503afcb 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 4dd687c3e747..832e2624b80b 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 1b4f2f95fc37..5e6a2c0a1f0b 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2012 * Sachin Bhamare - * Boaz Harrosh + * Boaz Harrosh * * This file is part of exofs. * diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index ae05278b3761..5cbd8cf9a5a5 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 697a16d11fac..ad988428162e 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index fd13f1d2f136..3472e8390ba5 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -6,7 +6,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index b3918f7ac34d..f093c7ec983b 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h index fe25876c1a5d..17d7d0d20eca 100644 --- a/include/linux/pnfs_osd_xdr.h +++ b/include/linux/pnfs_osd_xdr.h @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h index b2e85fdd2ae0..a09cca829082 100644 --- a/include/scsi/osd_initiator.h +++ b/include/scsi/osd_initiator.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 6ca3265a4dca..7a8d2cd30328 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh + * Boaz Harrosh * * Public Declarations of the ORE API * diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h index a2594afe05c7..e0ca835e7bf7 100644 --- a/include/scsi/osd_protocol.h +++ b/include/scsi/osd_protocol.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -496,7 +496,7 @@ struct osd_timestamp { */ struct osd_key_identifier { - u8 id[7]; /* if you know why 7 please email bharrosh@panasas.com */ + u8 id[7]; /* if you know why 7 please email ooo@electrozaur.com */ } __packed; /* for osd_capability.format */ diff --git a/include/scsi/osd_sec.h b/include/scsi/osd_sec.h index f96151c9c9e8..7abeb0f0db30 100644 --- a/include/scsi/osd_sec.h +++ b/include/scsi/osd_sec.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_sense.h b/include/scsi/osd_sense.h index 91db543a5502..d52aa93a0b2d 100644 --- a/include/scsi/osd_sense.h +++ b/include/scsi/osd_sense.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_types.h b/include/scsi/osd_types.h index bd0be7ed4bcf..48e8a165e136 100644 --- a/include/scsi/osd_types.h +++ b/include/scsi/osd_types.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify -- cgit v1.2.3-59-g8ed1b From 4846e3784585173f48e267b76f968bcb4a12d3b2 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 9 Sep 2014 22:18:31 +0200 Subject: watchdog: simplify definitions of WATCHDOG_NOWAYOUT(_INIT_STATUS)? Signed-off-by: Uwe Kleine-K=C3=B6nig Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 2a3038ee17a3..395b70e0eccf 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -97,13 +97,8 @@ struct watchdog_device { #define WDOG_UNREGISTERED 4 /* Has the device been unregistered */ }; -#ifdef CONFIG_WATCHDOG_NOWAYOUT -#define WATCHDOG_NOWAYOUT 1 -#define WATCHDOG_NOWAYOUT_INIT_STATUS (1 << WDOG_NO_WAY_OUT) -#else -#define WATCHDOG_NOWAYOUT 0 -#define WATCHDOG_NOWAYOUT_INIT_STATUS 0 -#endif +#define WATCHDOG_NOWAYOUT IS_BUILTIN(CONFIG_WATCHDOG_NOWAYOUT) +#define WATCHDOG_NOWAYOUT_INIT_STATUS (WATCHDOG_NOWAYOUT << WDOG_NO_WAY_OUT) /* Use the following function to check whether or not the watchdog is active */ static inline bool watchdog_active(struct watchdog_device *wdd) -- cgit v1.2.3-59-g8ed1b From 51315cdfa0521fff3059cec5fb8ffecc7f37cba7 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 19 Oct 2014 11:30:27 +0200 Subject: cpufreq: allow driver-specific data This commit extends the cpufreq_driver structure with an additional 'void *driver_data' field that can be filled by the ->probe() function of a cpufreq driver to pass additional custom information to the driver itself. A new function called cpufreq_get_driver_data() is added to allow a cpufreq driver to retrieve those driver data, since they are typically needed from a cpufreq_policy->init() callback, which does not have access to the cpufreq_driver structure. This function call is similar to the existing cpufreq_get_current_driver() function call. Signed-off-by: Thomas Petazzoni Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 15 +++++++++++++++ include/linux/cpufreq.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 24bf76fba141..058d6e084a6d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1731,6 +1731,21 @@ const char *cpufreq_get_current_driver(void) } EXPORT_SYMBOL_GPL(cpufreq_get_current_driver); +/** + * cpufreq_get_driver_data - return current driver data + * + * Return the private data of the currently loaded cpufreq + * driver, or NULL if no cpufreq driver is loaded. + */ +void *cpufreq_get_driver_data(void) +{ + if (cpufreq_driver) + return cpufreq_driver->driver_data; + + return NULL; +} +EXPORT_SYMBOL_GPL(cpufreq_get_driver_data); + /********************************************************************* * NOTIFIER LISTS INTERFACE * *********************************************************************/ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 138336b6bb04..503b085b7832 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -219,6 +219,7 @@ __ATTR(_name, 0644, show_##_name, store_##_name) struct cpufreq_driver { char name[CPUFREQ_NAME_LEN]; u8 flags; + void *driver_data; /* needed by all drivers */ int (*init) (struct cpufreq_policy *policy); @@ -312,6 +313,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); const char *cpufreq_get_current_driver(void); +void *cpufreq_get_driver_data(void); static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max) -- cgit v1.2.3-59-g8ed1b From 34e5a5273d6aa0ee8836bd5d6111b135ffae6931 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 19 Oct 2014 11:30:28 +0200 Subject: cpufreq: cpufreq-dt: extend with platform_data This commit extends the cpufreq-dt driver to take a platform_data structure. This structure is for now used to tell the cpufreq-dt driver the layout of the clocks on the platform, i.e whether all CPUs share the same clock or whether each CPU has a separate clock. Signed-off-by: Thomas Petazzoni Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 17 +++++++++++++++-- include/linux/cpufreq-dt.h | 22 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 include/linux/cpufreq-dt.h (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 6bbb8b913446..52facdaf531e 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -178,6 +179,7 @@ try_again: static int cpufreq_init(struct cpufreq_policy *policy) { + struct cpufreq_dt_platform_data *pd; struct cpufreq_frequency_table *freq_table; struct thermal_cooling_device *cdev; struct device_node *np; @@ -265,9 +267,18 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->driver_data = priv; policy->clk = cpu_clk; - ret = cpufreq_generic_init(policy, freq_table, transition_latency); - if (ret) + ret = cpufreq_table_validate_and_show(policy, freq_table); + if (ret) { + dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__, + ret); goto out_cooling_unregister; + } + + policy->cpuinfo.transition_latency = transition_latency; + + pd = cpufreq_get_driver_data(); + if (pd && !pd->independent_clocks) + cpumask_setall(policy->cpus); of_node_put(np); @@ -335,6 +346,8 @@ static int dt_cpufreq_probe(struct platform_device *pdev) if (!IS_ERR(cpu_reg)) regulator_put(cpu_reg); + dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev); + ret = cpufreq_register_driver(&dt_cpufreq_driver); if (ret) dev_err(cpu_dev, "failed register driver: %d\n", ret); diff --git a/include/linux/cpufreq-dt.h b/include/linux/cpufreq-dt.h new file mode 100644 index 000000000000..0414009e2c30 --- /dev/null +++ b/include/linux/cpufreq-dt.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 Marvell + * Thomas Petazzoni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __CPUFREQ_DT_H__ +#define __CPUFREQ_DT_H__ + +struct cpufreq_dt_platform_data { + /* + * True when each CPU has its own clock to control its + * frequency, false when all CPUs are controlled by a single + * clock. + */ + bool independent_clocks; +}; + +#endif /* __CPUFREQ_DT_H__ */ -- cgit v1.2.3-59-g8ed1b From a5b7616c55e188fe3d6ef686bef402d4703ecb62 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 30 Sep 2014 03:14:55 +0100 Subject: mtd: m25p80,spi-nor: Fix module aliases for m25p80 m25p80's device ID table is now spi_nor_ids, defined in spi-nor. The MODULE_DEVICE_TABLE() macro doesn't work with extern definitions, but its use was also removed at the same time. Now if m25p80 is built as a module it doesn't get the necessary aliases to be loaded automatically. A clean solution to this will involve defining the list of device IDs in spi-nor.h and removing struct spi_device_id from the spi-nor API, but this is quite a large change. As a quick fix suitable for stable, copy the device IDs back into m25p80. Fixes: 03e296f613af ("mtd: m25p80: use the SPI nor framework") Cc: # 3.16.x: 32f1b7c8352f: mtd: move support for struct flash_platform_data into m25p80 Cc: # 3.16.x: 90e55b3812a1: mtd: m25p80: get rid of spi_get_device_id Cc: # 3.16.x: 70f3ce0510af: mtd: spi-nor: make spi_nor_scan() take a chip type name, not spi_device_id Cc: # 3.16.x Signed-off-by: Ben Hutchings Signed-off-by: Brian Norris --- drivers/mtd/devices/m25p80.c | 52 ++++++++++++++++++++++++++++++++++++++++++- drivers/mtd/spi-nor/spi-nor.c | 3 +-- include/linux/mtd/spi-nor.h | 1 - 3 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index bd5e4c6edfd4..ed827cf894e4 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -261,12 +261,62 @@ static int m25p_remove(struct spi_device *spi) } +/* + * XXX This needs to be kept in sync with spi_nor_ids. We can't share + * it with spi-nor, because if this is built as a module then modpost + * won't be able to read it and add appropriate aliases. + */ +static const struct spi_device_id m25p_ids[] = { + {"at25fs010"}, {"at25fs040"}, {"at25df041a"}, {"at25df321a"}, + {"at25df641"}, {"at26f004"}, {"at26df081a"}, {"at26df161a"}, + {"at26df321"}, {"at45db081d"}, + {"en25f32"}, {"en25p32"}, {"en25q32b"}, {"en25p64"}, + {"en25q64"}, {"en25qh128"}, {"en25qh256"}, + {"f25l32pa"}, + {"mr25h256"}, {"mr25h10"}, + {"gd25q32"}, {"gd25q64"}, + {"160s33b"}, {"320s33b"}, {"640s33b"}, + {"mx25l2005a"}, {"mx25l4005a"}, {"mx25l8005"}, {"mx25l1606e"}, + {"mx25l3205d"}, {"mx25l3255e"}, {"mx25l6405d"}, {"mx25l12805d"}, + {"mx25l12855e"},{"mx25l25635e"},{"mx25l25655e"},{"mx66l51235l"}, + {"mx66l1g55g"}, + {"n25q064"}, {"n25q128a11"}, {"n25q128a13"}, {"n25q256a"}, + {"n25q512a"}, {"n25q512ax3"}, {"n25q00"}, + {"pm25lv512"}, {"pm25lv010"}, {"pm25lq032"}, + {"s25sl032p"}, {"s25sl064p"}, {"s25fl256s0"}, {"s25fl256s1"}, + {"s25fl512s"}, {"s70fl01gs"}, {"s25sl12800"}, {"s25sl12801"}, + {"s25fl129p0"}, {"s25fl129p1"}, {"s25sl004a"}, {"s25sl008a"}, + {"s25sl016a"}, {"s25sl032a"}, {"s25sl064a"}, {"s25fl008k"}, + {"s25fl016k"}, {"s25fl064k"}, + {"sst25vf040b"},{"sst25vf080b"},{"sst25vf016b"},{"sst25vf032b"}, + {"sst25vf064c"},{"sst25wf512"}, {"sst25wf010"}, {"sst25wf020"}, + {"sst25wf040"}, + {"m25p05"}, {"m25p10"}, {"m25p20"}, {"m25p40"}, + {"m25p80"}, {"m25p16"}, {"m25p32"}, {"m25p64"}, + {"m25p128"}, {"n25q032"}, + {"m25p05-nonjedec"}, {"m25p10-nonjedec"}, {"m25p20-nonjedec"}, + {"m25p40-nonjedec"}, {"m25p80-nonjedec"}, {"m25p16-nonjedec"}, + {"m25p32-nonjedec"}, {"m25p64-nonjedec"}, {"m25p128-nonjedec"}, + {"m45pe10"}, {"m45pe80"}, {"m45pe16"}, + {"m25pe20"}, {"m25pe80"}, {"m25pe16"}, + {"m25px16"}, {"m25px32"}, {"m25px32-s0"}, {"m25px32-s1"}, + {"m25px64"}, + {"w25x10"}, {"w25x20"}, {"w25x40"}, {"w25x80"}, + {"w25x16"}, {"w25x32"}, {"w25q32"}, {"w25q32dw"}, + {"w25x64"}, {"w25q64"}, {"w25q128"}, {"w25q80"}, + {"w25q80bl"}, {"w25q128"}, {"w25q256"}, {"cat25c11"}, + {"cat25c03"}, {"cat25c09"}, {"cat25c17"}, {"cat25128"}, + { }, +}; +MODULE_DEVICE_TABLE(spi, m25p_ids); + + static struct spi_driver m25p80_driver = { .driver = { .name = "m25p80", .owner = THIS_MODULE, }, - .id_table = spi_nor_ids, + .id_table = m25p_ids, .probe = m25p_probe, .remove = m25p_remove, diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c index 5c8e39977bc5..c51ee52386a7 100644 --- a/drivers/mtd/spi-nor/spi-nor.c +++ b/drivers/mtd/spi-nor/spi-nor.c @@ -475,7 +475,7 @@ struct flash_info { * more nor chips. This current list focusses on newer chips, which * have been converging on command sets which including JEDEC ID. */ -const struct spi_device_id spi_nor_ids[] = { +static const struct spi_device_id spi_nor_ids[] = { /* Atmel -- some are (confusingly) marketed as "DataFlash" */ { "at25fs010", INFO(0x1f6601, 0, 32 * 1024, 4, SECT_4K) }, { "at25fs040", INFO(0x1f6604, 0, 64 * 1024, 8, SECT_4K) }, @@ -639,7 +639,6 @@ const struct spi_device_id spi_nor_ids[] = { { "cat25128", CAT25_INFO(2048, 8, 64, 2, SPI_NOR_NO_ERASE | SPI_NOR_NO_FR) }, { }, }; -EXPORT_SYMBOL_GPL(spi_nor_ids); static const struct spi_device_id *spi_nor_read_id(struct spi_nor *nor) { diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h index a5a7a086748d..046a0a2e4c4e 100644 --- a/include/linux/mtd/spi-nor.h +++ b/include/linux/mtd/spi-nor.h @@ -199,6 +199,5 @@ struct spi_nor { * Return: 0 for success, others for failure. */ int spi_nor_scan(struct spi_nor *nor, const char *name, enum read_mode mode); -extern const struct spi_device_id spi_nor_ids[]; #endif -- cgit v1.2.3-59-g8ed1b From 5695be142e203167e3cb515ef86a88424f3524eb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 20 Oct 2014 18:12:32 +0200 Subject: OOM, PM: OOM killed task shouldn't escape PM suspend PM freezer relies on having all tasks frozen by the time devices are getting frozen so that no task will touch them while they are getting frozen. But OOM killer is allowed to kill an already frozen task in order to handle OOM situtation. In order to protect from late wake ups OOM killer is disabled after all tasks are frozen. This, however, still keeps a window open when a killed task didn't manage to die by the time freeze_processes finishes. Reduce the race window by checking all tasks after OOM killer has been disabled. This is still not race free completely unfortunately because oom_killer_disable cannot stop an already ongoing OOM killer so a task might still wake up from the fridge and get killed without freeze_processes noticing. Full synchronization of OOM and freezer is, however, too heavy weight for this highly unlikely case. Introduce and check oom_kills counter which gets incremented early when the allocator enters __alloc_pages_may_oom path and only check all the tasks if the counter changes during the freezing attempt. The counter is updated so early to reduce the race window since allocator checked oom_killer_disabled which is set by PM-freezing code. A false positive will push the PM-freezer into a slow path but that is not a big deal. Changes since v1 - push the re-check loop out of freeze_processes into check_frozen_processes and invert the condition to make the code more readable as per Rafael Fixes: f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) Cc: 3.2+ # 3.2+ Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- include/linux/oom.h | 3 +++ kernel/power/process.c | 40 +++++++++++++++++++++++++++++++++++++++- mm/oom_kill.c | 17 +++++++++++++++++ mm/page_alloc.c | 8 ++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 647395a1a550..e8d6e1058723 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,6 +50,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); + +extern int oom_kills_count(void); +extern void note_oom_kill(void); extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b323221b9ee..5cc588c1abab 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,6 +108,28 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + struct task_struct *g, *p; + bool ret = true; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p != current && !freezer_should_skip(p) && + !frozen(p)) { + ret = false; + goto done; + } + } +done: + read_unlock(&tasklist_lock); + + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +140,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,12 +155,27 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + goto done; + } + printk("done."); } +done: printk("\n"); BUG_ON(in_atomic()); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18f..5340f6b91312 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b6381..9cd36b822444 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2251,6 +2251,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } + /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if -- cgit v1.2.3-59-g8ed1b From 9e8beeb79ded25c5c1986f80fb8a7f6815345d5a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:58:48 -0600 Subject: audit: Remove "weak" from audit_classify_compat_syscall() declaration There's only one audit_classify_compat_syscall() definition, so it doesn't need to be weak. Remove the "weak" attribute from the audit_classify_compat_syscall() declaration. Signed-off-by: Bjorn Helgaas Acked-by: Richard Guy Briggs CC: AKASHI Takahiro --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 36dffeccebdb..e58fe7df8b9c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -90,7 +90,7 @@ extern unsigned compat_dir_class[]; extern unsigned compat_chattr_class[]; extern unsigned compat_signal_class[]; -extern int __weak audit_classify_compat_syscall(int abi, unsigned syscall); +extern int audit_classify_compat_syscall(int abi, unsigned syscall); /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ -- cgit v1.2.3-59-g8ed1b From 96a2adbc6f501996418da9f7afe39bf0e4d006a9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:59:09 -0600 Subject: clocksource: Remove "weak" from clocksource_default_clock() declaration kernel/time/jiffies.c provides a default clocksource_default_clock() definition explicitly marked "weak". arch/s390 provides its own definition intended to override the default, but the "weak" attribute on the declaration applied to the s390 definition as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the clocksource_default_clock() declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: f1b82746c1e9 ("clocksource: Cleanup clocksource selection") Signed-off-by: Bjorn Helgaas Acked-by: John Stultz Acked-by: Ingo Molnar CC: Daniel Lezcano CC: Martin Schwidefsky --- include/linux/clocksource.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 653f0e2b6ca9..abcafaa20b86 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -287,7 +287,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); -extern struct clocksource * __init __weak clocksource_default_clock(void); +extern struct clocksource * __init clocksource_default_clock(void); extern void clocksource_mark_unstable(struct clocksource *cs); extern u64 -- cgit v1.2.3-59-g8ed1b From 5ab03ac5aaa1f032e071f1b3dc433b7839359c03 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:59:41 -0600 Subject: vmcore: Remove "weak" from function declarations For the following functions: elfcorehdr_alloc() elfcorehdr_free() elfcorehdr_read() elfcorehdr_read_notes() remap_oldmem_pfn_range() fs/proc/vmcore.c provides default definitions explicitly marked "weak". arch/s390 provides its own definitions intended to override the default ones, but the "weak" attribute on the declarations applied to the s390 definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declarations so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: be8a8d069e50 ("vmcore: introduce ELF header in new memory feature") Fixes: 9cb218131de1 ("vmcore: introduce remap_oldmem_pfn_range()") Signed-off-by: Bjorn Helgaas Acked-by: Andrew Morton Acked-by: Vivek Goyal CC: Michael Holzheu --- include/linux/crash_dump.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 72ab536ad3de..3849fce7ecfe 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -14,14 +14,13 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; -extern int __weak elfcorehdr_alloc(unsigned long long *addr, - unsigned long long *size); -extern void __weak elfcorehdr_free(unsigned long long addr); -extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); -extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); -extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot); +extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); +extern void elfcorehdr_free(unsigned long long addr); +extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); -- cgit v1.2.3-59-g8ed1b From 107bcc6d566cb40184068d888637f9aefe6252dd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:00:25 -0600 Subject: kgdb: Remove "weak" from kgdb_arch_pc() declaration kernel/debug/debug_core.c provides a default kgdb_arch_pc() definition explicitly marked "weak". Several architectures provide their own definitions intended to override the default, but the "weak" attribute on the declaration applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: 688b744d8bc8 ("kgdb: fix signedness mixmatches, add statics, add declaration to header") Tested-by: Vineet Gupta # for ARC build Signed-off-by: Bjorn Helgaas Reviewed-by: Harvey Harrison --- include/linux/kgdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 6b06d378f3df..e465bb15912d 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -283,7 +283,7 @@ struct kgdb_io { extern struct kgdb_arch arch_kgdb_ops; -extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs); +extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); #ifdef CONFIG_SERIAL_KGDB_NMI extern int kgdb_register_nmi_console(void); -- cgit v1.2.3-59-g8ed1b From e0a8400c6923a163265d52798cdd4c33f3f8ab5a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:00:47 -0600 Subject: memory-hotplug: Remove "weak" from memory_block_size_bytes() declaration drivers/base/memory.c provides a default memory_block_size_bytes() definition explicitly marked "weak". Several architectures provide their own definitions intended to override the default, but the "weak" attribute on the declaration applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: 41f107266b19 ("drivers: base: Add prototype declaration to the header file") Signed-off-by: Bjorn Helgaas Acked-by: Andrew Morton CC: Rashika Kheria CC: Nathan Fontenot CC: Anton Blanchard CC: Heiko Carstens CC: Yinghai Lu --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index bb7384e3c3d8..8b8d8d12348e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -35,7 +35,7 @@ struct memory_block { }; int arch_get_memory_phys_device(unsigned long start_pfn); -unsigned long __weak memory_block_size_bytes(void); +unsigned long memory_block_size_bytes(void); /* These states are exposed to userspace as text strings in sysfs */ #define MEM_ONLINE (1<<0) /* exposed to userspace */ -- cgit v1.2.3-59-g8ed1b From 271a9c35158910496f6fc3a635c2ed85df6be3d9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:01:03 -0600 Subject: uprobes: Remove "weak" from function declarations For the following interfaces: set_swbp() set_orig_insn() is_swbp_insn() is_trap_insn() uprobe_get_swbp_addr() arch_uprobe_ignore() arch_uprobe_copy_ixol() kernel/events/uprobes.c provides default definitions explicitly marked "weak". Some architectures provide their own definitions intended to override the defaults, but the "weak" attribute on the declarations applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declarations so we always prefer a non-weak definition over the weak one, independent of link order. Signed-off-by: Bjorn Helgaas Acked-by: Ingo Molnar Acked-by: Srikar Dronamraju CC: Victor Kamensky CC: Oleg Nesterov CC: David A. Long CC: Ananth N Mavinakayanahalli --- include/linux/uprobes.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f844c6b03ee..60beb5dc7977 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -98,11 +98,11 @@ struct uprobes_state { struct xol_area *xol_area; }; -extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); -extern bool __weak is_trap_insn(uprobe_opcode_t *insn); -extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern bool is_swbp_insn(uprobe_opcode_t *insn); +extern bool is_trap_insn(uprobe_opcode_t *insn); +extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); @@ -128,8 +128,8 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); -extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); -extern void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, +extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); +extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); #else /* !CONFIG_UPROBES */ struct uprobes_state { -- cgit v1.2.3-59-g8ed1b From 4aa7c6346be395bdf776f82bbb2e3e2bc60bdd2b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: add i_op->dentry_open() Add a new inode operation i_op->dentry_open(). This is for stacked filesystems that want to return a struct file from a different filesystem. Signed-off-by: Miklos Szeredi --- Documentation/filesystems/Locking | 2 ++ Documentation/filesystems/vfs.txt | 7 +++++++ fs/namei.c | 9 ++++++--- fs/open.c | 23 +++++++++++++++++++++-- include/linux/fs.h | 4 ++++ 5 files changed, 40 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 94d93b1f8b53..b30753cbf431 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -67,6 +67,7 @@ prototypes: struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); locking rules: all may block @@ -96,6 +97,7 @@ fiemap: no update_time: no atomic_open: yes tmpfile: no +dentry_open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index fceff7c00a3c..20bf204426ca 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,6 +364,7 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -696,6 +697,12 @@ struct address_space_operations { but instead uses bmap to find out where the blocks in the file are and uses those addresses directly. + dentry_open: *WARNING: probably going away soon, do not use!* This is an + alternative to f_op->open(), the difference is that this method may open + a file not necessarily originating from the same filesystem as the one + i_op->open() was called on. It may be useful for stacking filesystems + which want to allow native I/O directly on underlying files. + invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed diff --git a/fs/namei.c b/fs/namei.c index 43927d14db67..75306b3c9526 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3064,9 +3064,12 @@ finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); - if (error) { + + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file, current_cred()); + if (!error) { + *opened |= FILE_OPENED; + } else { if (error == -EOPENSTALE) goto stale_open; goto out; diff --git a/fs/open.c b/fs/open.c index d6fd3acde134..de92c13b58be 100644 --- a/fs/open.c +++ b/fs/open.c @@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags, f = get_empty_filp(); if (!IS_ERR(f)) { f->f_flags = flags; - f->f_path = *path; - error = do_dentry_open(f, NULL, cred); + error = vfs_open(path, f, cred); if (!error) { /* from now on we need fput() to dispose of f */ error = open_check_o_direct(f); @@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * vfs_open - open the file at the given path + * @path: path to open + * @filp: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *filp, + const struct cred *cred) +{ + struct inode *inode = path->dentry->d_inode; + + if (inode->i_op->dentry_open) + return inode->i_op->dentry_open(path->dentry, filp, cred); + else { + filp->f_path = *path; + return do_dentry_open(filp, NULL, cred); + } +} +EXPORT_SYMBOL(vfs_open); + static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index a957d4366c24..5cf7f6759679 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1528,6 +1528,9 @@ struct inode_operations { umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); int (*set_acl)(struct inode *, struct posix_acl *, int); + + /* WARNING: probably going away soon, do not use! */ + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); } ____cacheline_aligned; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, @@ -2040,6 +2043,7 @@ extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); +extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); -- cgit v1.2.3-59-g8ed1b From 1c118596a7682912106c80007102ce0184c77780 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: export do_splice_direct() to modules Export do_splice_direct() to modules. Needed by overlay filesystem. Signed-off-by: Miklos Szeredi --- fs/internal.h | 6 ------ fs/splice.c | 1 + include/linux/fs.h | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/internal.h b/fs/internal.h index 9477f8f6aefc..0f0626a6997c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -138,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, */ extern int rw_verify_area(int, struct file *, const loff_t *, size_t); -/* - * splice.c - */ -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); - /* * pipe.c */ diff --git a/fs/splice.c b/fs/splice.c index f5cb9ba84510..75c6058eabf2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +EXPORT_SYMBOL(do_splice_direct); static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5cf7f6759679..10ed65b2c31d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2456,6 +2456,9 @@ extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); -- cgit v1.2.3-59-g8ed1b From bd5d08569cc379f8366663a61558a9ce17c2e460 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: export __inode_permission() to modules We need to be able to check inode permissions (but not filesystem implied permissions) for stackable filesystems. Expose this interface for overlayfs. Signed-off-by: Miklos Szeredi --- fs/internal.h | 1 - fs/namei.c | 1 + include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/internal.h b/fs/internal.h index 0f0626a6997c..757ba2abf21e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -47,7 +47,6 @@ extern void __init chrdev_init(void); /* * namei.c */ -extern int __inode_permission(struct inode *, int); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); diff --git a/fs/namei.c b/fs/namei.c index 75306b3c9526..d944f6db9b07 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask) return security_inode_permission(inode, mask); } +EXPORT_SYMBOL(__inode_permission); /** * sb_permission - Check superblock-level permissions diff --git a/include/linux/fs.h b/include/linux/fs.h index 10ed65b2c31d..5419df70a835 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,6 +2257,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); +extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) -- cgit v1.2.3-59-g8ed1b From c771d683a62e5d36bc46036f5c07f4f5bb7dda61 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: introduce clone_private_mount() Overlayfs needs a private clone of the mount, so create a function for this and export to modules. Signed-off-by: Miklos Szeredi --- fs/namespace.c | 27 +++++++++++++++++++++++++++ include/linux/mount.h | 3 +++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index fbba8b17330d..5b66b2b3624d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { diff --git a/include/linux/mount.h b/include/linux/mount.h index 9262e4bf0cc3..c2c561dc0114 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -81,6 +81,9 @@ extern struct vfsmount *mntget(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); +struct path; +extern struct vfsmount *clone_private_mount(struct path *path); + struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, -- cgit v1.2.3-59-g8ed1b From cbdf35bcb833bfd00f0925d7a9a33a21f41ea582 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: export check_sticky() It's already duplicated in btrfs and about to be used in overlayfs too. Move the sticky bit check to an inline helper and call the out-of-line helper only in the unlikly case of the sticky bit being set. Signed-off-by: Miklos Szeredi --- fs/btrfs/ioctl.c | 20 +------------------- fs/namei.c | 9 ++------- include/linux/fs.h | 9 +++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d2b76e29d3b..4399f0c3a4ce 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -765,23 +765,6 @@ out: return ret; } -/* copy of check_sticky in fs/namei.c() -* It's inline, so penalty for filesystems that don't use sticky bit is -* minimal. -*/ -static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) -{ - kuid_t fsuid = current_fsuid(); - - if (!(dir->i_mode & S_ISVTX)) - return 0; - if (uid_eq(inode->i_uid, fsuid)) - return 0; - if (uid_eq(dir->i_uid, fsuid)) - return 0; - return !capable(CAP_FOWNER); -} - /* copy of may_delete in fs/namei.c() * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. @@ -817,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (btrfs_check_sticky(dir, victim->d_inode)|| - IS_APPEND(victim->d_inode)|| + if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) return -EPERM; if (isdir) { diff --git a/fs/namei.c b/fs/namei.c index d944f6db9b07..77fd536106cb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2384,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path, } EXPORT_SYMBOL(kern_path_mountpoint); -/* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. - */ -static inline int check_sticky(struct inode *dir, struct inode *inode) +int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (!(dir->i_mode & S_ISVTX)) - return 0; if (uid_eq(inode->i_uid, fsuid)) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } +EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check diff --git a/include/linux/fs.h b/include/linux/fs.h index 5419df70a835..55cc0a319baa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2259,6 +2259,7 @@ extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); +extern int __check_sticky(struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { @@ -2745,6 +2746,14 @@ static inline int is_sxid(umode_t mode) return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); } +static inline int check_sticky(struct inode *dir, struct inode *inode) +{ + if (!(dir->i_mode & S_ISVTX)) + return 0; + + return __check_sticky(dir, inode); +} + static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) -- cgit v1.2.3-59-g8ed1b From 787fb6bc9682ec7c05fb5d9561b57100fbc1cc41 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: add whiteout support Whiteout isn't actually a new file type, but is represented as a char device (Linus's idea) with 0/0 device number. This has several advantages compared to introducing a new whiteout file type: - no userspace API changes (e.g. trivial to make backups of upper layer filesystem, without losing whiteouts) - no fs image format changes (you can boot an old kernel/fsck without whiteout support and things won't break) - implementation is trivial Signed-off-by: Miklos Szeredi --- fs/namei.c | 14 ++++++++++++++ include/linux/fs.h | 11 +++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 77fd536106cb..d20191c0ebf5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4346,6 +4346,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + int error = may_create(dir, dentry); + if (error) + return error; + + if (!dir->i_op->mknod) + return -EPERM; + + return dir->i_op->mknod(dir, dentry, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} +EXPORT_SYMBOL(vfs_whiteout); + int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/include/linux/fs.h b/include/linux/fs.h index 55cc0a319baa..69118b3cb917 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -222,6 +222,13 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) +/* + * Whiteout is represented by a char device. The following constants define the + * mode and device number to use. + */ +#define WHITEOUT_MODE 0 +#define WHITEOUT_DEV 0 + /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. @@ -1398,6 +1405,7 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +extern int vfs_whiteout(struct inode *, struct dentry *); /* * VFS dentry helper functions. @@ -1628,6 +1636,9 @@ struct super_operations { #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) +#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ + (inode)->i_rdev == WHITEOUT_DEV) + /* * Inode state bits. Protected by inode->i_lock * -- cgit v1.2.3-59-g8ed1b From 69c433ed2ecd2d3264efd7afec4439524b319121 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:39 +0200 Subject: fs: limit filesystem stacking depth Add a simple read-only counter to super_block that indicates how deep this is in the stack of filesystems. Previously ecryptfs was the only stackable filesystem and it explicitly disallowed multiple layers of itself. Overlayfs, however, can be stacked recursively and also may be stacked on top of ecryptfs or vice versa. To limit the kernel stack usage we must limit the depth of the filesystem stack. Initially the limit is set to 2. Signed-off-by: Miklos Szeredi --- fs/ecryptfs/main.c | 7 +++++++ fs/overlayfs/super.c | 9 +++++++++ include/linux/fs.h | 11 +++++++++++ 3 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1b119d3bf924..c4cd1fd86cc2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; + s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; + + rc = -EINVAL; + if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); + goto out_free; + } inode = ecryptfs_get_inode(path.dentry->d_inode, s); rc = PTR_ERR(inode); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7dcc24e84417..08b704cebfc4 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -677,6 +677,15 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } ufs->lower_namelen = statfs.f_namelen; + sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, + lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + + err = -EINVAL; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + goto out_put_workpath; + } + ufs->upper_mnt = clone_private_mount(&upperpath); err = PTR_ERR(ufs->upper_mnt); if (IS_ERR(ufs->upper_mnt)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 69118b3cb917..4e41a4a331bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -261,6 +261,12 @@ struct iattr { */ #include +/* + * Maximum number of layers of fs stack. Needs to be limited to + * prevent kernel stack overflow + */ +#define FILESYSTEM_MAX_STACK_DEPTH 2 + /** * enum positive_aop_returns - aop return codes with specific semantics * @@ -1273,6 +1279,11 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; }; extern struct timespec current_fs_time(struct super_block *sb); -- cgit v1.2.3-59-g8ed1b From 571ee1b6859869a09ed718d390aac2b9414646a2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 9 Oct 2014 18:30:08 +0800 Subject: kvm: vfio: fix unregister kvm_device_ops of vfio After commit 80ce163 (KVM: VFIO: register kvm_device_ops dynamically), kvm_device_ops of vfio can be registered dynamically. Commit 3c3c29fd (kvm-vfio: do not use module_init) move the dynamic register invoked by kvm_init in order to fix broke unloading of the kvm module. However, kvm_device_ops of vfio is unregistered after rmmod kvm-intel module which lead to device type collision detection warning after kvm-intel module reinsmod. WARNING: CPU: 1 PID: 10358 at /root/cathy/kvm/arch/x86/kvm/../../../virt/kvm/kvm_main.c:3289 kvm_init+0x234/0x282 [kvm]() Modules linked in: kvm_intel(O+) kvm(O) nfsv3 nfs_acl auth_rpcgss oid_registry nfsv4 dns_resolver nfs fscache lockd sunrpc pci_stub bridge stp llc autofs4 8021q cpufreq_ondemand ipv6 joydev microcode pcspkr igb i2c_algo_bit ehci_pci ehci_hcd e1000e i2c_i801 ixgbe ptp pps_core hwmon mdio tpm_tis tpm ipmi_si ipmi_msghandler acpi_cpufreq isci libsas scsi_transport_sas button dm_mirror dm_region_hash dm_log dm_mod [last unloaded: kvm_intel] CPU: 1 PID: 10358 Comm: insmod Tainted: G W O 3.17.0-rc1 #2 Hardware name: Intel Corporation S2600CP/S2600CP, BIOS RMLSDP.86I.00.29.D696.1311111329 11/11/2013 0000000000000cd9 ffff880ff08cfd18 ffffffff814a61d9 0000000000000cd9 0000000000000000 ffff880ff08cfd58 ffffffff810417b7 ffff880ff08cfd48 ffffffffa045bcac ffffffffa049c420 0000000000000040 00000000000000ff Call Trace: [] dump_stack+0x49/0x60 [] warn_slowpath_common+0x7c/0x96 [] ? kvm_init+0x234/0x282 [kvm] [] warn_slowpath_null+0x15/0x17 [] kvm_init+0x234/0x282 [kvm] [] vmx_init+0x1bf/0x42a [kvm_intel] [] ? vmx_check_processor_compat+0x64/0x64 [kvm_intel] [] do_one_initcall+0xe3/0x170 [] ? __vunmap+0xad/0xb8 [] do_init_module+0x2b/0x174 [] load_module+0x43e/0x569 [] ? do_init_module+0x174/0x174 [] ? copy_module_from_user+0x39/0x82 [] ? module_sect_show+0x20/0x20 [] SyS_init_module+0x54/0x81 [] system_call_fastpath+0x16/0x1b ---[ end trace 0626f4a3ddea56f3 ]--- The bug can be reproduced by: rmmod kvm_intel.ko insmod kvm_intel.ko without rmmod/insmod kvm.ko This patch fixes the bug by unregistering kvm_device_ops of vfio when the kvm-intel module is removed. Reported-by: Liu Rongrong Fixes: 3c3c29fd0d7cddc32862c350d0700ce69953e3bd Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 7 +++++++ virt/kvm/vfio.c | 5 +++++ virt/kvm/vfio.h | 4 ++++ 4 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 28be31f49250..ea53b04993f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1080,6 +1080,7 @@ void kvm_device_get(struct kvm_device *dev); void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); +void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 384eaa7b02fa..25ffac9e947d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2354,6 +2354,12 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) return 0; } +void kvm_unregister_device_ops(u32 type) +{ + if (kvm_device_ops_table[type] != NULL) + kvm_device_ops_table[type] = NULL; +} + static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { @@ -3328,5 +3334,6 @@ void kvm_exit(void) kvm_arch_exit(); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); + kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 281e7cf2b8e5..620e37f741b8 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -283,3 +283,8 @@ int kvm_vfio_ops_init(void) { return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); } + +void kvm_vfio_ops_exit(void) +{ + kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); +} diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h index 92eac75d6b62..ab88c7dc0514 100644 --- a/virt/kvm/vfio.h +++ b/virt/kvm/vfio.h @@ -3,11 +3,15 @@ #ifdef CONFIG_KVM_VFIO int kvm_vfio_ops_init(void); +void kvm_vfio_ops_exit(void); #else static inline int kvm_vfio_ops_init(void) { return 0; } +static inline void kvm_vfio_ops_exit(void) +{ +} #endif #endif -- cgit v1.2.3-59-g8ed1b From dda02fd6278d9e995850b3c1dba484f17cbe4de4 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Fri, 24 Oct 2014 17:47:57 +0800 Subject: mm, cma: make parameters order consistent in func declaration and definition In the current code, the base and size parameters order is not consistent in functions declaration and definition. If someone calls these functions according to the declaration parameters order in cma.h, he will run into some bug and it's hard to find the reason. This patch makes the parameters order consistent in functions declaration and definition. Signed-off-by: Weijie Yang Acked-by: Michal Nazarewicz Signed-off-by: Marek Szyprowski --- include/linux/cma.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cma.h b/include/linux/cma.h index 0430ed05d3b9..a93438beb33c 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -18,12 +18,12 @@ struct cma; extern phys_addr_t cma_get_base(struct cma *cma); extern unsigned long cma_get_size(struct cma *cma); -extern int __init cma_declare_contiguous(phys_addr_t size, - phys_addr_t base, phys_addr_t limit, +extern int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, struct cma **res_cma); -extern int cma_init_reserved_mem(phys_addr_t size, - phys_addr_t base, int order_per_bit, +extern int cma_init_reserved_mem(phys_addr_t base, + phys_addr_t size, int order_per_bit, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); extern bool cma_release(struct cma *cma, struct page *pages, int count); -- cgit v1.2.3-59-g8ed1b From e999dbc254044e8d2a5818d92d205f65bae28f37 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Oct 2014 17:13:57 +0200 Subject: Revert "block: all blk-mq requests are tagged" This reverts commit fb3ccb5da71273e7f0d50b50bc879e50cedd60e7. SCSI-2/SPI actually needs the tagged/untagged flag in the request to work properly. Revert this patch and add a follow on to set it in the right place. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Acked-by: Jens Axboe Reported-by: Meelis Roos Tested-by: Meelis Roos Cc: stable@vger.kernel.org --- include/linux/blkdev.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0207a78a8d82..51d0dc2259cf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1136,8 +1136,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) /* * tag stuff */ -#define blk_rq_tagged(rq) \ - ((rq)->mq_ctx || ((rq)->cmd_flags & REQ_QUEUED)) +#define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) extern int blk_queue_start_tag(struct request_queue *, struct request *); extern struct request *blk_queue_find_tag(struct request_queue *, int); extern void blk_queue_end_tag(struct request_queue *, struct request *); -- cgit v1.2.3-59-g8ed1b From 5631b8fba640a4ab2f8a954f63a603fa34eda96b Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Sat, 25 Oct 2014 15:09:42 -0700 Subject: compiler/gcc4+: Remove inaccurate comment about 'asm goto' miscompiles The bug referenced by the comment in this commit was not completely fixed in GCC 4.8.2, as I mentioned in a thread back in February: https://lkml.org/lkml/2014/2/12/797 The conclusion at that time was to make the quirk unconditional until the bug could be found and fixed in GCC. Unfortunately, when I submitted the patch (commit a9f18034) I left a comment in that claimed the bug was fixed in GCC 4.8.2+. This comment is inaccurate, and should be removed. Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar Cc: Jakub Jelinek Cc: Richard Henderson Cc: Linus Torvalds Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1414274982-14040-1-git-send-email-steven@uplinklabs.net Cc: Ingo Molnar --- include/linux/compiler-gcc4.h | 1 - include/linux/compiler-gcc5.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index 2507fd2a1eb4..d1a558239b1a 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -71,7 +71,6 @@ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 * * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * Fixed in GCC 4.8.2 and later versions. * * (asm goto is automatically volatile - the naming reflects this.) */ diff --git a/include/linux/compiler-gcc5.h b/include/linux/compiler-gcc5.h index cdd1cc202d51..c8c565952548 100644 --- a/include/linux/compiler-gcc5.h +++ b/include/linux/compiler-gcc5.h @@ -53,7 +53,6 @@ * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670 * * Work it around via a compiler barrier quirk suggested by Jakub Jelinek. - * Fixed in GCC 4.8.2 and later versions. * * (asm goto is automatically volatile - the naming reflects this.) */ -- cgit v1.2.3-59-g8ed1b From ebcf34f3d4be11f994340aff629f3c17171a4f65 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 26 Oct 2014 19:14:06 -0700 Subject: skbuff.h: fix kernel-doc warning for headers_end Fix kernel-doc warning in by making both headers_start and headers_end private fields. Warning(..//include/linux/skbuff.h:654): No description found for parameter 'headers_end[0]' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a59d9343c25b..5884f95ff0e9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -557,7 +557,9 @@ struct sk_buff { /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() */ + /* private: */ __u32 headers_start[0]; + /* public: */ /* if you move pkt_type around you also must adapt those constants */ #ifdef __BIG_ENDIAN_BITFIELD @@ -642,7 +644,9 @@ struct sk_buff { __u16 network_header; __u16 mac_header; + /* private: */ __u32 headers_end[0]; + /* public: */ /* These elements must be at the end, see alloc_skb() for details. */ sk_buff_data_t tail; -- cgit v1.2.3-59-g8ed1b From 1efed2d06c703489342ab6af2951683e07509c99 Mon Sep 17 00:00:00 2001 From: Olivier Blin Date: Fri, 24 Oct 2014 19:43:00 +0200 Subject: usbnet: add a callback for set_rx_mode To delegate promiscuous mode and multicast filtering to the subdriver. Signed-off-by: Olivier Blin Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 20 ++++++++++++++++++++ include/linux/usb/usbnet.h | 4 ++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 20615bbd693b..3a6770a65d78 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1052,6 +1052,21 @@ static void __handle_link_change(struct usbnet *dev) clear_bit(EVENT_LINK_CHANGE, &dev->flags); } +static void usbnet_set_rx_mode(struct net_device *net) +{ + struct usbnet *dev = netdev_priv(net); + + usbnet_defer_kevent(dev, EVENT_SET_RX_MODE); +} + +static void __handle_set_rx_mode(struct usbnet *dev) +{ + if (dev->driver_info->set_rx_mode) + (dev->driver_info->set_rx_mode)(dev); + + clear_bit(EVENT_SET_RX_MODE, &dev->flags); +} + /* work that cannot be done in interrupt context uses keventd. * * NOTE: with 2.5 we could do more of this using completion callbacks, @@ -1157,6 +1172,10 @@ skip_reset: if (test_bit (EVENT_LINK_CHANGE, &dev->flags)) __handle_link_change(dev); + if (test_bit (EVENT_SET_RX_MODE, &dev->flags)) + __handle_set_rx_mode(dev); + + if (dev->flags) netdev_dbg(dev->net, "kevent done, flags = 0x%lx\n", dev->flags); } @@ -1525,6 +1544,7 @@ static const struct net_device_ops usbnet_netdev_ops = { .ndo_stop = usbnet_stop, .ndo_start_xmit = usbnet_start_xmit, .ndo_tx_timeout = usbnet_tx_timeout, + .ndo_set_rx_mode = usbnet_set_rx_mode, .ndo_change_mtu = usbnet_change_mtu, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 26088feb6608..d9a4905e01d0 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -78,6 +78,7 @@ struct usbnet { # define EVENT_NO_RUNTIME_PM 9 # define EVENT_RX_KILL 10 # define EVENT_LINK_CHANGE 11 +# define EVENT_SET_RX_MODE 12 }; static inline struct usb_driver *driver_of(struct usb_interface *intf) @@ -159,6 +160,9 @@ struct driver_info { /* called by minidriver when receiving indication */ void (*indication)(struct usbnet *dev, void *ind, int indlen); + /* rx mode change (device changes address list filtering) */ + void (*set_rx_mode)(struct usbnet *dev); + /* for new devices, use the descriptor-reading code instead */ int in; /* rx endpoint */ int out; /* tx endpoint */ -- cgit v1.2.3-59-g8ed1b From 54ef6df3f3f1353d99c80c437259d317b2cd1cbd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Oct 2014 21:11:27 -0700 Subject: rcu: Provide counterpart to rcu_dereference() for non-RCU situations Although rcu_dereference() and friends can be used in situations where object lifetimes are being managed by something other than RCU, the resulting sparse and lockdep-RCU noise can be annoying. This commit therefore supplies a lockless_dereference(), which provides the protection for dereferences without the RCU-related debugging noise. Reported-by: Al Viro Signed-off-by: Paul E. McKenney Signed-off-by: Al Viro --- include/linux/rcupdate.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a4a819ffb2d1..53ff1a752d7e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -616,6 +616,21 @@ static inline void rcu_preempt_sleep_check(void) */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) +/** + * lockless_dereference() - safely load a pointer for later dereference + * @p: The pointer to load + * + * Similar to rcu_dereference(), but for situations where the pointed-to + * object's lifetime is managed by something other than RCU. That + * "something other" might be reference counting or simple immortality. + */ +#define lockless_dereference(p) \ +({ \ + typeof(p) _________p1 = ACCESS_ONCE(p); \ + smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ + (_________p1); \ +}) + /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to -- cgit v1.2.3-59-g8ed1b From d1b72cc6d8cb766c802fdc70a5edc2f0ba8a2b57 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 Oct 2014 15:42:01 +0100 Subject: overlayfs: fix lockdep misannotation In an overlay directory that shadows an empty lower directory, say /mnt/a/empty102, do: touch /mnt/a/empty102/x unlink /mnt/a/empty102/x rmdir /mnt/a/empty102 It's actually harmless, but needs another level of nesting between I_MUTEX_CHILD and I_MUTEX_NORMAL. Signed-off-by: Miklos Szeredi Tested-by: David Howells Signed-off-by: Al Viro --- fs/namei.c | 2 +- fs/overlayfs/readdir.c | 2 +- include/linux/fs.h | 9 ++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index 42df664e95e5..922f27068c4c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2497,7 +2497,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) } mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 3fbf0d306e12..401f0840f5cc 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -571,7 +571,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4e41a4a331bb..01036262095f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -639,11 +639,13 @@ static inline int inode_unhashed(struct inode *inode) * 2: child/target * 3: xattr * 4: second non-directory - * The last is for certain operations (such as rename) which lock two + * 5: second parent (when locking independent directories in rename) + * + * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is - * parent -> child -> normal -> xattr -> second non-directory + * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { @@ -651,7 +653,8 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, - I_MUTEX_NONDIR2 + I_MUTEX_NONDIR2, + I_MUTEX_PARENT2, }; void lock_two_nondirectories(struct inode *, struct inode*); -- cgit v1.2.3-59-g8ed1b From cb1a5ab6ece7a37da4ac98ee26b0475b7c3ea79e Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 28 Oct 2014 20:27:43 -0600 Subject: block: Fix merge logic when CONFIG_BLK_DEV_INTEGRITY is not defined Commit 4eaf99beadce switched to returning bool and as a result reversed the logic of the integrity merge checks. However, the empty stubs used when the block integrity code is compiled out were still returning 0. Make these stubs return "true". Signed-off-by: Martin K. Petersen Reported-by: Michael L. Semon Tested-by: Michael L. Semon Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0207a78a8d82..6cbee8395f60 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1583,13 +1583,13 @@ static inline bool blk_integrity_merge_rq(struct request_queue *rq, struct request *r1, struct request *r2) { - return 0; + return true; } static inline bool blk_integrity_merge_bio(struct request_queue *rq, struct request *r, struct bio *b) { - return 0; + return true; } static inline bool blk_integrity_is_initialized(struct gendisk *g) { -- cgit v1.2.3-59-g8ed1b From 47f29df7db78ee4fcdb104cf36918d987ddd0278 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 29 Oct 2014 14:50:29 -0700 Subject: drivers: of: add return value to of_reserved_mem_device_init() Driver calling of_reserved_mem_device_init() might be interested if the initialization has been successful or not, so add support for returning error code. This fixes a build warining caused by commit 7bfa5ab6fa1b ("drivers: dma-coherent: add initialization from device tree"), which has been merged without this change and without fixing function return value. Fixes: 7bfa5ab6fa1b1 ("drivers: dma-coherent: add initialization from device tree") Signed-off-by: Marek Szyprowski Acked-by: Arnd Bergmann Cc: Michal Nazarewicz Cc: Grant Likely Cc: Laura Abbott Cc: Josh Cartwright Cc: Joonsoo Kim Cc: Kyungmin Park Cc: Russell King Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-contiguous.c | 3 ++- drivers/of/of_reserved_mem.c | 14 +++++++++----- include/linux/of_reserved_mem.h | 9 ++++++--- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 473ff4892401..950fff9ce453 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -223,9 +223,10 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, #undef pr_fmt #define pr_fmt(fmt) fmt -static void rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) +static int rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) { dev_set_cma_area(dev, rmem->priv); + return 0; } static void rmem_cma_device_release(struct reserved_mem *rmem, diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c index 59fb12e84e6b..dc566b38645f 100644 --- a/drivers/of/of_reserved_mem.c +++ b/drivers/of/of_reserved_mem.c @@ -243,23 +243,27 @@ static inline struct reserved_mem *__find_rmem(struct device_node *node) * This function assign memory region pointed by "memory-region" device tree * property to the given device. */ -void of_reserved_mem_device_init(struct device *dev) +int of_reserved_mem_device_init(struct device *dev) { struct reserved_mem *rmem; struct device_node *np; + int ret; np = of_parse_phandle(dev->of_node, "memory-region", 0); if (!np) - return; + return -ENODEV; rmem = __find_rmem(np); of_node_put(np); if (!rmem || !rmem->ops || !rmem->ops->device_init) - return; + return -EINVAL; + + ret = rmem->ops->device_init(rmem, dev); + if (ret == 0) + dev_info(dev, "assigned reserved memory node %s\n", rmem->name); - rmem->ops->device_init(rmem, dev); - dev_info(dev, "assigned reserved memory node %s\n", rmem->name); + return ret; } /** diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h index 5b5efae09135..ad2f67054372 100644 --- a/include/linux/of_reserved_mem.h +++ b/include/linux/of_reserved_mem.h @@ -16,7 +16,7 @@ struct reserved_mem { }; struct reserved_mem_ops { - void (*device_init)(struct reserved_mem *rmem, + int (*device_init)(struct reserved_mem *rmem, struct device *dev); void (*device_release)(struct reserved_mem *rmem, struct device *dev); @@ -28,14 +28,17 @@ typedef int (*reservedmem_of_init_fn)(struct reserved_mem *rmem); _OF_DECLARE(reservedmem, name, compat, init, reservedmem_of_init_fn) #ifdef CONFIG_OF_RESERVED_MEM -void of_reserved_mem_device_init(struct device *dev); +int of_reserved_mem_device_init(struct device *dev); void of_reserved_mem_device_release(struct device *dev); void fdt_init_reserved_mem(void); void fdt_reserved_mem_save_node(unsigned long node, const char *uname, phys_addr_t base, phys_addr_t size); #else -static inline void of_reserved_mem_device_init(struct device *dev) { } +static inline int of_reserved_mem_device_init(struct device *dev) +{ + return -ENOSYS; +} static inline void of_reserved_mem_device_release(struct device *pdev) { } static inline void fdt_init_reserved_mem(void) { } -- cgit v1.2.3-59-g8ed1b From 6d50e60cd2edb5a57154db5a6f64eef5aa59b751 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 29 Oct 2014 14:50:31 -0700 Subject: mm, thp: fix collapsing of hugepages on madvise If an anonymous mapping is not allowed to fault thp memory and then madvise(MADV_HUGEPAGE) is used after fault, khugepaged will never collapse this memory into thp memory. This occurs because the madvise(2) handler for thp, hugepage_madvise(), clears VM_NOHUGEPAGE on the stack and it isn't stored in vma->vm_flags until the final action of madvise_behavior(). This causes the khugepaged_enter_vma_merge() to be a no-op in hugepage_madvise() when the vma had previously had VM_NOHUGEPAGE set. Fix this by passing the correct vma flags to the khugepaged mm slot handler. There's no chance khugepaged can run on this vma until after madvise_behavior() returns since we hold mm->mmap_sem. It would be possible to clear VM_NOHUGEPAGE directly from vma->vm_flags in hugepage_advise(), but I didn't want to introduce special case behavior into madvise_behavior(). I think it's best to just let it always set vma->vm_flags itself. Signed-off-by: David Rientjes Reported-by: Suleiman Souhlal Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/khugepaged.h | 17 ++++++++++------- mm/huge_memory.c | 11 ++++++----- mm/mmap.c | 8 ++++---- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 6b394f0b5148..eeb307985715 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -6,7 +6,8 @@ #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags); #define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -35,13 +36,13 @@ static inline void khugepaged_exit(struct mm_struct *mm) __khugepaged_exit(mm); } -static inline int khugepaged_enter(struct vm_area_struct *vma) +static inline int khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) if ((khugepaged_always() || - (khugepaged_req_madv() && - vma->vm_flags & VM_HUGEPAGE)) && - !(vma->vm_flags & VM_NOHUGEPAGE)) + (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) && + !(vm_flags & VM_NOHUGEPAGE)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; @@ -54,11 +55,13 @@ static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline int khugepaged_enter(struct vm_area_struct *vma) +static inline int khugepaged_enter(struct vm_area_struct *vma, + unsigned long vm_flags) { return 0; } -static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 780d12c000e9..de984159cf0b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -803,7 +803,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - if (unlikely(khugepaged_enter(vma))) + if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { @@ -1970,7 +1970,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (unlikely(khugepaged_enter_vma_merge(vma))) + if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) return -ENOMEM; break; case MADV_NOHUGEPAGE: @@ -2071,7 +2071,8 @@ int __khugepaged_enter(struct mm_struct *mm) return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +int khugepaged_enter_vma_merge(struct vm_area_struct *vma, + unsigned long vm_flags) { unsigned long hstart, hend; if (!vma->anon_vma) @@ -2083,11 +2084,11 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) if (vma->vm_ops) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); + VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) - return khugepaged_enter(vma); + return khugepaged_enter(vma, vm_flags); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 7f855206e7fb..87e82b38453c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1080,7 +1080,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(prev); + khugepaged_enter_vma_merge(prev, vm_flags); return prev; } @@ -1099,7 +1099,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; - khugepaged_enter_vma_merge(area); + khugepaged_enter_vma_merge(area, vm_flags); return area; } @@ -2208,7 +2208,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } @@ -2277,7 +2277,7 @@ int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); - khugepaged_enter_vma_merge(vma); + khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(vma->vm_mm); return error; } -- cgit v1.2.3-59-g8ed1b From 3a3c02ecf7f2852f122d6d16fb9b3d9cb0c6f201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:46 -0700 Subject: mm: page-writeback: inline account_page_dirtied() into single caller A follow-up patch would have changed the call signature. To save the trouble, just fold it instead. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [3.17.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/page-writeback.c | 23 ++++------------------- 2 files changed, 4 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 27eb1bfbe704..b46461116cd2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1235,7 +1235,6 @@ int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); -void account_page_writeback(struct page *page); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int clear_page_dirty_for_io(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff24c9d83112..ff6a5b07211e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2115,23 +2115,6 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) } EXPORT_SYMBOL(account_page_dirtied); -/* - * Helper function for set_page_writeback family. - * - * The caller must hold mem_cgroup_begin/end_update_page_stat() lock - * while calling this function. - * See test_set_page_writeback for example. - * - * NOTE: Unlike account_page_dirtied this does not rely on being atomic - * wrt interrupts. - */ -void account_page_writeback(struct page *page) -{ - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); - inc_zone_page_state(page, NR_WRITEBACK); -} -EXPORT_SYMBOL(account_page_writeback); - /* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. @@ -2410,8 +2393,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } else { ret = TestSetPageWriteback(page); } - if (!ret) - account_page_writeback(page); + if (!ret) { + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + inc_zone_page_state(page, NR_WRITEBACK); + } mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); return ret; -- cgit v1.2.3-59-g8ed1b From d7365e783edb858279be1d03f61bc8d5d3383d90 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 29 Oct 2014 14:50:48 -0700 Subject: mm: memcontrol: fix missed end-writeback page accounting Commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") changed page migration to uncharge the old page right away. The page is locked, unmapped, truncated, and off the LRU, but it could race with writeback ending, which then doesn't unaccount the page properly: test_clear_page_writeback() migration wait_on_page_writeback() TestClearPageWriteback() mem_cgroup_migrate() clear PCG_USED mem_cgroup_update_page_stat() if (PageCgroupUsed(pc)) decrease memcg pages under writeback release pc->mem_cgroup->move_lock The per-page statistics interface is heavily optimized to avoid a function call and a lookup_page_cgroup() in the file unmap fast path, which means it doesn't verify whether a page is still charged before clearing PageWriteback() and it has to do it in the stat update later. Rework it so that it looks up the page's memcg once at the beginning of the transaction and then uses it throughout. The charge will be verified before clearing PageWriteback() and migration can't uncharge the page as long as that is still set. The RCU lock will protect the memcg past uncharge. As far as losing the optimization goes, the following test results are from a microbenchmark that maps, faults, and unmaps a 4GB sparse file three times in a nested fashion, so that there are two negative passes that don't account but still go through the new transaction overhead. There is no actual difference: old: 33.195102545 seconds time elapsed ( +- 0.01% ) new: 33.199231369 seconds time elapsed ( +- 0.03% ) The time spent in page_remove_rmap()'s callees still adds up to the same, but the time spent in the function itself seems reduced: # Children Self Command Shared Object Symbol old: 0.12% 0.11% filemapstress [kernel.kallsyms] [k] page_remove_rmap new: 0.12% 0.08% filemapstress [kernel.kallsyms] [k] page_remove_rmap Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [3.17.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 58 ++++++++----------------- mm/memcontrol.c | 105 ++++++++++++++++++++++++--------------------- mm/page-writeback.c | 22 +++++----- mm/rmap.c | 20 ++++----- 4 files changed, 96 insertions(+), 109 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 19df5d857411..6b75640ef5ab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -139,48 +139,23 @@ static inline bool mem_cgroup_disabled(void) return false; } -void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, - unsigned long *flags); - -extern atomic_t memcg_moving; - -static inline void mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) -{ - if (mem_cgroup_disabled()) - return; - rcu_read_lock(); - *locked = false; - if (atomic_read(&memcg_moving)) - __mem_cgroup_begin_update_page_stat(page, locked, flags); -} - -void __mem_cgroup_end_update_page_stat(struct page *page, - unsigned long *flags); -static inline void mem_cgroup_end_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) -{ - if (mem_cgroup_disabled()) - return; - if (*locked) - __mem_cgroup_end_update_page_stat(page, flags); - rcu_read_unlock(); -} - -void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, - int val); - -static inline void mem_cgroup_inc_page_stat(struct page *page, +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, + unsigned long *flags); +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags); +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, + enum mem_cgroup_stat_index idx, int val); + +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { - mem_cgroup_update_page_stat(page, idx, 1); + mem_cgroup_update_page_stat(memcg, idx, 1); } -static inline void mem_cgroup_dec_page_stat(struct page *page, +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { - mem_cgroup_update_page_stat(page, idx, -1); + mem_cgroup_update_page_stat(memcg, idx, -1); } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, @@ -315,13 +290,14 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_begin_update_page_stat(struct page *page, +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, unsigned long *flags) { + return NULL; } -static inline void mem_cgroup_end_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, + bool locked, unsigned long flags) { } @@ -343,12 +319,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -static inline void mem_cgroup_inc_page_stat(struct page *page, +static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { } -static inline void mem_cgroup_dec_page_stat(struct page *page, +static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 23976fd885fd..d6ac0e33e150 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) * start move here. */ -/* for quick checking without looking up memcg */ -atomic_t memcg_moving __read_mostly; - static void mem_cgroup_start_move(struct mem_cgroup *memcg) { - atomic_inc(&memcg_moving); atomic_inc(&memcg->moving_account); synchronize_rcu(); } @@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) * Now, mem_cgroup_clear_mc() may call this function with NULL. * We check NULL in callee rather than caller. */ - if (memcg) { - atomic_dec(&memcg_moving); + if (memcg) atomic_dec(&memcg->moving_account); - } } /* @@ -2204,41 +2198,52 @@ cleanup: return true; } -/* - * Used to update mapped file or writeback or other statistics. +/** + * mem_cgroup_begin_page_stat - begin a page state statistics transaction + * @page: page that is going to change accounted state + * @locked: &memcg->move_lock slowpath was taken + * @flags: IRQ-state flags for &memcg->move_lock * - * Notes: Race condition + * This function must mark the beginning of an accounted page state + * change to prevent double accounting when the page is concurrently + * being moved to another memcg: * - * Charging occurs during page instantiation, while the page is - * unmapped and locked in page migration, or while the page table is - * locked in THP migration. No race is possible. + * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * if (TestClearPageState(page)) + * mem_cgroup_update_page_stat(memcg, state, -1); + * mem_cgroup_end_page_stat(memcg, locked, flags); * - * Uncharge happens to pages with zero references, no race possible. + * The RCU lock is held throughout the transaction. The fast path can + * get away without acquiring the memcg->move_lock (@locked is false) + * because page moving starts with an RCU grace period. * - * Charge moving between groups is protected by checking mm->moving - * account and taking the move_lock in the slowpath. + * The RCU lock also protects the memcg from being freed when the page + * state that is going to change is the only thing preventing the page + * from being uncharged. E.g. end-writeback clearing PageWriteback(), + * which allows migration to go ahead and uncharge the page before the + * account transaction might be complete. */ - -void __mem_cgroup_begin_update_page_stat(struct page *page, - bool *locked, unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, + bool *locked, + unsigned long *flags) { struct mem_cgroup *memcg; struct page_cgroup *pc; + rcu_read_lock(); + + if (mem_cgroup_disabled()) + return NULL; + pc = lookup_page_cgroup(page); again: memcg = pc->mem_cgroup; if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - /* - * If this memory cgroup is not under account moving, we don't - * need to take move_lock_mem_cgroup(). Because we already hold - * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock(). - */ - VM_BUG_ON(!rcu_read_lock_held()); + return NULL; + + *locked = false; if (atomic_read(&memcg->moving_account) <= 0) - return; + return memcg; move_lock_mem_cgroup(memcg, flags); if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { @@ -2246,36 +2251,40 @@ again: goto again; } *locked = true; + + return memcg; } -void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) +/** + * mem_cgroup_end_page_stat - finish a page state statistics transaction + * @memcg: the memcg that was accounted against + * @locked: value received from mem_cgroup_begin_page_stat() + * @flags: value received from mem_cgroup_begin_page_stat() + */ +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, + unsigned long flags) { - struct page_cgroup *pc = lookup_page_cgroup(page); + if (memcg && locked) + move_unlock_mem_cgroup(memcg, &flags); - /* - * It's guaranteed that pc->mem_cgroup never changes while - * lock is held because a routine modifies pc->mem_cgroup - * should take move_lock_mem_cgroup(). - */ - move_unlock_mem_cgroup(pc->mem_cgroup, flags); + rcu_read_unlock(); } -void mem_cgroup_update_page_stat(struct page *page, +/** + * mem_cgroup_update_page_stat - update page state statistics + * @memcg: memcg to account against + * @idx: page state item to account + * @val: number of pages (positive or negative) + * + * See mem_cgroup_begin_page_stat() for locking requirements. + */ +void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val) { - struct mem_cgroup *memcg; - struct page_cgroup *pc = lookup_page_cgroup(page); - unsigned long uninitialized_var(flags); - - if (mem_cgroup_disabled()) - return; - VM_BUG_ON(!rcu_read_lock_held()); - memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) - return; - this_cpu_add(memcg->stat->count[idx], val); + if (memcg) + this_cpu_add(memcg->stat->count[idx], val); } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ff6a5b07211e..19ceae87522d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2327,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2352,22 +2353,23 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - int ret; - bool locked; unsigned long memcg_flags; + struct mem_cgroup *memcg; + bool locked; + int ret; - mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2394,10 +2396,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg, locked, memcg_flags); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 116a5053415b..f574046f77d4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1042,15 +1042,16 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - bool locked; + struct mem_cgroup *memcg; unsigned long flags; + bool locked; - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); } /** @@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct mem_cgroup *uninitialized_var(memcg); bool anon = PageAnon(page); - bool locked; unsigned long flags; + bool locked; /* * The anon case has no mem_cgroup page_stat to update; but may @@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page) * we hold the lock against page_stat move: so avoid it on anon. */ if (!anon) - mem_cgroup_begin_update_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) @@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page) -hpage_nr_pages(page)); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - return; out: if (!anon) - mem_cgroup_end_update_page_stat(page, &locked, &flags); + mem_cgroup_end_page_stat(memcg, locked, flags); } /* -- cgit v1.2.3-59-g8ed1b From 39bb5e62867de82b269b07df900165029b928359 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Oct 2014 10:32:34 -0700 Subject: net: skb_fclone_busy() needs to detect orphaned skb Some drivers are unable to perform TX completions in a bound time. They instead call skb_orphan() Problem is skb_fclone_busy() has to detect this case, otherwise we block TCP retransmits and can freeze unlucky tcp sessions on mostly idle hosts. Signed-off-by: Eric Dumazet Fixes: 1f3279ae0c13 ("tcp: avoid retransmits of TCP packets hanging in host queues") Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 ++++++-- net/ipv4/tcp_output.c | 2 +- net/xfrm/xfrm_policy.c | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5884f95ff0e9..6c8b6f604e76 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -799,15 +799,19 @@ struct sk_buff_fclones { * @skb: buffer * * Returns true is skb is a fast clone, and its clone is not freed. + * Some drivers call skb_orphan() in their ndo_start_xmit(), + * so we also check that this didnt happen. */ -static inline bool skb_fclone_busy(const struct sk_buff *skb) +static inline bool skb_fclone_busy(const struct sock *sk, + const struct sk_buff *skb) { const struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); return skb->fclone == SKB_FCLONE_ORIG && - fclones->skb2.fclone == SKB_FCLONE_CLONE; + fclones->skb2.fclone == SKB_FCLONE_CLONE && + fclones->skb2.sk == sk; } static inline struct sk_buff *alloc_skb_fclone(unsigned int size, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3af21296d967..a3d453b94747 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2126,7 +2126,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4c4e457e7888..88bf289abdc9 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1962,7 +1962,7 @@ static int xdst_queue_output(struct sock *sk, struct sk_buff *skb) struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; - if (unlikely(skb_fclone_busy(skb))) { + if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } -- cgit v1.2.3-59-g8ed1b From b2de525f095708b2adbadaec3f1e4017a23d1e09 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 29 Sep 2014 10:21:10 -0400 Subject: Return short read or 0 at end of a raw device, not EIO Author: David Jeffery Changes to the basic direct I/O code have broken the raw driver when reading to the end of a raw device. Instead of returning a short read for a read that extends partially beyond the device's end or 0 when at the end of the device, these reads now return EIO. The raw driver needs the same end of device handling as was added for normal block devices. Using blkdev_read_iter, which has the needed size checks, prevents the EIO conditions at the end of the device. Signed-off-by: David Jeffery Signed-off-by: Al Viro --- drivers/char/raw.c | 2 +- fs/block_dev.c | 3 ++- include/linux/fs.h | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 0102dc788608..a24891b97547 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd, static const struct file_operations raw_fops = { .read = new_sync_read, - .read_iter = generic_file_read_iter, + .read_iter = blkdev_read_iter, .write = new_sync_write, .write_iter = blkdev_write_iter, .fsync = blkdev_fsync, diff --git a/fs/block_dev.c b/fs/block_dev.c index cc9d4114cda0..1d9c9f3754f8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1585,7 +1585,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) } EXPORT_SYMBOL_GPL(blkdev_write_iter); -static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = file->f_mapping->host; @@ -1599,6 +1599,7 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) iov_iter_truncate(to, size); return generic_file_read_iter(iocb, to); } +EXPORT_SYMBOL_GPL(blkdev_read_iter); /* * Try to release a page associated with block device when the system diff --git a/include/linux/fs.h b/include/linux/fs.h index 01036262095f..9ab779e8a63c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2469,6 +2469,7 @@ extern ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo extern ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); /* fs/block_dev.c */ +extern ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to); extern ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from); extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync); -- cgit v1.2.3-59-g8ed1b From a87fa1d81a9fb5e9adca9820e16008c40ad09f33 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Mon, 3 Nov 2014 15:15:35 +0000 Subject: of: Fix overflow bug in string property parsing functions The string property read helpers will run off the end of the buffer if it is handed a malformed string property. Rework the parsers to make sure that doesn't happen. At the same time add new test cases to make sure the functions behave themselves. The original implementations of of_property_read_string_index() and of_property_count_strings() both open-coded the same block of parsing code, each with it's own subtly different bugs. The fix here merges functions into a single helper and makes the original functions static inline wrappers around the helper. One non-bugfix aspect of this patch is the addition of a new wrapper, of_property_read_string_array(). The new wrapper is needed by the device_properties feature that Rafael is working on and planning to merge for v3.19. The implementation is identical both with and without the new static inline wrapper, so it just got left in to reduce the churn on the header file. Signed-off-by: Grant Likely Cc: Rafael J. Wysocki Cc: Mika Westerberg Cc: Rob Herring Cc: Arnd Bergmann Cc: Darren Hart Cc: # v3.3+: Drop selftest hunks that don't apply --- drivers/of/base.c | 88 ++++++++--------------------- drivers/of/selftest.c | 66 ++++++++++++++++++++-- drivers/of/testcase-data/tests-phandle.dtsi | 2 + include/linux/of.h | 84 ++++++++++++++++++++++----- 4 files changed, 154 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index 2305dc0382bc..3823edf2d012 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -1279,52 +1279,6 @@ int of_property_read_string(struct device_node *np, const char *propname, } EXPORT_SYMBOL_GPL(of_property_read_string); -/** - * of_property_read_string_index - Find and read a string from a multiple - * strings property. - * @np: device node from which the property value is to be read. - * @propname: name of the property to be searched. - * @index: index of the string in the list of strings - * @out_string: pointer to null terminated return string, modified only if - * return value is 0. - * - * Search for a property in a device tree node and retrieve a null - * terminated string value (pointer to data, not a copy) in the list of strings - * contained in that property. - * Returns 0 on success, -EINVAL if the property does not exist, -ENODATA if - * property does not have a value, and -EILSEQ if the string is not - * null-terminated within the length of the property data. - * - * The out_string pointer is modified only if a valid string can be decoded. - */ -int of_property_read_string_index(struct device_node *np, const char *propname, - int index, const char **output) -{ - struct property *prop = of_find_property(np, propname, NULL); - int i = 0; - size_t l = 0, total = 0; - const char *p; - - if (!prop) - return -EINVAL; - if (!prop->value) - return -ENODATA; - if (strnlen(prop->value, prop->length) >= prop->length) - return -EILSEQ; - - p = prop->value; - - for (i = 0; total < prop->length; total += l, p += l) { - l = strlen(p) + 1; - if (i++ == index) { - *output = p; - return 0; - } - } - return -ENODATA; -} -EXPORT_SYMBOL_GPL(of_property_read_string_index); - /** * of_property_match_string() - Find string in a list and return index * @np: pointer to node containing string list property @@ -1351,7 +1305,7 @@ int of_property_match_string(struct device_node *np, const char *propname, end = p + prop->length; for (i = 0; p < end; i++, p += l) { - l = strlen(p) + 1; + l = strnlen(p, end - p) + 1; if (p + l > end) return -EILSEQ; pr_debug("comparing %s with %s\n", string, p); @@ -1363,39 +1317,41 @@ int of_property_match_string(struct device_node *np, const char *propname, EXPORT_SYMBOL_GPL(of_property_match_string); /** - * of_property_count_strings - Find and return the number of strings from a - * multiple strings property. + * of_property_read_string_util() - Utility helper for parsing string properties * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. + * @out_strs: output array of string pointers. + * @sz: number of array elements to read. + * @skip: Number of strings to skip over at beginning of list. * - * Search for a property in a device tree node and retrieve the number of null - * terminated string contain in it. Returns the number of strings on - * success, -EINVAL if the property does not exist, -ENODATA if property - * does not have a value, and -EILSEQ if the string is not null-terminated - * within the length of the property data. + * Don't call this function directly. It is a utility helper for the + * of_property_read_string*() family of functions. */ -int of_property_count_strings(struct device_node *np, const char *propname) +int of_property_read_string_helper(struct device_node *np, const char *propname, + const char **out_strs, size_t sz, int skip) { struct property *prop = of_find_property(np, propname, NULL); - int i = 0; - size_t l = 0, total = 0; - const char *p; + int l = 0, i = 0; + const char *p, *end; if (!prop) return -EINVAL; if (!prop->value) return -ENODATA; - if (strnlen(prop->value, prop->length) >= prop->length) - return -EILSEQ; - p = prop->value; + end = p + prop->length; - for (i = 0; total < prop->length; total += l, p += l, i++) - l = strlen(p) + 1; - - return i; + for (i = 0; p < end && (!out_strs || i < skip + sz); i++, p += l) { + l = strnlen(p, end - p) + 1; + if (p + l > end) + return -EILSEQ; + if (out_strs && i >= skip) + *out_strs++ = p; + } + i -= skip; + return i <= 0 ? -ENODATA : i; } -EXPORT_SYMBOL_GPL(of_property_count_strings); +EXPORT_SYMBOL_GPL(of_property_read_string_helper); void of_print_phandle_args(const char *msg, const struct of_phandle_args *args) { diff --git a/drivers/of/selftest.c b/drivers/of/selftest.c index 78001270a598..11b873c54a77 100644 --- a/drivers/of/selftest.c +++ b/drivers/of/selftest.c @@ -339,8 +339,9 @@ static void __init of_selftest_parse_phandle_with_args(void) selftest(rc == -EINVAL, "expected:%i got:%i\n", -EINVAL, rc); } -static void __init of_selftest_property_match_string(void) +static void __init of_selftest_property_string(void) { + const char *strings[4]; struct device_node *np; int rc; @@ -357,13 +358,66 @@ static void __init of_selftest_property_match_string(void) rc = of_property_match_string(np, "phandle-list-names", "third"); selftest(rc == 2, "third expected:0 got:%i\n", rc); rc = of_property_match_string(np, "phandle-list-names", "fourth"); - selftest(rc == -ENODATA, "unmatched string; rc=%i", rc); + selftest(rc == -ENODATA, "unmatched string; rc=%i\n", rc); rc = of_property_match_string(np, "missing-property", "blah"); - selftest(rc == -EINVAL, "missing property; rc=%i", rc); + selftest(rc == -EINVAL, "missing property; rc=%i\n", rc); rc = of_property_match_string(np, "empty-property", "blah"); - selftest(rc == -ENODATA, "empty property; rc=%i", rc); + selftest(rc == -ENODATA, "empty property; rc=%i\n", rc); rc = of_property_match_string(np, "unterminated-string", "blah"); - selftest(rc == -EILSEQ, "unterminated string; rc=%i", rc); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + + /* of_property_count_strings() tests */ + rc = of_property_count_strings(np, "string-property"); + selftest(rc == 1, "Incorrect string count; rc=%i\n", rc); + rc = of_property_count_strings(np, "phandle-list-names"); + selftest(rc == 3, "Incorrect string count; rc=%i\n", rc); + rc = of_property_count_strings(np, "unterminated-string"); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + rc = of_property_count_strings(np, "unterminated-string-list"); + selftest(rc == -EILSEQ, "unterminated string array; rc=%i\n", rc); + + /* of_property_read_string_index() tests */ + rc = of_property_read_string_index(np, "string-property", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "foobar"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "string-property", 1, strings); + selftest(rc == -ENODATA && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "first"), "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 1, strings); + selftest(rc == 0 && !strcmp(strings[0], "second"), "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "phandle-list-names", 2, strings); + selftest(rc == 0 && !strcmp(strings[0], "third"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "phandle-list-names", 3, strings); + selftest(rc == -ENODATA && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "unterminated-string", 0, strings); + selftest(rc == -EILSEQ && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + rc = of_property_read_string_index(np, "unterminated-string-list", 0, strings); + selftest(rc == 0 && !strcmp(strings[0], "first"), "of_property_read_string_index() failure; rc=%i\n", rc); + strings[0] = NULL; + rc = of_property_read_string_index(np, "unterminated-string-list", 2, strings); /* should fail */ + selftest(rc == -EILSEQ && strings[0] == NULL, "of_property_read_string_index() failure; rc=%i\n", rc); + strings[1] = NULL; + + /* of_property_read_string_array() tests */ + rc = of_property_read_string_array(np, "string-property", strings, 4); + selftest(rc == 1, "Incorrect string count; rc=%i\n", rc); + rc = of_property_read_string_array(np, "phandle-list-names", strings, 4); + selftest(rc == 3, "Incorrect string count; rc=%i\n", rc); + rc = of_property_read_string_array(np, "unterminated-string", strings, 4); + selftest(rc == -EILSEQ, "unterminated string; rc=%i\n", rc); + /* -- An incorrectly formed string should cause a failure */ + rc = of_property_read_string_array(np, "unterminated-string-list", strings, 4); + selftest(rc == -EILSEQ, "unterminated string array; rc=%i\n", rc); + /* -- parsing the correctly formed strings should still work: */ + strings[2] = NULL; + rc = of_property_read_string_array(np, "unterminated-string-list", strings, 2); + selftest(rc == 2 && strings[2] == NULL, "of_property_read_string_array() failure; rc=%i\n", rc); + strings[1] = NULL; + rc = of_property_read_string_array(np, "phandle-list-names", strings, 1); + selftest(rc == 1 && strings[1] == NULL, "Overwrote end of string array; rc=%i, str='%s'\n", rc, strings[1]); } #define propcmp(p1, p2) (((p1)->length == (p2)->length) && \ @@ -881,7 +935,7 @@ static int __init of_selftest(void) of_selftest_find_node_by_name(); of_selftest_dynamic(); of_selftest_parse_phandle_with_args(); - of_selftest_property_match_string(); + of_selftest_property_string(); of_selftest_property_copy(); of_selftest_changeset(); of_selftest_parse_interrupts(); diff --git a/drivers/of/testcase-data/tests-phandle.dtsi b/drivers/of/testcase-data/tests-phandle.dtsi index ce0fe083d406..5b1527e8a7fb 100644 --- a/drivers/of/testcase-data/tests-phandle.dtsi +++ b/drivers/of/testcase-data/tests-phandle.dtsi @@ -39,7 +39,9 @@ phandle-list-bad-args = <&provider2 1 0>, <&provider3 0>; empty-property; + string-property = "foobar"; unterminated-string = [40 41 42 43]; + unterminated-string-list = "first", "second", [40 41 42 43]; }; }; }; diff --git a/include/linux/of.h b/include/linux/of.h index 6545e7aec7bb..29f0adc5f3e4 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -267,14 +267,12 @@ extern int of_property_read_u64(const struct device_node *np, extern int of_property_read_string(struct device_node *np, const char *propname, const char **out_string); -extern int of_property_read_string_index(struct device_node *np, - const char *propname, - int index, const char **output); extern int of_property_match_string(struct device_node *np, const char *propname, const char *string); -extern int of_property_count_strings(struct device_node *np, - const char *propname); +extern int of_property_read_string_helper(struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index); extern int of_device_is_compatible(const struct device_node *device, const char *); extern int of_device_is_available(const struct device_node *device); @@ -486,15 +484,9 @@ static inline int of_property_read_string(struct device_node *np, return -ENOSYS; } -static inline int of_property_read_string_index(struct device_node *np, - const char *propname, int index, - const char **out_string) -{ - return -ENOSYS; -} - -static inline int of_property_count_strings(struct device_node *np, - const char *propname) +static inline int of_property_read_string_helper(struct device_node *np, + const char *propname, + const char **out_strs, size_t sz, int index) { return -ENOSYS; } @@ -667,6 +659,70 @@ static inline int of_property_count_u64_elems(const struct device_node *np, return of_property_count_elems_of_size(np, propname, sizeof(u64)); } +/** + * of_property_read_string_array() - Read an array of strings from a multiple + * strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @out_strs: output array of string pointers. + * @sz: number of array elements to read. + * + * Search for a property in a device tree node and retrieve a list of + * terminated string values (pointer to data, not a copy) in that property. + * + * If @out_strs is NULL, the number of strings in the property is returned. + */ +static inline int of_property_read_string_array(struct device_node *np, + const char *propname, const char **out_strs, + size_t sz) +{ + return of_property_read_string_helper(np, propname, out_strs, sz, 0); +} + +/** + * of_property_count_strings() - Find and return the number of strings from a + * multiple strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * + * Search for a property in a device tree node and retrieve the number of null + * terminated string contain in it. Returns the number of strings on + * success, -EINVAL if the property does not exist, -ENODATA if property + * does not have a value, and -EILSEQ if the string is not null-terminated + * within the length of the property data. + */ +static inline int of_property_count_strings(struct device_node *np, + const char *propname) +{ + return of_property_read_string_helper(np, propname, NULL, 0, 0); +} + +/** + * of_property_read_string_index() - Find and read a string from a multiple + * strings property. + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @index: index of the string in the list of strings + * @out_string: pointer to null terminated return string, modified only if + * return value is 0. + * + * Search for a property in a device tree node and retrieve a null + * terminated string value (pointer to data, not a copy) in the list of strings + * contained in that property. + * Returns 0 on success, -EINVAL if the property does not exist, -ENODATA if + * property does not have a value, and -EILSEQ if the string is not + * null-terminated within the length of the property data. + * + * The out_string pointer is modified only if a valid string can be decoded. + */ +static inline int of_property_read_string_index(struct device_node *np, + const char *propname, + int index, const char **output) +{ + int rc = of_property_read_string_helper(np, propname, output, 1, index); + return rc < 0 ? rc : 0; +} + /** * of_property_read_bool - Findfrom a property * @np: device node from which the property value is to be read. -- cgit v1.2.3-59-g8ed1b From 32f638fc11db0526c706454d9ab4339d55ac89f3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 30 Oct 2014 10:17:25 -0600 Subject: PCI: Don't oops on virtual buses in acpi_pci_get_bridge_handle() acpi_pci_get_bridge_handle() returns the ACPI handle for the bridge device (either a host bridge or a PCI-to-PCI bridge) leading to a PCI bus. But SR-IOV virtual functions can be on a virtual bus with no bridge leading to it. Return a NULL acpi_handle in this case instead of trying to dereference the NULL pointer to the bridge. This fixes a NULL pointer dereference oops in pci_get_hp_params() when adding SR-IOV VF devices on virtual buses. [bhelgaas: changelog, add comment in code] Fixes: 6cd33649fa83 ("PCI: Add pci_configure_device() during enumeration") Link: https://bugzilla.kernel.org/show_bug.cgi?id=87591 Reported-by: Chao Zhou Reported-by: Joerg Roedel Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas --- include/linux/pci-acpi.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h index 64dacb7288a6..24c7728ca681 100644 --- a/include/linux/pci-acpi.h +++ b/include/linux/pci-acpi.h @@ -41,8 +41,13 @@ static inline acpi_handle acpi_pci_get_bridge_handle(struct pci_bus *pbus) if (pci_is_root_bus(pbus)) dev = pbus->bridge; - else + else { + /* If pbus is a virtual bus, there is no bridge to it */ + if (!pbus->self) + return NULL; + dev = &pbus->self->dev; + } return ACPI_HANDLE(dev); } -- cgit v1.2.3-59-g8ed1b