From 35738392b6c050625e41cc6b941f9828794149b3 Mon Sep 17 00:00:00 2001 From: Chris Cui Date: Tue, 6 May 2014 12:49:58 -0700 Subject: drivers/rtc/rtc-pcf8523.c: fix month definition PCF8523 uses 1-12 to represent month according to datasheet. link: www.nxp.com/documents/data_sheet/PCF8523.pdf. Signed-off-by: Chris Cui Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pcf8523.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-pcf8523.c b/drivers/rtc/rtc-pcf8523.c index 5c8f8226c848..4cdb64be061b 100644 --- a/drivers/rtc/rtc-pcf8523.c +++ b/drivers/rtc/rtc-pcf8523.c @@ -206,7 +206,7 @@ static int pcf8523_rtc_read_time(struct device *dev, struct rtc_time *tm) tm->tm_hour = bcd2bin(regs[2] & 0x3f); tm->tm_mday = bcd2bin(regs[3] & 0x3f); tm->tm_wday = regs[4] & 0x7; - tm->tm_mon = bcd2bin(regs[5] & 0x1f); + tm->tm_mon = bcd2bin(regs[5] & 0x1f) - 1; tm->tm_year = bcd2bin(regs[6]) + 100; return rtc_valid_tm(tm); @@ -229,7 +229,7 @@ static int pcf8523_rtc_set_time(struct device *dev, struct rtc_time *tm) regs[3] = bin2bcd(tm->tm_hour); regs[4] = bin2bcd(tm->tm_mday); regs[5] = tm->tm_wday; - regs[6] = bin2bcd(tm->tm_mon); + regs[6] = bin2bcd(tm->tm_mon + 1); regs[7] = bin2bcd(tm->tm_year - 100); msg.addr = client->addr; -- cgit v1.2.3-59-g8ed1b From 93030d83b9e1079836d82b46ab3ec671b1fdb623 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 6 May 2014 12:49:59 -0700 Subject: slub: fix memcg_propagate_slab_attrs After creating a cache for a memcg we should initialize its sysfs attrs with the values from its parent. That's what memcg_propagate_slab_attrs is for. Currently it's broken - we clearly muddled root-vs-memcg caches there. Let's fix it up. Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 5e234f1f8853..042a47b4d0f5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5071,15 +5071,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) #ifdef CONFIG_MEMCG_KMEM int i; char *buffer = NULL; + struct kmem_cache *root_cache; - if (!is_root_cache(s)) + if (is_root_cache(s)) return; + root_cache = s->memcg_params->root_cache; + /* * This mean this cache had no attribute written. Therefore, no point * in copying default values around */ - if (!s->max_attr_size) + if (!root_cache->max_attr_size) return; for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { @@ -5101,7 +5104,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) */ if (buffer) buf = buffer; - else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5110,7 +5113,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) buf = buffer; } - attr->show(s->memcg_params->root_cache, buf); + attr->show(root_cache, buf); attr->store(s, buf, strlen(buf)); } -- cgit v1.2.3-59-g8ed1b From 457c1b27ed56ec472d202731b12417bff023594a Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Tue, 6 May 2014 12:50:00 -0700 Subject: hugetlb: ensure hugepage access is denied if hugepages are not supported Currently, I am seeing the following when I `mount -t hugetlbfs /none /dev/hugetlbfs`, and then simply do a `ls /dev/hugetlbfs`. I think it's related to the fact that hugetlbfs is properly not correctly setting itself up in this state?: Unable to handle kernel paging request for data at address 0x00000031 Faulting instruction address: 0xc000000000245710 Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2048 NUMA pSeries .... In KVM guests on Power, in a guest not backed by hugepages, we see the following: AnonHugePages: 0 kB HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 64 kB HPAGE_SHIFT == 0 in this configuration, which indicates that hugepages are not supported at boot-time, but this is only checked in hugetlb_init(). Extract the check to a helper function, and use it in a few relevant places. This does make hugetlbfs not supported (not registered at all) in this environment. I believe this is fine, as there are no valid hugepages and that won't change at runtime. [akpm@linux-foundation.org: use pr_info(), per Mel] [akpm@linux-foundation.org: fix build when HPAGE_SHIFT is undefined] Signed-off-by: Nishanth Aravamudan Reviewed-by: Aneesh Kumar K.V Acked-by: Mel Gorman Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 5 +++++ include/linux/hugetlb.h | 10 ++++++++++ mm/hugetlb.c | 19 ++++++++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 204027520937..e19d4c0cacae 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1030,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void) int error; int i; + if (!hugepages_supported()) { + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5b337cf8fb86..b65166de1d9d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -412,6 +412,16 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h, return &mm->page_table_lock; } +static inline bool hugepages_supported(void) +{ + /* + * Some platform decide whether they support huge pages at boot + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when + * there is no such support + */ + return HPAGE_SHIFT != 0; +} + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 246192929a2d..c82290b9c1fc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1981,11 +1981,7 @@ static int __init hugetlb_init(void) { int i; - /* Some platform decide whether they support huge pages at boot - * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when - * there is no such support - */ - if (HPAGE_SHIFT == 0) + if (!hugepages_supported()) return 0; if (!size_to_hstate(default_hstate_size)) { @@ -2112,6 +2108,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->max_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2165,6 +2164,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, unsigned long tmp; int ret; + if (!hugepages_supported()) + return -ENOTSUPP; + tmp = h->nr_overcommit_huge_pages; if (write && h->order >= MAX_ORDER) @@ -2190,6 +2192,8 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return; seq_printf(m, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" @@ -2206,6 +2210,8 @@ void hugetlb_report_meminfo(struct seq_file *m) int hugetlb_report_node_meminfo(int nid, char *buf) { struct hstate *h = &default_hstate; + if (!hugepages_supported()) + return 0; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" @@ -2220,6 +2226,9 @@ void hugetlb_show_meminfo(void) struct hstate *h; int nid; + if (!hugepages_supported()) + return; + for_each_node_state(nid, N_MEMORY) for_each_hstate(h) pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", -- cgit v1.2.3-59-g8ed1b From d5c9fde3dae750889168807038243ff36431d276 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 6 May 2014 12:50:01 -0700 Subject: mm/page-writeback.c: fix divide by zero in pos_ratio_polynom It is possible for "limit - setpoint + 1" to equal zero, after getting truncated to a 32 bit variable, and resulting in a divide by zero error. Using the fully 64 bit divide functions avoids this problem. It also will cause pos_ratio_polynom() to return the correct value when (setpoint - limit) exceeds 2^32. Also uninline pos_ratio_polynom, at Andrew's request. Signed-off-by: Rik van Riel Reviewed-by: Michal Hocko Cc: Aneesh Kumar K.V Cc: Mel Gorman Cc: Nishanth Aravamudan Cc: Luiz Capitulino Cc: Masayoshi Mizuma Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index ef413492a149..a4317da60532 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -593,14 +593,14 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) * => fast response on large errors; small oscillation near setpoint */ -static inline long long pos_ratio_polynom(unsigned long setpoint, +static long long pos_ratio_polynom(unsigned long setpoint, unsigned long dirty, unsigned long limit) { long long pos_ratio; long x; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, limit - setpoint + 1); pos_ratio = x; pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; @@ -842,7 +842,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, x_intercept = bdi_setpoint + span; if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty), + pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), x_intercept - bdi_setpoint + 1); } else pos_ratio /= 4; -- cgit v1.2.3-59-g8ed1b From 0e3b7e5402cbec4cee58895a00945356ee26a720 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 6 May 2014 12:50:02 -0700 Subject: MAINTAINERS: zswap/zbud: change maintainer email address sjenning@linux.vnet.ibm.com is no longer a viable entity. Signed-off-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7578deb8ff20..51ebb779c5f3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9960,7 +9960,7 @@ F: drivers/net/hamradio/*scc.c F: drivers/net/hamradio/z8530.h ZBUD COMPRESSED PAGE ALLOCATOR -M: Seth Jennings +M: Seth Jennings L: linux-mm@kvack.org S: Maintained F: mm/zbud.c @@ -10005,7 +10005,7 @@ F: mm/zsmalloc.c F: include/linux/zsmalloc.h ZSWAP COMPRESSED SWAP CACHING -M: Seth Jennings +M: Seth Jennings L: linux-mm@kvack.org S: Maintained F: mm/zswap.c -- cgit v1.2.3-59-g8ed1b From 49e068f0b73dd042c186ffa9b420a9943e90389a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 6 May 2014 12:50:03 -0700 Subject: mm/compaction: make isolate_freepages start at pageblock boundary The compaction freepage scanner implementation in isolate_freepages() starts by taking the current cc->free_pfn value as the first pfn. In a for loop, it scans from this first pfn to the end of the pageblock, and then subtracts pageblock_nr_pages from the first pfn to obtain the first pfn for the next for loop iteration. This means that when cc->free_pfn starts at offset X rather than being aligned on pageblock boundary, the scanner will start at offset X in all scanned pageblock, ignoring potentially many free pages. Currently this can happen when a) zone's end pfn is not pageblock aligned, or b) through zone->compact_cached_free_pfn with CONFIG_HOLES_IN_ZONE enabled and a hole spanning the beginning of a pageblock This patch fixes the problem by aligning the initial pfn in isolate_freepages() to pageblock boundary. This also permits replacing the end-of-pageblock alignment within the for loop with a simple pageblock_nr_pages increment. Signed-off-by: Vlastimil Babka Reported-by: Heesub Shin Acked-by: Minchan Kim Cc: Mel Gorman Acked-by: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Acked-by: Rik van Riel Cc: Dongjun Shin Cc: Sunghwan Yun Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 37f976287068..627dc2e4320f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -671,16 +671,20 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn, end_pfn; + unsigned long high_pfn, low_pfn, pfn, z_end_pfn; int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; /* * Initialise the free scanner. The starting point is where we last - * scanned from (or the end of the zone if starting). The low point - * is the end of the pageblock the migration scanner is using. + * successfully isolated from, zone-cached value, or the end of the + * zone when isolating for the first time. We need this aligned to + * the pageblock boundary, because we do pfn -= pageblock_nr_pages + * in the for loop. + * The low boundary is the end of the pageblock the migration scanner + * is using. */ - pfn = cc->free_pfn; + pfn = cc->free_pfn & ~(pageblock_nr_pages-1); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* @@ -700,6 +704,7 @@ static void isolate_freepages(struct zone *zone, for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; + unsigned long end_pfn; /* * This can iterate a massively long zone without finding any @@ -734,13 +739,10 @@ static void isolate_freepages(struct zone *zone, isolated = 0; /* - * As pfn may not start aligned, pfn+pageblock_nr_page - * may cross a MAX_ORDER_NR_PAGES boundary and miss - * a pfn_valid check. Ensure isolate_freepages_block() - * only scans within a pageblock + * Take care when isolating in last pageblock of a zone which + * ends in the middle of a pageblock. */ - end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); - end_pfn = min(end_pfn, z_end_pfn); + end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); isolated = isolate_freepages_block(cc, pfn, end_pfn, freelist, false); nr_freepages += isolated; -- cgit v1.2.3-59-g8ed1b From 139b6a6fb1539e04b01663d61baff3088c63dbb5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 May 2014 12:50:05 -0700 Subject: mm: filemap: update find_get_pages_tag() to deal with shadow entries Dave Jones reports the following crash when find_get_pages_tag() runs into an exceptional entry: kernel BUG at mm/filemap.c:1347! RIP: find_get_pages_tag+0x1cb/0x220 Call Trace: find_get_pages_tag+0x36/0x220 pagevec_lookup_tag+0x21/0x30 filemap_fdatawait_range+0xbe/0x1e0 filemap_fdatawait+0x27/0x30 sync_inodes_sb+0x204/0x2a0 sync_inodes_one_sb+0x19/0x20 iterate_supers+0xb2/0x110 sys_sync+0x44/0xb0 ia32_do_call+0x13/0x13 1343 /* 1344 * This function is never used on a shmem/tmpfs 1345 * mapping, so a swap entry won't be found here. 1346 */ 1347 BUG(); After commit 0cd6144aadd2 ("mm + fs: prepare for non-page entries in page cache radix trees") this comment and BUG() are out of date because exceptional entries can now appear in all mappings - as shadows of recently evicted pages. However, as Hugh Dickins notes, "it is truly surprising for a PAGECACHE_TAG_WRITEBACK (and probably any other PAGECACHE_TAG_*) to appear on an exceptional entry. I expect it comes down to an occasional race in RCU lookup of the radix_tree: lacking absolute synchronization, we might sometimes catch an exceptional entry, with the tag which really belongs with the unexceptional entry which was there an instant before." And indeed, not only is the tree walk lockless, the tags are also read in chunks, one radix tree node at a time. There is plenty of time for page reclaim to swoop in and replace a page that was already looked up as tagged with a shadow entry. Remove the BUG() and update the comment. While reviewing all other lookup sites for whether they properly deal with shadow entries of evicted pages, update all the comments and fix memcg file charge moving to not miss shmem/tmpfs swapcache pages. Fixes: 0cd6144aadd2 ("mm + fs: prepare for non-page entries in page cache radix trees") Signed-off-by: Johannes Weiner Reported-by: Dave Jones Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 49 ++++++++++++++++++++++++++++--------------------- mm/memcontrol.c | 20 ++++++++++++-------- mm/truncate.c | 8 -------- 3 files changed, 40 insertions(+), 37 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 5020b280a771..000a220e2a41 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -906,8 +906,8 @@ EXPORT_SYMBOL(page_cache_prev_hole); * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * - * If the slot holds a shadow entry of a previously evicted page, it - * is returned. + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. * * Otherwise, %NULL is returned. */ @@ -928,9 +928,9 @@ repeat: if (radix_tree_deref_retry(page)) goto repeat; /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so return it without - * attempting to raise page count. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. */ goto out; } @@ -983,8 +983,8 @@ EXPORT_SYMBOL(find_get_page); * page cache page, it is returned locked and with an increased * refcount. * - * If the slot holds a shadow entry of a previously evicted page, it - * is returned. + * If the slot holds a shadow entry of a previously evicted page, or a + * swap entry from shmem/tmpfs, it is returned. * * Otherwise, %NULL is returned. * @@ -1099,8 +1099,8 @@ EXPORT_SYMBOL(find_or_create_page); * with ascending indexes. There may be holes in the indices due to * not-present pages. * - * Any shadow entries of evicted pages are included in the returned - * array. + * Any shadow entries of evicted pages, or swap entries from + * shmem/tmpfs, are included in the returned array. * * find_get_entries() returns the number of pages and shadow entries * which were found. @@ -1128,9 +1128,9 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Return + * it without attempting to raise page count. */ goto export; } @@ -1198,9 +1198,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so skip over it - - * we only reach this from invalidate_mapping_pages(). + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Skip + * over it. */ continue; } @@ -1265,9 +1265,9 @@ repeat: goto restart; } /* - * Otherwise, shmem/tmpfs must be storing a swap entry - * here as an exceptional entry: so stop looking for - * contiguous pages. + * A shadow entry of a recently evicted page, + * or a swap entry from shmem/tmpfs. Stop + * looking for contiguous pages. */ break; } @@ -1341,10 +1341,17 @@ repeat: goto restart; } /* - * This function is never used on a shmem/tmpfs - * mapping, so a swap entry won't be found here. + * A shadow entry of a recently evicted page. + * + * Those entries should never be tagged, but + * this tree walk is lockless and the tags are + * looked up in bulk, one radix tree node at a + * time, so there is a sizable window for page + * reclaim to evict a page we saw tagged. + * + * Skip over it. */ - BUG(); + continue; } if (!page_cache_get_speculative(page)) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29501f040568..c47dffdcb246 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6686,16 +6686,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - page = find_get_page(mapping, pgoff); - #ifdef CONFIG_SWAP /* shmem/tmpfs may report page out on swap: account for that too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - if (do_swap_account) - *entry = swap; - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + if (do_swap_account) + *entry = swp; + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif return page; } diff --git a/mm/truncate.c b/mm/truncate.c index e5cc39ab0751..6a78c814bebf 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -484,14 +484,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; - /* - * Note: this function may get called on a shmem/tmpfs mapping: - * pagevec_lookup() might then return 0 prematurely (because it - * got a gangful of swap entries); but it's hardly worth worrying - * about - it can rarely have anything to free from such a mapping - * (most pages are dirty), and already skips over any difficulties. - */ - pagevec_init(&pvec, 0); while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, -- cgit v1.2.3-59-g8ed1b From 6b6751f7feba68d8f5c72b72cc69a1c5a625529c Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 6 May 2014 12:50:06 -0700 Subject: autofs: fix lockref lookup autofs needs to be able to see private data dentry flags for its dentrys that are being created but not yet hashed and for its dentrys that have been rmdir()ed but not yet freed. It needs to do this so it can block processes in these states until a status has been returned to indicate the given operation is complete. It does this by keeping two lists, active and expring, of dentrys in this state and uses ->d_release() to keep them stable while it checks the reference count to determine if they should be used. But with the recent lockref changes dentrys being freed sometimes don't transition to a reference count of 0 before being freed so autofs can occassionally use a dentry that is invalid which can lead to a panic. Signed-off-by: Ian Kent Cc: Al Viro Cc: Linus Torvalds Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2caf36ac3e93..cc87c1abac97 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; -- cgit v1.2.3-59-g8ed1b From 623762517e2370be3b3f95f4fe08d6c063a49b06 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 6 May 2014 12:50:07 -0700 Subject: revert "mm: vmscan: do not swap anon pages just because free+file is low" This reverts commit 0bf1457f0cfc ("mm: vmscan: do not swap anon pages just because free+file is low") because it introduced a regression in mostly-anonymous workloads, where reclaim would become ineffective and trap every allocating task in direct reclaim. The problem is that there is a runaway feedback loop in the scan balance between file and anon, where the balance tips heavily towards a tiny thrashing file LRU and anonymous pages are no longer being looked at. The commit in question removed the safe guard that would detect such situations and respond with forced anonymous reclaim. This commit was part of a series to fix premature swapping in loads with relatively little cache, and while it made a small difference, the cure is obviously worse than the disease. Revert it. Signed-off-by: Johannes Weiner Reported-by: Christian Borntraeger Acked-by: Christian Borntraeger Acked-by: Rafael Aquini Cc: Rik van Riel Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3f56c8deb3c0..32c661d66a45 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1915,6 +1915,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + get_lru_size(lruvec, LRU_INACTIVE_FILE); + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (global_reclaim(sc)) { + unsigned long free = zone_page_state(zone, NR_FREE_PAGES); + + if (unlikely(file + free <= high_wmark_pages(zone))) { + scan_balance = SCAN_ANON; + goto out; + } + } + /* * There is enough inactive page cache, do not reclaim * anything from the anonymous working set right now. -- cgit v1.2.3-59-g8ed1b From 41a212859a4dd583d3aa032cdd3efa564c4f189f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 6 May 2014 12:50:08 -0700 Subject: slub: use sysfs'es release mechanism for kmem_cache debugobjects warning during netfilter exit: ------------[ cut here ]------------ WARNING: CPU: 6 PID: 4178 at lib/debugobjects.c:260 debug_print_object+0x8d/0xb0() ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x20 Modules linked in: CPU: 6 PID: 4178 Comm: kworker/u16:2 Tainted: G W 3.11.0-next-20130906-sasha #3984 Workqueue: netns cleanup_net Call Trace: dump_stack+0x52/0x87 warn_slowpath_common+0x8c/0xc0 warn_slowpath_fmt+0x46/0x50 debug_print_object+0x8d/0xb0 __debug_check_no_obj_freed+0xa5/0x220 debug_check_no_obj_freed+0x15/0x20 kmem_cache_free+0x197/0x340 kmem_cache_destroy+0x86/0xe0 nf_conntrack_cleanup_net_list+0x131/0x170 nf_conntrack_pernet_exit+0x5d/0x70 ops_exit_list+0x5e/0x70 cleanup_net+0xfb/0x1c0 process_one_work+0x338/0x550 worker_thread+0x215/0x350 kthread+0xe7/0xf0 ret_from_fork+0x7c/0xb0 Also during dcookie cleanup: WARNING: CPU: 12 PID: 9725 at lib/debugobjects.c:260 debug_print_object+0x8c/0xb0() ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x20 Modules linked in: CPU: 12 PID: 9725 Comm: trinity-c141 Not tainted 3.15.0-rc2-next-20140423-sasha-00018-gc4ff6c4 #408 Call Trace: dump_stack (lib/dump_stack.c:52) warn_slowpath_common (kernel/panic.c:430) warn_slowpath_fmt (kernel/panic.c:445) debug_print_object (lib/debugobjects.c:262) __debug_check_no_obj_freed (lib/debugobjects.c:697) debug_check_no_obj_freed (lib/debugobjects.c:726) kmem_cache_free (mm/slub.c:2689 mm/slub.c:2717) kmem_cache_destroy (mm/slab_common.c:363) dcookie_unregister (fs/dcookies.c:302 fs/dcookies.c:343) event_buffer_release (arch/x86/oprofile/../../../drivers/oprofile/event_buffer.c:153) __fput (fs/file_table.c:217) ____fput (fs/file_table.c:253) task_work_run (kernel/task_work.c:125 (discriminator 1)) do_notify_resume (include/linux/tracehook.h:196 arch/x86/kernel/signal.c:751) int_signal (arch/x86/kernel/entry_64.S:807) Sysfs has a release mechanism. Use that to release the kmem_cache structure if CONFIG_SYSFS is enabled. Only slub is changed - slab currently only supports /proc/slabinfo and not /sys/kernel/slab/*. We talked about adding that and someone was working on it. [akpm@linux-foundation.org: fix CONFIG_SYSFS=n build] [akpm@linux-foundation.org: fix CONFIG_SYSFS=n build even more] Signed-off-by: Christoph Lameter Reported-by: Sasha Levin Tested-by: Sasha Levin Acked-by: Greg KH Cc: Thomas Gleixner Cc: Pekka Enberg Cc: Russell King Cc: Bart Van Assche Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 9 +++++++++ mm/slab.h | 1 + mm/slab_common.c | 13 +++++++++++-- mm/slub.c | 30 ++++++++---------------------- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f2f7398848cf..d82abd40a3c0 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -101,4 +101,13 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; +#ifdef CONFIG_SYSFS +#define SLAB_SUPPORTS_SYSFS +void sysfs_slab_remove(struct kmem_cache *); +#else +static inline void sysfs_slab_remove(struct kmem_cache *s) +{ +} +#endif + #endif /* _LINUX_SLUB_DEF_H */ diff --git a/mm/slab.h b/mm/slab.h index 3045316b7c9d..6bd4c353704f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); +void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; struct file; diff --git a/mm/slab_common.c b/mm/slab_common.c index f3cfccf76dda..102cc6fca3d3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -323,6 +323,12 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) } #endif /* CONFIG_MEMCG_KMEM */ +void slab_kmem_cache_release(struct kmem_cache *s) +{ + kfree(s->name); + kmem_cache_free(kmem_cache, s); +} + void kmem_cache_destroy(struct kmem_cache *s) { get_online_cpus(); @@ -352,8 +358,11 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_barrier(); memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + slab_kmem_cache_release(s); +#endif goto out_put_cpus; out_unlock: diff --git a/mm/slub.c b/mm/slub.c index 042a47b4d0f5..2b1ce697fc4b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -210,14 +210,11 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) { } - static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif @@ -3238,24 +3235,7 @@ static inline int kmem_cache_close(struct kmem_cache *s) int __kmem_cache_shutdown(struct kmem_cache *s) { - int rc = kmem_cache_close(s); - - if (!rc) { - /* - * Since slab_attr_store may take the slab_mutex, we should - * release the lock while removing the sysfs entry in order to - * avoid a deadlock. Because this is pretty much the last - * operation we do and the lock will be released shortly after - * that in slab_common.c, we could just move sysfs_slab_remove - * to a later point in common code. We should do that when we - * have a common sysfs framework for all allocators. - */ - mutex_unlock(&slab_mutex); - sysfs_slab_remove(s); - mutex_lock(&slab_mutex); - } - - return rc; + return kmem_cache_close(s); } /******************************************************************** @@ -5122,6 +5102,11 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) #endif } +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5129,6 +5114,7 @@ static const struct sysfs_ops slab_sysfs_ops = { static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -5255,7 +5241,7 @@ out_put_kobj: goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* -- cgit v1.2.3-59-g8ed1b From 1e2ee49f7f1b79f0b14884fe6a602f0411b39552 Mon Sep 17 00:00:00 2001 From: Will Woods Date: Tue, 6 May 2014 12:50:10 -0700 Subject: fanotify: fix -EOVERFLOW with large files on 64-bit On 64-bit systems, O_LARGEFILE is automatically added to flags inside the open() syscall (also openat(), blkdev_open(), etc). Userspace therefore defines O_LARGEFILE to be 0 - you can use it, but it's a no-op. Everything should be O_LARGEFILE by default. But: when fanotify does create_fd() it uses dentry_open(), which skips all that. And userspace can't set O_LARGEFILE in fanotify_init() because it's defined to 0. So if fanotify gets an event regarding a large file, the read() will just fail with -EOVERFLOW. This patch adds O_LARGEFILE to fanotify_init()'s event_f_flags on 64-bit systems, using the same test as open()/openat()/etc. Addresses https://bugzilla.redhat.com/show_bug.cgi?id=696821 Signed-off-by: Will Woods Acked-by: Eric Paris Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4e565c814309..732648b270dc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -698,6 +698,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->overflow_event = &oevent->fse; + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS spin_lock_init(&group->fanotify_data.access_lock); -- cgit v1.2.3-59-g8ed1b From d353efd02357a74753cd45f367a2d3d357fd6904 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 6 May 2014 12:50:11 -0700 Subject: fs/affs/super.c: bugfix / double free Commit 842a859db26b ("affs: use ->kill_sb() to simplify ->put_super() and failure exits of ->mount()") adds .kill_sb which frees sbi but doesn't remove sbi free in case of parse_options error causing double free+random crash. Signed-off-by: Fabian Frederick Cc: Alexander Viro Cc: [3.14.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/affs/super.c b/fs/affs/super.c index 6d589f28bf9b..895ac7dc9dbf 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -340,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); - kfree(sbi->s_prefix); - kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ -- cgit v1.2.3-59-g8ed1b From 3ca9e5d36afb5c0a6ee6ceee69e507370beb59c6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 6 May 2014 12:50:12 -0700 Subject: agp: info leak in agpioc_info_wrap() On 64 bit systems the agp_info struct has a 4 byte hole between ->agp_mode and ->aper_base. We need to clear it to avoid disclosing stack information to userspace. Signed-off-by: Dan Carpenter Cc: David Airlie Cc: Daniel Vetter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/agp/frontend.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/agp/frontend.c b/drivers/char/agp/frontend.c index 8121b4c70ede..b29703324e94 100644 --- a/drivers/char/agp/frontend.c +++ b/drivers/char/agp/frontend.c @@ -730,6 +730,7 @@ static int agpioc_info_wrap(struct agp_file_private *priv, void __user *arg) agp_copy_info(agp_bridge, &kerninfo); + memset(&userinfo, 0, sizeof(userinfo)); userinfo.version.major = kerninfo.version.major; userinfo.version.minor = kerninfo.version.minor; userinfo.bridge_id = kerninfo.device->vendor | -- cgit v1.2.3-59-g8ed1b