From f2b91d8d385d2cef3a1e3b3846f2dde4a6720c43 Mon Sep 17 00:00:00 2001
From: Zhang Zhen <zhenzhang.zhang@huawei.com>
Date: Wed, 15 Apr 2015 16:12:51 -0700
Subject: vfs: delete vfs_readdir function declaration

vfs_readdir() was replaced by iterate_dir() in commit 5c0ba4e0762e
("[readdir] introduce iterate_dir() and dir_context").

Signed-off-by: Zhang Zhen <zhenzhang.zhang@huawei.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fs.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d502e5436c84..60733bdc74b4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2684,7 +2684,6 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes);
 loff_t inode_get_bytes(struct inode *inode);
 void inode_set_bytes(struct inode *inode, loff_t bytes);
 
-extern int vfs_readdir(struct file *, filldir_t, void *);
 extern int iterate_dir(struct file *, struct dir_context *);
 
 extern int vfs_stat(const char __user *, struct kstat *);
-- 
cgit v1.3-6-gb490


From d7e4a2ea51c1b0ac0b2d53f5ab4a2e878b1c4aec Mon Sep 17 00:00:00 2001
From: Zhang Zhen <zhenzhang.zhang@huawei.com>
Date: Wed, 15 Apr 2015 16:12:57 -0700
Subject: mm: refactor zone_movable_is_highmem()

All callers of zone_movable_is_highmem are under #ifdef CONFIG_HIGHMEM,
so the else branch return 0 is not needed.

Signed-off-by: Zhang Zhen <zhenzhang.zhang@huawei.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2782df47101e..54d74f6eb233 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -842,16 +842,16 @@ static inline int populated_zone(struct zone *zone)
 
 extern int movable_zone;
 
+#ifdef CONFIG_HIGHMEM
 static inline int zone_movable_is_highmem(void)
 {
-#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	return movable_zone == ZONE_HIGHMEM;
-#elif defined(CONFIG_HIGHMEM)
-	return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
 #else
-	return 0;
+	return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
 #endif
 }
+#endif
 
 static inline int is_highmem_idx(enum zone_type idx)
 {
-- 
cgit v1.3-6-gb490


From e8c6158fef15a1532bd5242a0cd88565eedabe61 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Apr 2015 16:13:08 -0700
Subject: mm: consolidate all page-flags helpers in <linux/page-flags.h>

Currently we take a naive approach to page flags on compound pages - we
set the flag on the page without consideration if the flag makes sense
for tail page or for compound page in general.  This patchset try to
sort this out by defining per-flag policy on what need to be done if
page-flag helper operate on compound page.

The last patch in the patchset also sanitizes usege of page->mapping for
tail pages.  We don't define the meaning of page->mapping for tail
pages.  Currently it's always NULL, which can be inconsistent with head
page and potentially lead to problems.

For now I caught one case of illegal usage of page flags or ->mapping:
sound subsystem allocates pages with __GFP_COMP and maps them with PTEs.
It leads to setting dirty bit on tail pages and access to tail_page's
->mapping.  I don't see any bad behaviour caused by this, but worth
fixing anyway.

This patchset makes more sense if you take my THP refcounting into
account: we will see more compound pages mapped with PTEs and we need to
define behaviour of flags on compound pages to avoid bugs.

This patch (of 16):

We have page-flags helper function declarations/definitions spread over
several header files.  Let's consolidate them in <linux/page-flags.h>.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h    |  7 ----
 include/linux/ksm.h        | 17 --------
 include/linux/mm.h         | 81 --------------------------------------
 include/linux/page-flags.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+), 105 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7b5785032049..1a782733a420 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -41,8 +41,6 @@ extern int hugetlb_max_hstate __read_mostly;
 struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
 void hugepage_put_subpool(struct hugepage_subpool *spool);
 
-int PageHuge(struct page *page);
-
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -109,11 +107,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
-static inline int PageHuge(struct page *page)
-{
-	return 0;
-}
-
 static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
 }
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 3be6bb18562d..7ae216a39c9e 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -35,18 +35,6 @@ static inline void ksm_exit(struct mm_struct *mm)
 		__ksm_exit(mm);
 }
 
-/*
- * A KSM page is one of those write-protected "shared pages" or "merged pages"
- * which KSM maps into multiple mms, wherever identical anonymous page content
- * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
- * anon_vma, but to that page's node of the stable tree.
- */
-static inline int PageKsm(struct page *page)
-{
-	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
-				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
-}
-
 static inline struct stable_node *page_stable_node(struct page *page)
 {
 	return PageKsm(page) ? page_rmapping(page) : NULL;
@@ -87,11 +75,6 @@ static inline void ksm_exit(struct mm_struct *mm)
 {
 }
 
-static inline int PageKsm(struct page *page)
-{
-	return 0;
-}
-
 #ifdef CONFIG_MMU
 static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6571dd78e984..fb1fc38b01ce 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -494,15 +494,6 @@ static inline int page_count(struct page *page)
 	return atomic_read(&compound_head(page)->_count);
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-extern int PageHeadHuge(struct page *page_head);
-#else /* CONFIG_HUGETLB_PAGE */
-static inline int PageHeadHuge(struct page *page_head)
-{
-	return 0;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 static inline bool __compound_tail_refcounted(struct page *page)
 {
 	return !PageSlab(page) && !PageHeadHuge(page);
@@ -571,53 +562,6 @@ static inline void init_page_count(struct page *page)
 	atomic_set(&page->_count, 1);
 }
 
-/*
- * PageBuddy() indicate that the page is free and in the buddy system
- * (see mm/page_alloc.c).
- *
- * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
- * -2 so that an underflow of the page_mapcount() won't be mistaken
- * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
- * efficiently by most CPU architectures.
- */
-#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
-
-static inline int PageBuddy(struct page *page)
-{
-	return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
-}
-
-static inline void __SetPageBuddy(struct page *page)
-{
-	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
-	atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
-}
-
-static inline void __ClearPageBuddy(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageBuddy(page), page);
-	atomic_set(&page->_mapcount, -1);
-}
-
-#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
-
-static inline int PageBalloon(struct page *page)
-{
-	return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
-}
-
-static inline void __SetPageBalloon(struct page *page)
-{
-	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
-	atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
-}
-
-static inline void __ClearPageBalloon(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageBalloon(page), page);
-	atomic_set(&page->_mapcount, -1);
-}
-
 void put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
@@ -1006,26 +950,6 @@ void page_address_init(void);
 #define page_address_init()  do { } while(0)
 #endif
 
-/*
- * On an anonymous page mapped into a user virtual memory area,
- * page->mapping points to its anon_vma, not to a struct address_space;
- * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
- *
- * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
- * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
- * and then page->mapping points, not to an anon_vma, but to a private
- * structure which KSM associates with that merged page.  See ksm.h.
- *
- * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
- *
- * Please note that, confusingly, "page_mapping" refers to the inode
- * address_space which maps the page from disk; whereas "page_mapped"
- * refers to user virtual address space into which the page is mapped.
- */
-#define PAGE_MAPPING_ANON	1
-#define PAGE_MAPPING_KSM	2
-#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
-
 extern struct address_space *page_mapping(struct page *page);
 
 /* Neutral page->mapping pointer to address_space or anon_vma or other */
@@ -1045,11 +969,6 @@ struct address_space *page_file_mapping(struct page *page)
 	return page->mapping;
 }
 
-static inline int PageAnon(struct page *page)
-{
-	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
-}
-
 /*
  * Return the pagecache index of the passed page.  Regular pagecache pages
  * use ->index whereas swapcache pages use ->private
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index c851ff92d5b3..84d10b65cec6 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -289,6 +289,47 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
+/*
+ * On an anonymous page mapped into a user virtual memory area,
+ * page->mapping points to its anon_vma, not to a struct address_space;
+ * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
+ *
+ * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
+ * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
+ * and then page->mapping points, not to an anon_vma, but to a private
+ * structure which KSM associates with that merged page.  See ksm.h.
+ *
+ * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
+ *
+ * Please note that, confusingly, "page_mapping" refers to the inode
+ * address_space which maps the page from disk; whereas "page_mapped"
+ * refers to user virtual address space into which the page is mapped.
+ */
+#define PAGE_MAPPING_ANON	1
+#define PAGE_MAPPING_KSM	2
+#define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
+
+static inline int PageAnon(struct page *page)
+{
+	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
+#ifdef CONFIG_KSM
+/*
+ * A KSM page is one of those write-protected "shared pages" or "merged pages"
+ * which KSM maps into multiple mms, wherever identical anonymous page content
+ * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
+ * anon_vma, but to that page's node of the stable tree.
+ */
+static inline int PageKsm(struct page *page)
+{
+	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
+				(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+}
+#else
+TESTPAGEFLAG_FALSE(Ksm)
+#endif
+
 u64 stable_page_flags(struct page *page);
 
 static inline int PageUptodate(struct page *page)
@@ -426,6 +467,14 @@ static inline void ClearPageCompound(struct page *page)
 
 #endif /* !PAGEFLAGS_EXTENDED */
 
+#ifdef CONFIG_HUGETLB_PAGE
+int PageHuge(struct page *page);
+int PageHeadHuge(struct page *page);
+#else
+TESTPAGEFLAG_FALSE(Huge)
+TESTPAGEFLAG_FALSE(HeadHuge)
+#endif
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * PageHuge() only returns true for hugetlbfs pages, but not for
@@ -479,6 +528,53 @@ static inline int PageTransTail(struct page *page)
 }
 #endif
 
+/*
+ * PageBuddy() indicate that the page is free and in the buddy system
+ * (see mm/page_alloc.c).
+ *
+ * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
+ * -2 so that an underflow of the page_mapcount() won't be mistaken
+ * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
+ * efficiently by most CPU architectures.
+ */
+#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
+
+static inline int PageBuddy(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageBuddy(struct page *page)
+{
+	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+	atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageBuddy(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageBuddy(page), page);
+	atomic_set(&page->_mapcount, -1);
+}
+
+#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
+
+static inline int PageBalloon(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageBalloon(struct page *page)
+{
+	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+	atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageBalloon(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageBalloon(page), page);
+	atomic_set(&page->_mapcount, -1);
+}
+
 /*
  * If network-based swap is enabled, sl*b must keep track of whether pages
  * were allocated from pfmemalloc reserves.
-- 
cgit v1.3-6-gb490


From 8d63d99a5dfbdb997d12dd3c07b2070ca723db3b Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Apr 2015 16:13:12 -0700
Subject: mm: avoid tail page refcounting on non-THP compound pages

THP uses tail page refcounting to be able to split huge pages at any time.
 Tail page refcounting is not needed for other users of compound pages and
it's harmful because of overhead.

We try to exclude non-THP pages from tail page refcounting using
__compound_tail_refcounted() check.  It excludes most common non-THP
compound pages: SL*B and hugetlb, but it doesn't catch rest of __GFP_COMP
users -- drivers.

And it's not only about overhead.

Drivers might want to use compound pages to get refcounting semantics
suitable for mapping high-order pages to userspace.  But tail page
refcounting breaks it.

Tail page refcounting uses ->_mapcount in tail pages to store GUP pins on
them.  It means GUP pins would affect page_mapcount() for tail pages.
It's not a problem for THP, because it never maps tail pages.  But unlike
THP, drivers map parts of compound pages with PTEs and it makes
page_mapcount() be called for tail pages.

In particular, GUP pins would shift PSS up and affect /proc/kpagecount for
such pages.  But, I'm not aware about anything which can lead to crash or
other serious misbehaviour.

Since currently all THP pages are anonymous and all drivers pages are not,
we can fix the __compound_tail_refcounted() check by requiring PageAnon()
to enable tail page refcounting.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fb1fc38b01ce..515ec5045b60 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -496,7 +496,7 @@ static inline int page_count(struct page *page)
 
 static inline bool __compound_tail_refcounted(struct page *page)
 {
-	return !PageSlab(page) && !PageHeadHuge(page);
+	return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
 }
 
 /*
-- 
cgit v1.3-6-gb490


From 5bbe3547aa3ba5242366a322a28996872301b703 Mon Sep 17 00:00:00 2001
From: Eric B Munson <emunson@akamai.com>
Date: Wed, 15 Apr 2015 16:13:20 -0700
Subject: mm: allow compaction of unevictable pages

Currently, pages which are marked as unevictable are protected from
compaction, but not from other types of migration.  The POSIX real time
extension explicitly states that mlock() will prevent a major page
fault, but the spirit of this is that mlock() should give a process the
ability to control sources of latency, including minor page faults.
However, the mlock manpage only explicitly says that a locked page will
not be written to swap and this can cause some confusion.  The
compaction code today does not give a developer who wants to avoid swap
but wants to have large contiguous areas available any method to achieve
this state.  This patch introduces a sysctl for controlling compaction
behavior with respect to the unevictable lru.  Users who demand no page
faults after a page is present can set compact_unevictable_allowed to 0
and users who need the large contiguous areas can enable compaction on
locked memory by leaving the default value of 1.

To illustrate this problem I wrote a quick test program that mmaps a
large number of 1MB files filled with random data.  These maps are
created locked and read only.  Then every other mmap is unmapped and I
attempt to allocate huge pages to the static huge page pool.  When the
compact_unevictable_allowed sysctl is 0, I cannot allocate hugepages
after fragmenting memory.  When the value is set to 1, allocations
succeed.

Signed-off-by: Eric B Munson <emunson@akamai.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt | 11 +++++++++++
 include/linux/compaction.h  |  1 +
 kernel/sysctl.c             |  9 +++++++++
 mm/compaction.c             |  7 +++++++
 4 files changed, 28 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 902b4574acfb..9832ec52f859 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -21,6 +21,7 @@ Currently, these files are in /proc/sys/vm:
 - admin_reserve_kbytes
 - block_dump
 - compact_memory
+- compact_unevictable_allowed
 - dirty_background_bytes
 - dirty_background_ratio
 - dirty_bytes
@@ -106,6 +107,16 @@ huge pages although processes will also directly compact memory as required.
 
 ==============================================================
 
+compact_unevictable_allowed
+
+Available only when CONFIG_COMPACTION is set. When set to 1, compaction is
+allowed to examine the unevictable lru (mlocked pages) for pages to compact.
+This should be used on systems where stalls for minor page faults are an
+acceptable trade for large contiguous free memory.  Set to 0 to prevent
+compaction from moving pages that are unevictable.  Default value is 1.
+
+==============================================================
+
 dirty_background_bytes
 
 Contains the amount of dirty memory at which the background kernel
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index a014559e4a49..aa8f61cf3a19 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -34,6 +34,7 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write,
 extern int sysctl_extfrag_threshold;
 extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos);
+extern int sysctl_compact_unevictable_allowed;
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8c0eabd41886..42b7fc2860c1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1335,6 +1335,15 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &min_extfrag_threshold,
 		.extra2		= &max_extfrag_threshold,
 	},
+	{
+		.procname	= "compact_unevictable_allowed",
+		.data		= &sysctl_compact_unevictable_allowed,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 
 #endif /* CONFIG_COMPACTION */
 	{
diff --git a/mm/compaction.c b/mm/compaction.c
index a18201a8124e..570426edcadf 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1046,6 +1046,12 @@ typedef enum {
 	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
 } isolate_migrate_t;
 
+/*
+ * Allow userspace to control policy on scanning the unevictable LRU for
+ * compactable pages.
+ */
+int sysctl_compact_unevictable_allowed __read_mostly = 1;
+
 /*
  * Isolate all pages that can be migrated from the first suitable block,
  * starting at the block pointed to by the migrate scanner pfn within
@@ -1057,6 +1063,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	unsigned long low_pfn, end_pfn;
 	struct page *page;
 	const isolate_mode_t isolate_mode =
+		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
 		(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
 
 	/*
-- 
cgit v1.3-6-gb490


From cc5993bd7b8cff4a3e37042ee1358d1d5eafa70c Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 15 Apr 2015 16:13:26 -0700
Subject: mm: rename deactivate_page to deactivate_file_page

"deactivate_page" was created for file invalidation so it has too
specific logic for file-backed pages.  So, let's change the name of the
function and date to a file-specific one and yield the generic name.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Wang, Yalin <Yalin.Wang@sonymobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |  2 +-
 mm/swap.c            | 24 ++++++++++++------------
 mm/truncate.c        |  2 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7067eca501e2..cee108cbe2d5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -307,7 +307,7 @@ extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
-extern void deactivate_page(struct page *page);
+extern void deactivate_file_page(struct page *page);
 extern void swap_setup(void);
 
 extern void add_page_to_unevictable_list(struct page *page);
diff --git a/mm/swap.c b/mm/swap.c
index cd3a5e64cea9..e3a4feac9b0e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -42,7 +42,7 @@ int page_cluster;
 
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
 
 /*
  * This path almost never happens for VM activity - pages are normally
@@ -743,7 +743,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
  * be write it out by flusher threads as this is much more effective
  * than the single-page writeout from reclaim.
  */
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 			      void *arg)
 {
 	int lru, file;
@@ -811,36 +811,36 @@ void lru_add_drain_cpu(int cpu)
 		local_irq_restore(flags);
 	}
 
-	pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
 	if (pagevec_count(pvec))
-		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 
 	activate_page_drain(cpu);
 }
 
 /**
- * deactivate_page - forcefully deactivate a page
+ * deactivate_file_page - forcefully deactivate a file page
  * @page: page to deactivate
  *
  * This function hints the VM that @page is a good reclaim candidate,
  * for example if its invalidation fails due to the page being dirty
  * or under writeback.
  */
-void deactivate_page(struct page *page)
+void deactivate_file_page(struct page *page)
 {
 	/*
-	 * In a workload with many unevictable page such as mprotect, unevictable
-	 * page deactivation for accelerating reclaim is pointless.
+	 * In a workload with many unevictable page such as mprotect,
+	 * unevictable page deactivation for accelerating reclaim is pointless.
 	 */
 	if (PageUnevictable(page))
 		return;
 
 	if (likely(get_page_unless_zero(page))) {
-		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
 
 		if (!pagevec_add(pvec, page))
-			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
-		put_cpu_var(lru_deactivate_pvecs);
+			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+		put_cpu_var(lru_deactivate_file_pvecs);
 	}
 }
 
@@ -872,7 +872,7 @@ void lru_add_drain_all(void)
 
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
-		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
 			schedule_work_on(cpu, work);
diff --git a/mm/truncate.c b/mm/truncate.c
index 7a9d8a3cb143..66af9031fae8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -490,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 			 * of interest and try to speed up its reclaim.
 			 */
 			if (!ret)
-				deactivate_page(page);
+				deactivate_file_page(page);
 			count += ret;
 		}
 		pagevec_remove_exceptionals(&pvec);
-- 
cgit v1.3-6-gb490


From c6a918200c4f4ebf74b7e1ae4fea9115c7b297f8 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 15 Apr 2015 16:13:36 -0700
Subject: hugetlbfs: add minimum size tracking fields to subpool structure

hugetlbfs allocates huge pages from the global pool as needed.  Even if
the global pool contains a sufficient number pages for the filesystem size
at mount time, those global pages could be grabbed for some other use.  As
a result, filesystem huge page allocations may fail due to lack of pages.

Applications such as a database want to use huge pages for performance
reasons.  hugetlbfs filesystem semantics with ownership and modes work
well to manage access to a pool of huge pages.  However, the application
would like some reasonable assurance that allocations will not fail due to
a lack of huge pages.  At application startup time, the application would
like to configure itself to use a specific number of huge pages.  Before
starting, the application can check to make sure that enough huge pages
exist in the system global pools.  However, there are no guarantees that
those pages will be available when needed by the application.  What the
application wants is exclusive use of a subset of huge pages.

Add a new hugetlbfs mount option 'min_size=<value>' to indicate that the
specified number of pages will be available for use by the filesystem.  At
mount time, this number of huge pages will be reserved for exclusive use
of the filesystem.  If there is not a sufficient number of free pages, the
mount will fail.  As pages are allocated to and freeed from the
filesystem, the number of reserved pages is adjusted so that the specified
minimum is maintained.

This patch (of 4):

Add a field to the subpool structure to indicate the minimimum number of
huge pages to always be used by this subpool.  This minimum count includes
allocated pages as well as reserved pages.  If the minimum number of pages
for the subpool have not been allocated, pages are reserved up to this
minimum.  An additional field (rsv_hpages) is used to track the number of
pages reserved to meet this minimum size.  The hstate pointer in the
subpool is convenient to have when reserving and unreserving the pages.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 8 +++++++-
 mm/hugetlb.c            | 3 +--
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1a782733a420..d1a77b87408d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -22,7 +22,13 @@ struct mmu_gather;
 struct hugepage_subpool {
 	spinlock_t lock;
 	long count;
-	long max_hpages, used_hpages;
+	long max_hpages;	/* Maximum huge pages or -1 if no maximum. */
+	long used_hpages;	/* Used count against maximum, includes */
+				/* both alloced and reserved pages. */
+	struct hstate *hstate;
+	long min_hpages;	/* Minimum huge pages or -1 if no minimum. */
+	long rsv_hpages;	/* Pages reserved against global pool to */
+				/* sasitfy minimum size. */
 };
 
 struct resv_map {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8874c8ad55aa..4494976c2042 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -77,14 +77,13 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
 {
 	struct hugepage_subpool *spool;
 
-	spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
 	if (!spool)
 		return NULL;
 
 	spin_lock_init(&spool->lock);
 	spool->count = 1;
 	spool->max_hpages = nr_blocks;
-	spool->used_hpages = 0;
 
 	return spool;
 }
-- 
cgit v1.3-6-gb490


From 7ca02d0ae586fe7df59632966a64f3f1a756ef05 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 15 Apr 2015 16:13:42 -0700
Subject: hugetlbfs: accept subpool min_size mount option and setup accordingly

Make 'min_size=<value>' be an option when mounting a hugetlbfs.  This
option takes the same value as the 'size' option.  min_size can be
specified without specifying size.  If both are specified, min_size must
be less that or equal to size else the mount will fail.  If min_size is
specified, then at mount time an attempt is made to reserve min_size
pages.  If the reservation fails, the mount fails.  At umount time, the
reserved pages are released.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    | 90 ++++++++++++++++++++++++++++++++++++++-----------
 include/linux/hugetlb.h |  3 +-
 mm/hugetlb.c            | 25 +++++++++++---
 3 files changed, 94 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index db76cec3ce21..3a8f12762821 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -47,9 +47,10 @@ struct hugetlbfs_config {
 	kuid_t   uid;
 	kgid_t   gid;
 	umode_t mode;
-	long	nr_blocks;
+	long	max_hpages;
 	long	nr_inodes;
 	struct hstate *hstate;
+	long    min_hpages;
 };
 
 struct hugetlbfs_inode_info {
@@ -67,7 +68,7 @@ int sysctl_hugetlb_shm_group;
 enum {
 	Opt_size, Opt_nr_inodes,
 	Opt_mode, Opt_uid, Opt_gid,
-	Opt_pagesize,
+	Opt_pagesize, Opt_min_size,
 	Opt_err,
 };
 
@@ -78,6 +79,7 @@ static const match_table_t tokens = {
 	{Opt_uid,	"uid=%u"},
 	{Opt_gid,	"gid=%u"},
 	{Opt_pagesize,	"pagesize=%s"},
+	{Opt_min_size,	"min_size=%s"},
 	{Opt_err,	NULL},
 };
 
@@ -754,14 +756,38 @@ static const struct super_operations hugetlbfs_ops = {
 	.show_options	= generic_show_options,
 };
 
+enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };
+
+/*
+ * Convert size option passed from command line to number of huge pages
+ * in the pool specified by hstate.  Size option could be in bytes
+ * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
+ */
+static long long
+hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
+								int val_type)
+{
+	if (val_type == NO_SIZE)
+		return -1;
+
+	if (val_type == SIZE_PERCENT) {
+		size_opt <<= huge_page_shift(h);
+		size_opt *= h->max_huge_pages;
+		do_div(size_opt, 100);
+	}
+
+	size_opt >>= huge_page_shift(h);
+	return size_opt;
+}
+
 static int
 hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 {
 	char *p, *rest;
 	substring_t args[MAX_OPT_ARGS];
 	int option;
-	unsigned long long size = 0;
-	enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
+	unsigned long long max_size_opt = 0, min_size_opt = 0;
+	int max_val_type = NO_SIZE, min_val_type = NO_SIZE;
 
 	if (!options)
 		return 0;
@@ -799,10 +825,10 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			/* memparse() will accept a K/M/G without a digit */
 			if (!isdigit(*args[0].from))
 				goto bad_val;
-			size = memparse(args[0].from, &rest);
-			setsize = SIZE_STD;
+			max_size_opt = memparse(args[0].from, &rest);
+			max_val_type = SIZE_STD;
 			if (*rest == '%')
-				setsize = SIZE_PERCENT;
+				max_val_type = SIZE_PERCENT;
 			break;
 		}
 
@@ -825,6 +851,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			break;
 		}
 
+		case Opt_min_size: {
+			/* memparse() will accept a K/M/G without a digit */
+			if (!isdigit(*args[0].from))
+				goto bad_val;
+			min_size_opt = memparse(args[0].from, &rest);
+			min_val_type = SIZE_STD;
+			if (*rest == '%')
+				min_val_type = SIZE_PERCENT;
+			break;
+		}
+
 		default:
 			pr_err("Bad mount option: \"%s\"\n", p);
 			return -EINVAL;
@@ -832,15 +869,22 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 		}
 	}
 
-	/* Do size after hstate is set up */
-	if (setsize > NO_SIZE) {
-		struct hstate *h = pconfig->hstate;
-		if (setsize == SIZE_PERCENT) {
-			size <<= huge_page_shift(h);
-			size *= h->max_huge_pages;
-			do_div(size, 100);
-		}
-		pconfig->nr_blocks = (size >> huge_page_shift(h));
+	/*
+	 * Use huge page pool size (in hstate) to convert the size
+	 * options to number of huge pages.  If NO_SIZE, -1 is returned.
+	 */
+	pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+						max_size_opt, max_val_type);
+	pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate,
+						min_size_opt, min_val_type);
+
+	/*
+	 * If max_size was specified, then min_size must be smaller
+	 */
+	if (max_val_type > NO_SIZE &&
+	    pconfig->min_hpages > pconfig->max_hpages) {
+		pr_err("minimum size can not be greater than maximum size\n");
+		return -EINVAL;
 	}
 
 	return 0;
@@ -859,12 +903,13 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	save_mount_options(sb, data);
 
-	config.nr_blocks = -1; /* No limit on size by default */
+	config.max_hpages = -1; /* No limit on size by default */
 	config.nr_inodes = -1; /* No limit on number of inodes by default */
 	config.uid = current_fsuid();
 	config.gid = current_fsgid();
 	config.mode = 0755;
 	config.hstate = &default_hstate;
+	config.min_hpages = -1; /* No default minimum size */
 	ret = hugetlbfs_parse_options(data, &config);
 	if (ret)
 		return ret;
@@ -878,8 +923,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbinfo->max_inodes = config.nr_inodes;
 	sbinfo->free_inodes = config.nr_inodes;
 	sbinfo->spool = NULL;
-	if (config.nr_blocks != -1) {
-		sbinfo->spool = hugepage_new_subpool(config.nr_blocks);
+	/*
+	 * Allocate and initialize subpool if maximum or minimum size is
+	 * specified.  Any needed reservations (for minimim size) are taken
+	 * taken when the subpool is created.
+	 */
+	if (config.max_hpages != -1 || config.min_hpages != -1) {
+		sbinfo->spool = hugepage_new_subpool(config.hstate,
+							config.max_hpages,
+							config.min_hpages);
 		if (!sbinfo->spool)
 			goto out_free;
 	}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d1a77b87408d..5713c49a4a5c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -44,7 +44,8 @@ extern int hugetlb_max_hstate __read_mostly;
 #define for_each_hstate(h) \
 	for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)
 
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+						long min_hpages);
 void hugepage_put_subpool(struct hugepage_subpool *spool);
 
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 499cb72c74b1..995c8d65a95c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
 	bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,12 +71,18 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 	spin_unlock(&spool->lock);
 
 	/* If no pages are used, and no other handles to the subpool
-	 * remain, free the subpool the subpool remain */
-	if (free)
+	 * remain, give up any reservations mased on minimum size and
+	 * free the subpool */
+	if (free) {
+		if (spool->min_hpages != -1)
+			hugetlb_acct_memory(spool->hstate,
+						-spool->min_hpages);
 		kfree(spool);
+	}
 }
 
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+						long min_hpages)
 {
 	struct hugepage_subpool *spool;
 
@@ -83,7 +92,15 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
 
 	spin_lock_init(&spool->lock);
 	spool->count = 1;
-	spool->max_hpages = nr_blocks;
+	spool->max_hpages = max_hpages;
+	spool->hstate = h;
+	spool->min_hpages = min_hpages;
+
+	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+		kfree(spool);
+		return NULL;
+	}
+	spool->rsv_hpages = min_hpages;
 
 	return spool;
 }
-- 
cgit v1.3-6-gb490


From e244c9e66f6197f55f6fbb2d5e70714e262cc595 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 15 Apr 2015 16:14:14 -0700
Subject: mm, mempool: disallow mempools based on slab caches with constructors

All occurrences of mempools based on slab caches with object constructors
have been removed from the tree, so disallow creating them.

We can only dereference mem->ctor in mm/mempool.c without including
mm/slab.h in include/linux/mempool.h.  So simply note the restriction,
just like the comment restricting usage of __GFP_ZERO, and warn on kernels
with CONFIG_DEBUG_VM() if such a mempool is allocated from.

We don't want to incur this check on every element allocation, so use
VM_BUG_ON().

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Dave Kleikamp <shaggy@kernel.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sebastian Ott <sebott@linux.vnet.ibm.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempool.h | 3 ++-
 mm/mempool.c            | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index b19b3023c880..69b6951e8fd2 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -36,7 +36,8 @@ extern void mempool_free(void *element, mempool_t *pool);
 
 /*
  * A mempool_alloc_t and mempool_free_t that get the memory from
- * a slab that is passed in through pool_data.
+ * a slab cache that is passed in through pool_data.
+ * Note: the slab cache may not have a ctor function.
  */
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
diff --git a/mm/mempool.c b/mm/mempool.c
index 949970db2874..b60fb85526ed 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -15,6 +15,7 @@
 #include <linux/mempool.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
+#include "slab.h"
 
 static void add_element(mempool_t *pool, void *element)
 {
@@ -334,6 +335,7 @@ EXPORT_SYMBOL(mempool_free);
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
 {
 	struct kmem_cache *mem = pool_data;
+	VM_BUG_ON(mem->ctor);
 	return kmem_cache_alloc(mem, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_alloc_slab);
-- 
cgit v1.3-6-gb490


From 7e1f049efb86bd86c06b80eeac0ef80cdeb8c0e7 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Wed, 15 Apr 2015 16:14:41 -0700
Subject: mm: hugetlb: cleanup using paeg_huge_active()

Now we have an easy access to hugepages' activeness, so existing helpers to
get the information can be cleaned up.

[akpm@linux-foundation.org: s/PageHugeActive/page_huge_active/]
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h    |  2 --
 include/linux/page-flags.h |  7 +++++++
 mm/hugetlb.c               | 42 +++++-------------------------------------
 mm/memory_hotplug.c        |  2 +-
 4 files changed, 13 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5713c49a4a5c..205026175c42 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -84,7 +84,6 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 int dequeue_hwpoisoned_huge_page(struct page *page);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
-bool is_hugepage_active(struct page *page);
 void free_huge_page(struct page *page);
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -152,7 +151,6 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
 	return false;
 }
 #define putback_active_hugepage(p)	do {} while (0)
-#define is_hugepage_active(x)	false
 
 static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 84d10b65cec6..f34e040b34e9 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -470,11 +470,18 @@ static inline void ClearPageCompound(struct page *page)
 #ifdef CONFIG_HUGETLB_PAGE
 int PageHuge(struct page *page);
 int PageHeadHuge(struct page *page);
+bool page_huge_active(struct page *page);
 #else
 TESTPAGEFLAG_FALSE(Huge)
 TESTPAGEFLAG_FALSE(HeadHuge)
+
+static inline bool page_huge_active(struct page *page)
+{
+	return 0;
+}
 #endif
 
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * PageHuge() only returns true for hugetlbfs pages, but not for
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 05407831016b..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3896,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 #ifdef CONFIG_MEMORY_FAILURE
 
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
-	struct page *page;
-	struct page *tmp;
-	struct hstate *h = page_hstate(hpage);
-	int nid = page_to_nid(hpage);
-
-	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
-		if (page == hpage)
-			return 1;
-	return 0;
-}
-
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
@@ -3921,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 	int ret = -EBUSY;
 
 	spin_lock(&hugetlb_lock);
-	if (is_hugepage_on_freelist(hpage)) {
+	/*
+	 * Just checking !page_huge_active is not enough, because that could be
+	 * an isolated/hwpoisoned hugepage (which have >0 refcount).
+	 */
+	if (!page_huge_active(hpage) && !page_count(hpage)) {
 		/*
 		 * Hwpoisoned hugepage isn't linked to activelist or freelist,
 		 * but dangling hpage->lru can trigger list-debug warnings
@@ -3965,25 +3955,3 @@ void putback_active_hugepage(struct page *page)
 	spin_unlock(&hugetlb_lock);
 	put_page(page);
 }
-
-bool is_hugepage_active(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageHuge(page), page);
-	/*
-	 * This function can be called for a tail page because the caller,
-	 * scan_movable_pages, scans through a given pfn-range which typically
-	 * covers one memory block. In systems using gigantic hugepage (1GB
-	 * for x86_64,) a hugepage is larger than a memory block, and we don't
-	 * support migrating such large hugepages for now, so return false
-	 * when called for tail pages.
-	 */
-	if (PageTail(page))
-		return false;
-	/*
-	 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
-	 * so we should return false for them.
-	 */
-	if (unlikely(PageHWPoison(page)))
-		return false;
-	return page_count(page) > 0;
-}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e2e8014fb755..457bde530cbe 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1373,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
 			if (PageLRU(page))
 				return pfn;
 			if (PageHuge(page)) {
-				if (is_hugepage_active(page))
+				if (page_huge_active(page))
 					return pfn;
 				else
 					pfn = round_up(pfn + 1,
-- 
cgit v1.3-6-gb490


From cdd7875e0c4db5c41e28b645d3bf7d41bd2cbb45 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 15 Apr 2015 16:14:47 -0700
Subject: include/linux/mm.h: simplify flag check

Flip the flag test so that it is the simplest.  No functional change, just
a small readability improvement:

No code changed:

  # arch/x86/kernel/sys_x86_64.o:

   text    data     bss     dec     hex filename
   1551      24       0    1575     627 sys_x86_64.o.before
   1551      24       0    1575     627 sys_x86_64.o.after

md5:
   70708d1b1ad35cc891118a69dc1a63f9  sys_x86_64.o.before.asm
   70708d1b1ad35cc891118a69dc1a63f9  sys_x86_64.o.after.asm

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 515ec5045b60..68c21b2374c4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1894,10 +1894,10 @@ extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);
 static inline unsigned long
 vm_unmapped_area(struct vm_unmapped_area_info *info)
 {
-	if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN))
-		return unmapped_area(info);
-	else
+	if (info->flags & VM_UNMAPPED_AREA_TOPDOWN)
 		return unmapped_area_topdown(info);
+	else
+		return unmapped_area(info);
 }
 
 /* truncate.c */
-- 
cgit v1.3-6-gb490


From e39155ea11eac6da056b04669d7c9fc612e2065a Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Apr 2015 16:14:53 -0700
Subject: mm: uninline and cleanup page-mapping related helpers

Most-used page->mapping helper -- page_mapping() -- has already uninlined.
 Let's uninline also page_rmapping() and page_anon_vma().  It saves us
depending on configuration around 400 bytes in text:

   text	   data	    bss	    dec	    hex	filename
 660318	  99254	 410000	1169572	 11d8a4	mm/built-in.o-before
 659854	  99254	 410000	1169108	 11d6d4	mm/built-in.o

I also tried to make code a bit more clean.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h   |  8 ++------
 include/linux/rmap.h |  8 --------
 mm/util.c            | 41 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 38 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 68c21b2374c4..0e7bb2194da5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -950,14 +950,10 @@ void page_address_init(void);
 #define page_address_init()  do { } while(0)
 #endif
 
+extern void *page_rmapping(struct page *page);
+extern struct anon_vma *page_anon_vma(struct page *page);
 extern struct address_space *page_mapping(struct page *page);
 
-/* Neutral page->mapping pointer to address_space or anon_vma or other */
-static inline void *page_rmapping(struct page *page)
-{
-	return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
-}
-
 extern struct address_space *__page_file_mapping(struct page *);
 
 static inline
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index c4c559a45dc8..c89c53a113a8 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -105,14 +105,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma)
 		__put_anon_vma(anon_vma);
 }
 
-static inline struct anon_vma *page_anon_vma(struct page *page)
-{
-	if (((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) !=
-					    PAGE_MAPPING_ANON)
-		return NULL;
-	return page_rmapping(page);
-}
-
 static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
diff --git a/mm/util.c b/mm/util.c
index 3981ae9d1b15..68ff8a5361e7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -325,9 +325,37 @@ void kvfree(const void *addr)
 }
 EXPORT_SYMBOL(kvfree);
 
+static inline void *__page_rmapping(struct page *page)
+{
+	unsigned long mapping;
+
+	mapping = (unsigned long)page->mapping;
+	mapping &= ~PAGE_MAPPING_FLAGS;
+
+	return (void *)mapping;
+}
+
+/* Neutral page->mapping pointer to address_space or anon_vma or other */
+void *page_rmapping(struct page *page)
+{
+	page = compound_head(page);
+	return __page_rmapping(page);
+}
+
+struct anon_vma *page_anon_vma(struct page *page)
+{
+	unsigned long mapping;
+
+	page = compound_head(page);
+	mapping = (unsigned long)page->mapping;
+	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+		return NULL;
+	return __page_rmapping(page);
+}
+
 struct address_space *page_mapping(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	unsigned long mapping;
 
 	/* This happens if someone calls flush_dcache_page on slab page */
 	if (unlikely(PageSlab(page)))
@@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page)
 		swp_entry_t entry;
 
 		entry.val = page_private(page);
-		mapping = swap_address_space(entry);
-	} else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
-		mapping = NULL;
-	return mapping;
+		return swap_address_space(entry);
+	}
+
+	mapping = (unsigned long)page->mapping;
+	if (mapping & PAGE_MAPPING_FLAGS)
+		return NULL;
+	return page->mapping;
 }
 
 int overcommit_ratio_handler(struct ctl_table *table, int write,
-- 
cgit v1.3-6-gb490


From 923936157b158f36bd6a3d86496dce82b1a957de Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <a.ryabinin@samsung.com>
Date: Wed, 15 Apr 2015 16:15:05 -0700
Subject: mm/mempool.c: kasan: poison mempool elements

Mempools keep allocated objects in reserved for situations when ordinary
allocation may not be possible to satisfy.  These objects shouldn't be
accessed before they leave the pool.

This patch poison elements when get into the pool and unpoison when they
leave it.  This will let KASan to detect use-after-free of mempool's
elements.

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Tested-by: David Rientjes <rientjes@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dmitry Chernenkov <drcheren@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  2 ++
 mm/kasan/kasan.c      | 13 +++++++++++++
 mm/mempool.c          | 23 +++++++++++++++++++++++
 3 files changed, 38 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 5bb074431eb0..5486d777b706 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -44,6 +44,7 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object);
 
 void kasan_kmalloc_large(const void *ptr, size_t size);
 void kasan_kfree_large(const void *ptr);
+void kasan_kfree(void *ptr);
 void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size);
 void kasan_krealloc(const void *object, size_t new_size);
 
@@ -71,6 +72,7 @@ static inline void kasan_poison_object_data(struct kmem_cache *cache,
 
 static inline void kasan_kmalloc_large(void *ptr, size_t size) {}
 static inline void kasan_kfree_large(const void *ptr) {}
+static inline void kasan_kfree(void *ptr) {}
 static inline void kasan_kmalloc(struct kmem_cache *s, const void *object,
 				size_t size) {}
 static inline void kasan_krealloc(const void *object, size_t new_size) {}
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 936d81661c47..6c513a63ea84 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size)
 		kasan_kmalloc(page->slab_cache, object, size);
 }
 
+void kasan_kfree(void *ptr)
+{
+	struct page *page;
+
+	page = virt_to_head_page(ptr);
+
+	if (unlikely(!PageSlab(page)))
+		kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+				KASAN_FREE_PAGE);
+	else
+		kasan_slab_free(page->slab_cache, ptr);
+}
+
 void kasan_kfree_large(const void *ptr)
 {
 	struct page *page = virt_to_page(ptr);
diff --git a/mm/mempool.c b/mm/mempool.c
index 2884d5bad77e..2cc08de8b1db 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/kasan.h>
 #include <linux/kmemleak.h>
 #include <linux/export.h>
 #include <linux/mempool.h>
@@ -101,10 +102,31 @@ static inline void poison_element(mempool_t *pool, void *element)
 }
 #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
 
+static void kasan_poison_element(mempool_t *pool, void *element)
+{
+	if (pool->alloc == mempool_alloc_slab)
+		kasan_slab_free(pool->pool_data, element);
+	if (pool->alloc == mempool_kmalloc)
+		kasan_kfree(element);
+	if (pool->alloc == mempool_alloc_pages)
+		kasan_free_pages(element, (unsigned long)pool->pool_data);
+}
+
+static void kasan_unpoison_element(mempool_t *pool, void *element)
+{
+	if (pool->alloc == mempool_alloc_slab)
+		kasan_slab_alloc(pool->pool_data, element);
+	if (pool->alloc == mempool_kmalloc)
+		kasan_krealloc(element, (size_t)pool->pool_data);
+	if (pool->alloc == mempool_alloc_pages)
+		kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+}
+
 static void add_element(mempool_t *pool, void *element)
 {
 	BUG_ON(pool->curr_nr >= pool->min_nr);
 	poison_element(pool, element);
+	kasan_poison_element(pool, element);
 	pool->elements[pool->curr_nr++] = element;
 }
 
@@ -114,6 +136,7 @@ static void *remove_element(mempool_t *pool)
 
 	BUG_ON(pool->curr_nr < 0);
 	check_element(pool, element);
+	kasan_unpoison_element(pool, element);
 	return element;
 }
 
-- 
cgit v1.3-6-gb490


From dd9061846a3ba01b0fa45423aaa087e4a69187fa Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <boaz@plexistor.com>
Date: Wed, 15 Apr 2015 16:15:11 -0700
Subject: mm: new pfn_mkwrite same as page_mkwrite for VM_PFNMAP

This will allow FS that uses VM_PFNMAP | VM_MIXEDMAP (no page structs) to
get notified when access is a write to a read-only PFN.

This can happen if we mmap() a file then first mmap-read from it to
page-in a read-only PFN, than we mmap-write to the same page.

We need this functionality to fix a DAX bug, where in the scenario above
we fail to set ctime/mtime though we modified the file.  An xfstest is
attached to this patchset that shows the failure and the fix.  (A DAX
patch will follow)

This functionality is extra important for us, because upon dirtying of a
pmem page we also want to RDMA the page to a remote cluster node.

We define a new pfn_mkwrite and do not reuse page_mkwrite because
  1 - The name ;-)
  2 - But mainly because it would take a very long and tedious
      audit of all page_mkwrite functions of VM_MIXEDMAP/VM_PFNMAP
      users. To make sure they do not now CRASH. For example current
      DAX code (which this is for) would crash.
      If we would want to reuse page_mkwrite, We will need to first
      patch all users, so to not-crash-on-no-page. Then enable this
      patch. But even if I did that I would not sleep so well at night.
      Adding a new vector is the safest thing to do, and is not that
      expensive. an extra pointer at a static function vector per driver.
      Also the new vector is better for performance, because else we
      Will call all current Kernel vectors, so to:
        check-ha-no-page-do-nothing and return.

No need to call it from do_shared_fault because do_wp_page is called to
change pte permissions anyway.

Signed-off-by: Yigal Korman <yigal@plexistor.com>
Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/Locking |  8 ++++++++
 include/linux/mm.h                |  3 +++
 mm/memory.c                       | 43 +++++++++++++++++++++++++++++++++++----
 3 files changed, 50 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f91926f2f482..8bb8a7ee0f99 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -525,6 +525,7 @@ prototypes:
 	void (*close)(struct vm_area_struct*);
 	int (*fault)(struct vm_area_struct*, struct vm_fault *);
 	int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
+	int (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *);
 	int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
 
 locking rules:
@@ -534,6 +535,7 @@ close:		yes
 fault:		yes		can return with page locked
 map_pages:	yes
 page_mkwrite:	yes		can return with page locked
+pfn_mkwrite:	yes
 access:		yes
 
 	->fault() is called when a previously not present pte is about
@@ -560,6 +562,12 @@ the page has been truncated, the filesystem should not look up a new page
 like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
 will cause the VM to retry the fault.
 
+	->pfn_mkwrite() is the same as page_mkwrite but when the pte is
+VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is
+VM_FAULT_NOPAGE. Or one of the VM_FAULT_ERROR types. The default behavior
+after this call is to make the pte read-write, unless pfn_mkwrite returns
+an error.
+
 	->access() is called when get_user_pages() fails in
 access_process_vm(), typically used to debug a process through
 /proc/pid/mem or ptrace.  This function is needed only for
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0e7bb2194da5..8b086070c3a5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -251,6 +251,9 @@ struct vm_operations_struct {
 	 * writable, if an error is returned it will cause a SIGBUS */
 	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
 
+	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
+	int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);
+
 	/* called by access_process_vm when get_user_pages() fails, typically
 	 * for use by special VMAs that can switch between memory and hardware
 	 */
diff --git a/mm/memory.c b/mm/memory.c
index f9628e568c58..22e037e3364e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2180,6 +2180,42 @@ oom:
 	return VM_FAULT_OOM;
 }
 
+/*
+ * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
+ * mapping
+ */
+static int wp_pfn_shared(struct mm_struct *mm,
+			struct vm_area_struct *vma, unsigned long address,
+			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+			pmd_t *pmd)
+{
+	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
+		struct vm_fault vmf = {
+			.page = NULL,
+			.pgoff = linear_page_index(vma, address),
+			.virtual_address = (void __user *)(address & PAGE_MASK),
+			.flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
+		};
+		int ret;
+
+		pte_unmap_unlock(page_table, ptl);
+		ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
+		if (ret & VM_FAULT_ERROR)
+			return ret;
+		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+		/*
+		 * We might have raced with another page fault while we
+		 * released the pte_offset_map_lock.
+		 */
+		if (!pte_same(*page_table, orig_pte)) {
+			pte_unmap_unlock(page_table, ptl);
+			return 0;
+		}
+	}
+	return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
+			     NULL, 0, 0);
+}
+
 static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
 			  unsigned long address, pte_t *page_table,
 			  pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
@@ -2258,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * VM_PFNMAP VMA.
 		 *
 		 * We should not cow pages in a shared writeable mapping.
-		 * Just mark the pages writable as we can't do any dirty
-		 * accounting on raw pfn maps.
+		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 		 */
 		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 				     (VM_WRITE|VM_SHARED))
-			return wp_page_reuse(mm, vma, address, page_table, ptl,
-					     orig_pte, old_page, 0, 0);
+			return wp_pfn_shared(mm, vma, address, page_table, ptl,
+					     orig_pte, pmd);
 
 		pte_unmap_unlock(page_table, ptl);
 		return wp_page_copy(mm, vma, address, page_table, pmd,
-- 
cgit v1.3-6-gb490


From 0e3b210ce1722168227cb3bc7746256d0c0afece Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <boaz@plexistor.com>
Date: Wed, 15 Apr 2015 16:15:14 -0700
Subject: dax: use pfn_mkwrite to update c/mtime + freeze protection

From: Yigal Korman <yigal@plexistor.com>

[v1]
Without this patch, c/mtime is not updated correctly when mmap'ed page is
first read from and then written to.

A new xfstest is submitted for testing this (generic/080)

[v2]
Jan Kara has pointed out that if we add the
sb_start/end_pagefault pair in the new pfn_mkwrite we
are then fixing another bug where: A user could start
writing to the page while filesystem is frozen.

Signed-off-by: Yigal Korman <yigal@plexistor.com>
Signed-off-by: Boaz Harrosh <boaz@plexistor.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dax.c           | 17 +++++++++++++++++
 fs/ext2/file.c     |  1 +
 fs/ext4/file.c     |  1 +
 include/linux/fs.h |  1 +
 4 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/fs/dax.c b/fs/dax.c
index ed1619ec6537..d0bd1f4f81b3 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -463,6 +463,23 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 }
 EXPORT_SYMBOL_GPL(dax_fault);
 
+/**
+ * dax_pfn_mkwrite - handle first write to DAX page
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ *
+ */
+int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+	sb_start_pagefault(sb);
+	file_update_time(vma->vm_file);
+	sb_end_pagefault(sb);
+	return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
+
 /**
  * dax_zero_page_range - zero a range within a page of a DAX file
  * @inode: The file being truncated
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index e31701713516..866a3ce3f864 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -39,6 +39,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct ext2_dax_vm_ops = {
 	.fault		= ext2_dax_fault,
 	.page_mkwrite	= ext2_dax_mkwrite,
+	.pfn_mkwrite	= dax_pfn_mkwrite,
 };
 
 static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 598abbbe6786..aa78c70553f4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -206,6 +206,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
 	.page_mkwrite	= ext4_dax_mkwrite,
+	.pfn_mkwrite	= dax_pfn_mkwrite,
 };
 #else
 #define ext4_dax_vm_ops	ext4_file_vm_ops
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 60733bdc74b4..0f696328f218 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2620,6 +2620,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
 #define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
 #ifdef CONFIG_BLOCK
-- 
cgit v1.3-6-gb490


From 312fcae227037619dc858c9ccd362c7b847730a2 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Wed, 15 Apr 2015 16:15:30 -0700
Subject: zsmalloc: support compaction

This patch provides core functions for migration of zsmalloc.  Migraion
policy is simple as follows.

for each size class {
        while {
                src_page = get zs_page from ZS_ALMOST_EMPTY
                if (!src_page)
                        break;
                dst_page = get zs_page from ZS_ALMOST_FULL
                if (!dst_page)
                        dst_page = get zs_page from ZS_ALMOST_EMPTY
                if (!dst_page)
                        break;
                migrate(from src_page, to dst_page);
        }
}

For migration, we need to identify which objects in zspage are allocated
to migrate them out.  We could know it by iterating of freed objects in a
zspage because first_page of zspage keeps free objects singly-linked list
but it's not efficient.  Instead, this patch adds a tag(ie,
OBJ_ALLOCATED_TAG) in header of each object(ie, handle) so we could check
whether the object is allocated easily.

This patch adds another status bit in handle to synchronize between user
access through zs_map_object and migration.  During migration, we cannot
move objects user are using due to data coherency between old object and
new object.

[akpm@linux-foundation.org: zsmalloc.c needs sched.h for cond_resched()]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Juneho Choi <juno.choi@lge.com>
Cc: Gunho Lee <gunho.lee@lge.com>
Cc: Luigi Semenzato <semenzato@google.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Seth Jennings <sjennings@variantweb.net>
Cc: Nitin Gupta <ngupta@vflare.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/zsmalloc.h |   1 +
 mm/zsmalloc.c            | 378 ++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 360 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 3283c6a55425..1338190b5478 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -47,5 +47,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
 
 unsigned long zs_get_total_pages(struct zs_pool *pool);
+unsigned long zs_compact(struct zs_pool *pool);
 
 #endif
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 55b171016f4f..c4ae608dc725 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -78,6 +78,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/highmem.h>
@@ -135,7 +136,26 @@
 #endif
 #endif
 #define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS)
+
+/*
+ * Memory for allocating for handle keeps object position by
+ * encoding <page, obj_idx> and the encoded value has a room
+ * in least bit(ie, look at obj_to_location).
+ * We use the bit to synchronize between object access by
+ * user and migration.
+ */
+#define HANDLE_PIN_BIT	0
+
+/*
+ * Head in allocated object should have OBJ_ALLOCATED_TAG
+ * to identify the object was allocated or not.
+ * It's okay to add the status bit in the least bit because
+ * header keeps handle which is 4byte-aligned address so we
+ * have room for two bit at least.
+ */
+#define OBJ_ALLOCATED_TAG 1
+#define OBJ_TAG_BITS 1
+#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
 #define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
 
 #define MAX(a, b) ((a) >= (b) ? (a) : (b))
@@ -610,35 +630,35 @@ static struct page *get_next_page(struct page *page)
 
 /*
  * Encode <page, obj_idx> as a single handle value.
- * On hardware platforms with physical memory starting at 0x0 the pfn
- * could be 0 so we ensure that the handle will never be 0 by adjusting the
- * encoded obj_idx value before encoding.
+ * We use the least bit of handle for tagging.
  */
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+static void *location_to_obj(struct page *page, unsigned long obj_idx)
 {
-	unsigned long handle;
+	unsigned long obj;
 
 	if (!page) {
 		BUG_ON(obj_idx);
 		return NULL;
 	}
 
-	handle = page_to_pfn(page) << OBJ_INDEX_BITS;
-	handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
+	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+	obj |= ((obj_idx) & OBJ_INDEX_MASK);
+	obj <<= OBJ_TAG_BITS;
 
-	return (void *)handle;
+	return (void *)obj;
 }
 
 /*
  * Decode <page, obj_idx> pair from the given object handle. We adjust the
  * decoded obj_idx back to its original value since it was adjusted in
- * obj_location_to_handle().
+ * location_to_obj().
  */
-static void obj_to_location(unsigned long handle, struct page **page,
+static void obj_to_location(unsigned long obj, struct page **page,
 				unsigned long *obj_idx)
 {
-	*page = pfn_to_page(handle >> OBJ_INDEX_BITS);
-	*obj_idx = (handle & OBJ_INDEX_MASK) - 1;
+	obj >>= OBJ_TAG_BITS;
+	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+	*obj_idx = (obj & OBJ_INDEX_MASK);
 }
 
 static unsigned long handle_to_obj(unsigned long handle)
@@ -646,6 +666,11 @@ static unsigned long handle_to_obj(unsigned long handle)
 	return *(unsigned long *)handle;
 }
 
+unsigned long obj_to_head(void *obj)
+{
+	return *(unsigned long *)obj;
+}
+
 static unsigned long obj_idx_to_offset(struct page *page,
 				unsigned long obj_idx, int class_size)
 {
@@ -657,6 +682,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
 	return off + obj_idx * class_size;
 }
 
+static inline int trypin_tag(unsigned long handle)
+{
+	unsigned long *ptr = (unsigned long *)handle;
+
+	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+}
+
+static void pin_tag(unsigned long handle)
+{
+	while (!trypin_tag(handle));
+}
+
+static void unpin_tag(unsigned long handle)
+{
+	unsigned long *ptr = (unsigned long *)handle;
+
+	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+}
+
 static void reset_page(struct page *page)
 {
 	clear_bit(PG_private, &page->flags);
@@ -718,7 +762,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
 		link = (struct link_free *)vaddr + off / sizeof(*link);
 
 		while ((off += class->size) < PAGE_SIZE) {
-			link->next = obj_location_to_handle(page, i++);
+			link->next = location_to_obj(page, i++);
 			link += class->size / sizeof(*link);
 		}
 
@@ -728,7 +772,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
 		 * page (if present)
 		 */
 		next_page = get_next_page(page);
-		link->next = obj_location_to_handle(next_page, 0);
+		link->next = location_to_obj(next_page, 0);
 		kunmap_atomic(vaddr);
 		page = next_page;
 		off %= PAGE_SIZE;
@@ -782,7 +826,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 
 	init_zspage(first_page, class);
 
-	first_page->freelist = obj_location_to_handle(first_page, 0);
+	first_page->freelist = location_to_obj(first_page, 0);
 	/* Maximum number of objects we can store in this zspage */
 	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
 
@@ -1017,6 +1061,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
 	return true;
 }
 
+static bool zspage_full(struct page *page)
+{
+	BUG_ON(!is_first_page(page));
+
+	return page->inuse == page->objects;
+}
+
 #ifdef CONFIG_ZSMALLOC_STAT
 
 static inline void zs_stat_inc(struct size_class *class,
@@ -1219,6 +1270,9 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	 */
 	BUG_ON(in_interrupt());
 
+	/* From now on, migration cannot move the object */
+	pin_tag(handle);
+
 	obj = handle_to_obj(handle);
 	obj_to_location(obj, &page, &obj_idx);
 	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
@@ -1276,6 +1330,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 		__zs_unmap_object(area, pages, off, class->size);
 	}
 	put_cpu_var(zs_map_area);
+	unpin_tag(handle);
 }
 EXPORT_SYMBOL_GPL(zs_unmap_object);
 
@@ -1289,6 +1344,7 @@ static unsigned long obj_malloc(struct page *first_page,
 	unsigned long m_objidx, m_offset;
 	void *vaddr;
 
+	handle |= OBJ_ALLOCATED_TAG;
 	obj = (unsigned long)first_page->freelist;
 	obj_to_location(obj, &m_page, &m_objidx);
 	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
@@ -1374,6 +1430,7 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
 
 	BUG_ON(!obj);
 
+	obj &= ~OBJ_ALLOCATED_TAG;
 	obj_to_location(obj, &f_page, &f_objidx);
 	first_page = get_first_page(f_page);
 
@@ -1402,8 +1459,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	if (unlikely(!handle))
 		return;
 
+	pin_tag(handle);
 	obj = handle_to_obj(handle);
-	free_handle(pool, handle);
 	obj_to_location(obj, &f_page, &f_objidx);
 	first_page = get_first_page(f_page);
 
@@ -1413,18 +1470,301 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	spin_lock(&class->lock);
 	obj_free(pool, class, obj);
 	fullness = fix_fullness_group(class, first_page);
-	if (fullness == ZS_EMPTY)
+	if (fullness == ZS_EMPTY) {
 		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
 				class->size, class->pages_per_zspage));
+		atomic_long_sub(class->pages_per_zspage,
+				&pool->pages_allocated);
+		free_zspage(first_page);
+	}
 	spin_unlock(&class->lock);
+	unpin_tag(handle);
+
+	free_handle(pool, handle);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+static void zs_object_copy(unsigned long src, unsigned long dst,
+				struct size_class *class)
+{
+	struct page *s_page, *d_page;
+	unsigned long s_objidx, d_objidx;
+	unsigned long s_off, d_off;
+	void *s_addr, *d_addr;
+	int s_size, d_size, size;
+	int written = 0;
+
+	s_size = d_size = class->size;
+
+	obj_to_location(src, &s_page, &s_objidx);
+	obj_to_location(dst, &d_page, &d_objidx);
+
+	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
+	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+
+	if (s_off + class->size > PAGE_SIZE)
+		s_size = PAGE_SIZE - s_off;
+
+	if (d_off + class->size > PAGE_SIZE)
+		d_size = PAGE_SIZE - d_off;
+
+	s_addr = kmap_atomic(s_page);
+	d_addr = kmap_atomic(d_page);
+
+	while (1) {
+		size = min(s_size, d_size);
+		memcpy(d_addr + d_off, s_addr + s_off, size);
+		written += size;
+
+		if (written == class->size)
+			break;
+
+		if (s_off + size >= PAGE_SIZE) {
+			kunmap_atomic(d_addr);
+			kunmap_atomic(s_addr);
+			s_page = get_next_page(s_page);
+			BUG_ON(!s_page);
+			s_addr = kmap_atomic(s_page);
+			d_addr = kmap_atomic(d_page);
+			s_size = class->size - written;
+			s_off = 0;
+		} else {
+			s_off += size;
+			s_size -= size;
+		}
+
+		if (d_off + size >= PAGE_SIZE) {
+			kunmap_atomic(d_addr);
+			d_page = get_next_page(d_page);
+			BUG_ON(!d_page);
+			d_addr = kmap_atomic(d_page);
+			d_size = class->size - written;
+			d_off = 0;
+		} else {
+			d_off += size;
+			d_size -= size;
+		}
+	}
+
+	kunmap_atomic(d_addr);
+	kunmap_atomic(s_addr);
+}
+
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct page *page, int index,
+					struct size_class *class)
+{
+	unsigned long head;
+	int offset = 0;
+	unsigned long handle = 0;
+	void *addr = kmap_atomic(page);
+
+	if (!is_first_page(page))
+		offset = page->index;
+	offset += class->size * index;
+
+	while (offset < PAGE_SIZE) {
+		head = obj_to_head(addr + offset);
+		if (head & OBJ_ALLOCATED_TAG) {
+			handle = head & ~OBJ_ALLOCATED_TAG;
+			if (trypin_tag(handle))
+				break;
+			handle = 0;
+		}
+
+		offset += class->size;
+		index++;
+	}
+
+	kunmap_atomic(addr);
+	return handle;
+}
+
+struct zs_compact_control {
+	/* Source page for migration which could be a subpage of zspage. */
+	struct page *s_page;
+	/* Destination page for migration which should be a first page
+	 * of zspage. */
+	struct page *d_page;
+	 /* Starting object index within @s_page which used for live object
+	  * in the subpage. */
+	int index;
+	/* how many of objects are migrated */
+	int nr_migrated;
+};
+
+static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
+				struct zs_compact_control *cc)
+{
+	unsigned long used_obj, free_obj;
+	unsigned long handle;
+	struct page *s_page = cc->s_page;
+	struct page *d_page = cc->d_page;
+	unsigned long index = cc->index;
+	int nr_migrated = 0;
+	int ret = 0;
+
+	while (1) {
+		handle = find_alloced_obj(s_page, index, class);
+		if (!handle) {
+			s_page = get_next_page(s_page);
+			if (!s_page)
+				break;
+			index = 0;
+			continue;
+		}
+
+		/* Stop if there is no more space */
+		if (zspage_full(d_page)) {
+			unpin_tag(handle);
+			ret = -ENOMEM;
+			break;
+		}
 
+		used_obj = handle_to_obj(handle);
+		free_obj = obj_malloc(d_page, class, handle);
+		zs_object_copy(used_obj, free_obj, class);
+		index++;
+		record_obj(handle, free_obj);
+		unpin_tag(handle);
+		obj_free(pool, class, used_obj);
+		nr_migrated++;
+	}
+
+	/* Remember last position in this iteration */
+	cc->s_page = s_page;
+	cc->index = index;
+	cc->nr_migrated = nr_migrated;
+
+	return ret;
+}
+
+static struct page *alloc_target_page(struct size_class *class)
+{
+	int i;
+	struct page *page;
+
+	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+		page = class->fullness_list[i];
+		if (page) {
+			remove_zspage(page, class, i);
+			break;
+		}
+	}
+
+	return page;
+}
+
+static void putback_zspage(struct zs_pool *pool, struct size_class *class,
+				struct page *first_page)
+{
+	int class_idx;
+	enum fullness_group fullness;
+
+	BUG_ON(!is_first_page(first_page));
+
+	get_zspage_mapping(first_page, &class_idx, &fullness);
+	insert_zspage(first_page, class, fullness);
+	fullness = fix_fullness_group(class, first_page);
 	if (fullness == ZS_EMPTY) {
+		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+			class->size, class->pages_per_zspage));
 		atomic_long_sub(class->pages_per_zspage,
 				&pool->pages_allocated);
+
 		free_zspage(first_page);
 	}
 }
-EXPORT_SYMBOL_GPL(zs_free);
+
+static struct page *isolate_source_page(struct size_class *class)
+{
+	struct page *page;
+
+	page = class->fullness_list[ZS_ALMOST_EMPTY];
+	if (page)
+		remove_zspage(page, class, ZS_ALMOST_EMPTY);
+
+	return page;
+}
+
+static unsigned long __zs_compact(struct zs_pool *pool,
+				struct size_class *class)
+{
+	int nr_to_migrate;
+	struct zs_compact_control cc;
+	struct page *src_page;
+	struct page *dst_page = NULL;
+	unsigned long nr_total_migrated = 0;
+
+	cond_resched();
+
+	spin_lock(&class->lock);
+	while ((src_page = isolate_source_page(class))) {
+
+		BUG_ON(!is_first_page(src_page));
+
+		/* The goal is to migrate all live objects in source page */
+		nr_to_migrate = src_page->inuse;
+		cc.index = 0;
+		cc.s_page = src_page;
+
+		while ((dst_page = alloc_target_page(class))) {
+			cc.d_page = dst_page;
+			/*
+			 * If there is no more space in dst_page, try to
+			 * allocate another zspage.
+			 */
+			if (!migrate_zspage(pool, class, &cc))
+				break;
+
+			putback_zspage(pool, class, dst_page);
+			nr_total_migrated += cc.nr_migrated;
+			nr_to_migrate -= cc.nr_migrated;
+		}
+
+		/* Stop if we couldn't find slot */
+		if (dst_page == NULL)
+			break;
+
+		putback_zspage(pool, class, dst_page);
+		putback_zspage(pool, class, src_page);
+		spin_unlock(&class->lock);
+		nr_total_migrated += cc.nr_migrated;
+		cond_resched();
+		spin_lock(&class->lock);
+	}
+
+	if (src_page)
+		putback_zspage(pool, class, src_page);
+
+	spin_unlock(&class->lock);
+
+	return nr_total_migrated;
+}
+
+unsigned long zs_compact(struct zs_pool *pool)
+{
+	int i;
+	unsigned long nr_migrated = 0;
+	struct size_class *class;
+
+	for (i = zs_size_classes - 1; i >= 0; i--) {
+		class = pool->size_class[i];
+		if (!class)
+			continue;
+		if (class->index != i)
+			continue;
+		nr_migrated += __zs_compact(pool, class);
+	}
+
+	synchronize_rcu();
+
+	return nr_migrated;
+}
+EXPORT_SYMBOL_GPL(zs_compact);
 
 /**
  * zs_create_pool - Creates an allocation pool to work from.
-- 
cgit v1.3-6-gb490


From 23f40a94d860449f39f00c3350bf850d15983e63 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 15 Apr 2015 16:16:33 -0700
Subject: include/linux: remove empty conditionals

Commit 607ca46e97a1 ("UAPI: (Scripted) Disintegrate include/linux") left
behind some empty conditional blocks.  Since they are useless and may
cause a reader to wonder whether something is missing, remove them.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/a.out.h | 67 ---------------------------------------------------
 include/linux/types.h |  6 -----
 2 files changed, 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/a.out.h b/include/linux/a.out.h
index 220f14338895..ee884168989f 100644
--- a/include/linux/a.out.h
+++ b/include/linux/a.out.h
@@ -4,44 +4,6 @@
 #include <uapi/linux/a.out.h>
 
 #ifndef __ASSEMBLY__
-#if defined (M_OLDSUN2)
-#else
-#endif
-#if defined (M_68010)
-#else
-#endif
-#if defined (M_68020)
-#else
-#endif
-#if defined (M_SPARC)
-#else
-#endif
-#if !defined (N_MAGIC)
-#endif
-#if !defined (N_BADMAG)
-#endif
-#if !defined (N_TXTOFF)
-#endif
-#if !defined (N_DATOFF)
-#endif
-#if !defined (N_TRELOFF)
-#endif
-#if !defined (N_DRELOFF)
-#endif
-#if !defined (N_SYMOFF)
-#endif
-#if !defined (N_STROFF)
-#endif
-#if !defined (N_TXTADDR)
-#endif
-#if defined(vax) || defined(hp300) || defined(pyr)
-#endif
-#ifdef	sony
-#endif	/* Sony.  */
-#ifdef is68k
-#endif
-#if defined(m68k) && defined(PORTAR)
-#endif
 #ifdef linux
 #include <asm/page.h>
 #if defined(__i386__) || defined(__mc68000__)
@@ -51,34 +13,5 @@
 #endif
 #endif
 #endif
-#ifndef N_DATADDR
-#endif
-#if !defined (N_BSSADDR)
-#endif
-#if !defined (N_NLIST_DECLARED)
-#endif /* no N_NLIST_DECLARED.  */
-#if !defined (N_UNDF)
-#endif
-#if !defined (N_ABS)
-#endif
-#if !defined (N_TEXT)
-#endif
-#if !defined (N_DATA)
-#endif
-#if !defined (N_BSS)
-#endif
-#if !defined (N_FN)
-#endif
-#if !defined (N_EXT)
-#endif
-#if !defined (N_TYPE)
-#endif
-#if !defined (N_STAB)
-#endif
-#if !defined (N_RELOCATION_INFO_DECLARED)
-#ifdef NS32K
-#else
-#endif
-#endif /* no N_RELOCATION_INFO_DECLARED.  */
 #endif /*__ASSEMBLY__ */
 #endif /* __A_OUT_GNU_H__ */
diff --git a/include/linux/types.h b/include/linux/types.h
index 6747247e3f9f..59698be03490 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -146,12 +146,6 @@ typedef u64 dma_addr_t;
 typedef u32 dma_addr_t;
 #endif /* dma_addr_t */
 
-#ifdef __CHECKER__
-#else
-#endif
-#ifdef __CHECK_ENDIAN__
-#else
-#endif
 typedef unsigned __bitwise__ gfp_t;
 typedef unsigned __bitwise__ fmode_t;
 typedef unsigned __bitwise__ oom_flags_t;
-- 
cgit v1.3-6-gb490


From 2813893f8b197a14f1e1ddb04d99bce46817c84a Mon Sep 17 00:00:00 2001
From: Iulia Manda <iulia.manda21@gmail.com>
Date: Wed, 15 Apr 2015 16:16:41 -0700
Subject: kernel: conditionally support non-root users, groups and capabilities

There are a lot of embedded systems that run most or all of their
functionality in init, running as root:root.  For these systems,
supporting multiple users is not necessary.

This patch adds a new symbol, CONFIG_MULTIUSER, that makes support for
non-root users, non-root groups, and capabilities optional.  It is enabled
under CONFIG_EXPERT menu.

When this symbol is not defined, UID and GID are zero in any possible case
and processes always have all capabilities.

The following syscalls are compiled out: setuid, setregid, setgid,
setreuid, setresuid, getresuid, setresgid, getresgid, setgroups,
getgroups, setfsuid, setfsgid, capget, capset.

Also, groups.c is compiled out completely.

In kernel/capability.c, capable function was moved in order to avoid
adding two ifdef blocks.

This change saves about 25 KB on a defconfig build.  The most minimal
kernels have total text sizes in the high hundreds of kB rather than
low MB.  (The 25k goes down a bit with allnoconfig, but not that much.

The kernel was booted in Qemu.  All the common functionalities work.
Adding users/groups is not possible, failing with -ENOSYS.

Bloat-o-meter output:
add/remove: 7/87 grow/shrink: 19/397 up/down: 1675/-26325 (-24650)

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Iulia Manda <iulia.manda21@gmail.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/Kconfig                     |  1 +
 drivers/staging/lustre/lustre/Kconfig |  1 +
 fs/nfs/Kconfig                        |  2 +-
 fs/nfsd/Kconfig                       |  1 +
 include/linux/capability.h            | 29 +++++++++++++++++++++++++++++
 include/linux/cred.h                  | 23 +++++++++++++++++++----
 include/linux/uidgid.h                | 12 ++++++++++++
 init/Kconfig                          | 19 ++++++++++++++++++-
 kernel/Makefile                       |  4 +++-
 kernel/capability.c                   | 35 +++++++++++++++++++----------------
 kernel/cred.c                         |  3 +++
 kernel/groups.c                       |  3 ---
 kernel/sys.c                          |  2 ++
 kernel/sys_ni.c                       | 14 ++++++++++++++
 net/sunrpc/Kconfig                    |  2 ++
 security/Kconfig                      |  1 +
 16 files changed, 126 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a5ced5c3c1e0..de2726a487b0 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -328,6 +328,7 @@ config COMPAT
 	select COMPAT_BINFMT_ELF if BINFMT_ELF
 	select ARCH_WANT_OLD_COMPAT_IPC
 	select COMPAT_OLD_SIGACTION
+	depends on MULTIUSER
 	help
 	  Select this option if you want to enable your system kernel to
 	  handle system-calls from ELF binaries for 31 bit ESA.  This option
diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig
index 6725467ef4d0..62c7bba75274 100644
--- a/drivers/staging/lustre/lustre/Kconfig
+++ b/drivers/staging/lustre/lustre/Kconfig
@@ -10,6 +10,7 @@ config LUSTRE_FS
 	select CRYPTO_SHA1
 	select CRYPTO_SHA256
 	select CRYPTO_SHA512
+	depends on MULTIUSER
 	help
 	  This option enables Lustre file system client support. Choose Y
 	  here if you want to access a Lustre file system cluster. To compile
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index c7abc10279af..f31fd0dd92c6 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -1,6 +1,6 @@
 config NFS_FS
 	tristate "NFS client support"
-	depends on INET && FILE_LOCKING
+	depends on INET && FILE_LOCKING && MULTIUSER
 	select LOCKD
 	select SUNRPC
 	select NFS_ACL_SUPPORT if NFS_V3_ACL
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 683bf718aead..fc2d108f5272 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -6,6 +6,7 @@ config NFSD
 	select SUNRPC
 	select EXPORTFS
 	select NFS_ACL_SUPPORT if NFSD_V2_ACL
+	depends on MULTIUSER
 	help
 	  Choose Y here if you want to allow other computers to access
 	  files residing on this system using Sun's Network File System
diff --git a/include/linux/capability.h b/include/linux/capability.h
index aa93e5ef594c..af9f0b9e80e6 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -205,6 +205,7 @@ static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a,
 			   cap_intersect(permitted, __cap_nfsd_set));
 }
 
+#ifdef CONFIG_MULTIUSER
 extern bool has_capability(struct task_struct *t, int cap);
 extern bool has_ns_capability(struct task_struct *t,
 			      struct user_namespace *ns, int cap);
@@ -213,6 +214,34 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
 				      struct user_namespace *ns, int cap);
 extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
+#else
+static inline bool has_capability(struct task_struct *t, int cap)
+{
+	return true;
+}
+static inline bool has_ns_capability(struct task_struct *t,
+			      struct user_namespace *ns, int cap)
+{
+	return true;
+}
+static inline bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+	return true;
+}
+static inline bool has_ns_capability_noaudit(struct task_struct *t,
+				      struct user_namespace *ns, int cap)
+{
+	return true;
+}
+static inline bool capable(int cap)
+{
+	return true;
+}
+static inline bool ns_capable(struct user_namespace *ns, int cap)
+{
+	return true;
+}
+#endif /* CONFIG_MULTIUSER */
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 2fb2ca2127ed..8b6c083e68a7 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -62,9 +62,27 @@ do {							\
 		groups_free(group_info);		\
 } while (0)
 
-extern struct group_info *groups_alloc(int);
 extern struct group_info init_groups;
+#ifdef CONFIG_MULTIUSER
+extern struct group_info *groups_alloc(int);
 extern void groups_free(struct group_info *);
+
+extern int in_group_p(kgid_t);
+extern int in_egroup_p(kgid_t);
+#else
+static inline void groups_free(struct group_info *group_info)
+{
+}
+
+static inline int in_group_p(kgid_t grp)
+{
+        return 1;
+}
+static inline int in_egroup_p(kgid_t grp)
+{
+        return 1;
+}
+#endif
 extern int set_current_groups(struct group_info *);
 extern void set_groups(struct cred *, struct group_info *);
 extern int groups_search(const struct group_info *, kgid_t);
@@ -74,9 +92,6 @@ extern bool may_setgroups(void);
 #define GROUP_AT(gi, i) \
 	((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK])
 
-extern int in_group_p(kgid_t);
-extern int in_egroup_p(kgid_t);
-
 /*
  * The security context of a task
  *
diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h
index 2d1f9b627f91..0ee05da38899 100644
--- a/include/linux/uidgid.h
+++ b/include/linux/uidgid.h
@@ -29,6 +29,7 @@ typedef struct {
 #define KUIDT_INIT(value) (kuid_t){ value }
 #define KGIDT_INIT(value) (kgid_t){ value }
 
+#ifdef CONFIG_MULTIUSER
 static inline uid_t __kuid_val(kuid_t uid)
 {
 	return uid.val;
@@ -38,6 +39,17 @@ static inline gid_t __kgid_val(kgid_t gid)
 {
 	return gid.val;
 }
+#else
+static inline uid_t __kuid_val(kuid_t uid)
+{
+	return 0;
+}
+
+static inline gid_t __kgid_val(kgid_t gid)
+{
+	return 0;
+}
+#endif
 
 #define GLOBAL_ROOT_UID KUIDT_INIT(0)
 #define GLOBAL_ROOT_GID KGIDT_INIT(0)
diff --git a/init/Kconfig b/init/Kconfig
index a905b7301e10..3b9df1aa35db 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -394,6 +394,7 @@ endchoice
 
 config BSD_PROCESS_ACCT
 	bool "BSD Process Accounting"
+	depends on MULTIUSER
 	help
 	  If you say Y here, a user level program will be able to instruct the
 	  kernel (via a special system call) to write process accounting
@@ -420,6 +421,7 @@ config BSD_PROCESS_ACCT_V3
 config TASKSTATS
 	bool "Export task/process statistics through netlink"
 	depends on NET
+	depends on MULTIUSER
 	default n
 	help
 	  Export selected statistics for tasks/processes through the
@@ -1160,6 +1162,7 @@ config CHECKPOINT_RESTORE
 
 menuconfig NAMESPACES
 	bool "Namespaces support" if EXPERT
+	depends on MULTIUSER
 	default !EXPERT
 	help
 	  Provides the way to make tasks work with different objects using
@@ -1356,11 +1359,25 @@ menuconfig EXPERT
 
 config UID16
 	bool "Enable 16-bit UID system calls" if EXPERT
-	depends on HAVE_UID16
+	depends on HAVE_UID16 && MULTIUSER
 	default y
 	help
 	  This enables the legacy 16-bit UID syscall wrappers.
 
+config MULTIUSER
+	bool "Multiple users, groups and capabilities support" if EXPERT
+	default y
+	help
+	  This option enables support for non-root users, groups and
+	  capabilities.
+
+	  If you say N here, all processes will run with UID 0, GID 0, and all
+	  possible capabilities.  Saying N here also compiles out support for
+	  system calls related to UIDs, GIDs, and capabilities, such as setuid,
+	  setgid, and capset.
+
+	  If unsure, say Y here.
+
 config SGETMASK_SYSCALL
 	bool "sgetmask/ssetmask syscalls support" if EXPERT
 	def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH
diff --git a/kernel/Makefile b/kernel/Makefile
index 1408b3353a3c..0f8f8b0bc1bf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,9 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o groups.o smpboot.o
+	    async.o range.o smpboot.o
+
+obj-$(CONFIG_MULTIUSER) += groups.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/capability.c b/kernel/capability.c
index 989f5bfc57dc..45432b54d5c6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str)
 }
 __setup("no_file_caps", file_caps_disable);
 
+#ifdef CONFIG_MULTIUSER
 /*
  * More recent versions of libcap are available from:
  *
@@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap)
 }
 EXPORT_SYMBOL(ns_capable);
 
+
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool capable(int cap)
+{
+	return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+#endif /* CONFIG_MULTIUSER */
+
 /**
  * file_ns_capable - Determine if the file's opener had a capability in effect
  * @file:  The file we want to check
@@ -411,22 +430,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns,
 }
 EXPORT_SYMBOL(file_ns_capable);
 
-/**
- * capable - Determine if the current task has a superior capability in effect
- * @cap: The capability to be tested for
- *
- * Return true if the current task has the given superior capability currently
- * available for use, false if not.
- *
- * This sets PF_SUPERPRIV on the task if the capability is available on the
- * assumption that it's about to be used.
- */
-bool capable(int cap)
-{
-	return ns_capable(&init_user_ns, cap);
-}
-EXPORT_SYMBOL(capable);
-
 /**
  * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
  * @inode: The inode in question
diff --git a/kernel/cred.c b/kernel/cred.c
index e0573a43c7df..ec1c07667ec1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -29,6 +29,9 @@
 
 static struct kmem_cache *cred_jar;
 
+/* init to 2 - one for init_task, one to ensure it is never freed */
+struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+
 /*
  * The initial credentials for the initial task
  */
diff --git a/kernel/groups.c b/kernel/groups.c
index 664411f171b5..74d431d25251 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -9,9 +9,6 @@
 #include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 
-/* init to 2 - one for init_task, one to ensure it is never freed */
-struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
-
 struct group_info *groups_alloc(int gidsetsize)
 {
 	struct group_info *group_info;
diff --git a/kernel/sys.c b/kernel/sys.c
index a03d9cd23ed7..3be344902316 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -325,6 +325,7 @@ out_unlock:
  * SMP: There are not races, the GIDs are checked only by filesystem
  *      operations (as far as semantic preservation is concerned).
  */
+#ifdef CONFIG_MULTIUSER
 SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
 {
 	struct user_namespace *ns = current_user_ns();
@@ -815,6 +816,7 @@ change_okay:
 	commit_creds(new);
 	return old_fsgid;
 }
+#endif /* CONFIG_MULTIUSER */
 
 /**
  * sys_getpid - return the thread group id of the current process
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5adcb0ae3a58..7995ef5868d8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -159,6 +159,20 @@ cond_syscall(sys_uselib);
 cond_syscall(sys_fadvise64);
 cond_syscall(sys_fadvise64_64);
 cond_syscall(sys_madvise);
+cond_syscall(sys_setuid);
+cond_syscall(sys_setregid);
+cond_syscall(sys_setgid);
+cond_syscall(sys_setreuid);
+cond_syscall(sys_setresuid);
+cond_syscall(sys_getresuid);
+cond_syscall(sys_setresgid);
+cond_syscall(sys_getresgid);
+cond_syscall(sys_setgroups);
+cond_syscall(sys_getgroups);
+cond_syscall(sys_setfsuid);
+cond_syscall(sys_setfsgid);
+cond_syscall(sys_capget);
+cond_syscall(sys_capset);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index fb78117b896c..9068e72aa73c 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -1,9 +1,11 @@
 config SUNRPC
 	tristate
+	depends on MULTIUSER
 
 config SUNRPC_GSS
 	tristate
 	select OID_REGISTRY
+	depends on MULTIUSER
 
 config SUNRPC_BACKCHANNEL
 	bool
diff --git a/security/Kconfig b/security/Kconfig
index beb86b500adf..bf4ec46474b6 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -21,6 +21,7 @@ config SECURITY_DMESG_RESTRICT
 config SECURITY
 	bool "Enable different security models"
 	depends on SYSFS
+	depends on MULTIUSER
 	help
 	  This allows you to choose different security modules to be
 	  configured into your kernel.
-- 
cgit v1.3-6-gb490


From 96831c0a6738f88f89e7012f4df0a747514af0a0 Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki <jsitnicki@gmail.com>
Date: Wed, 15 Apr 2015 16:16:44 -0700
Subject: kernel/resource.c: remove deprecated __check_region() and friends

All users of __check_region(), check_region(), and check_mem_region() are
gone.  We got rid of the last user in v4.0-rc1.  Remove them.

bloat-o-meter on x86_64 shows:

add/remove: 0/3 grow/shrink: 0/0 up/down: 0/-102 (-102)
function                                     old     new   delta
__kstrtab___check_region                      15       -     -15
__ksymtab___check_region                      16       -     -16
__check_region                                71       -     -71

Signed-off-by: Jakub Sitnicki <jsitnicki@gmail.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ioport.h |  8 --------
 kernel/resource.c      | 32 --------------------------------
 2 files changed, 40 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 2c5250222278..388e3ae94f7a 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -196,10 +196,8 @@ extern struct resource * __request_region(struct resource *,
 
 /* Compatibility cruft */
 #define release_region(start,n)	__release_region(&ioport_resource, (start), (n))
-#define check_mem_region(start,n)	__check_region(&iomem_resource, (start), (n))
 #define release_mem_region(start,n)	__release_region(&iomem_resource, (start), (n))
 
-extern int __check_region(struct resource *, resource_size_t, resource_size_t);
 extern void __release_region(struct resource *, resource_size_t,
 				resource_size_t);
 #ifdef CONFIG_MEMORY_HOTREMOVE
@@ -207,12 +205,6 @@ extern int release_mem_region_adjustable(struct resource *, resource_size_t,
 				resource_size_t);
 #endif
 
-static inline int __deprecated check_region(resource_size_t s,
-						resource_size_t n)
-{
-	return __check_region(&ioport_resource, s, n);
-}
-
 /* Wrappers for managed devices */
 struct device;
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 19f2357dfda3..90552aab5f2d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res)
  *
  * request_region creates a new busy region.
  *
- * check_region returns non-zero if the area is already busy.
- *
  * release_region releases a matching busy region.
  */
 
@@ -1097,36 +1095,6 @@ struct resource * __request_region(struct resource *parent,
 }
 EXPORT_SYMBOL(__request_region);
 
-/**
- * __check_region - check if a resource region is busy or free
- * @parent: parent resource descriptor
- * @start: resource start address
- * @n: resource region size
- *
- * Returns 0 if the region is free at the moment it is checked,
- * returns %-EBUSY if the region is busy.
- *
- * NOTE:
- * This function is deprecated because its use is racy.
- * Even if it returns 0, a subsequent call to request_region()
- * may fail because another driver etc. just allocated the region.
- * Do NOT use it.  It will be removed from the kernel.
- */
-int __check_region(struct resource *parent, resource_size_t start,
-			resource_size_t n)
-{
-	struct resource * res;
-
-	res = __request_region(parent, start, n, "check-region", 0);
-	if (!res)
-		return -EBUSY;
-
-	release_resource(res);
-	free_resource(res);
-	return 0;
-}
-EXPORT_SYMBOL(__check_region);
-
 /**
  * __release_region - release a previously reserved resource region
  * @parent: parent resource descriptor
-- 
cgit v1.3-6-gb490


From 7a54f46b301cfab8a0d7365aa186545f8b98f22e Mon Sep 17 00:00:00 2001
From: Joel Stanley <joel@jms.id.au>
Date: Wed, 15 Apr 2015 16:16:53 -0700
Subject: kernel/reboot.c: add orderly_reboot for graceful reboot

The kernel has orderly_poweroff which allows the kernel to initiate a
graceful shutdown of userspace, by running /sbin/poweroff.  This adds
orderly_reboot that will cause userspace to shut itself down by calling
/sbin/reboot.

This will be used for shutdown initiated by a system controller on
platforms that do not use ACPI.

orderly_reboot() should be used when the system wants to allow userspace
to gracefully shut itself down.  For cases where the system may imminently
catch on fire, the existing emergency_restart() provides an immediate
reboot without involving userspace.

Signed-off-by: Joel Stanley <joel@jms.id.au>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Kerr <jk@ozlabs.org>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/reboot.h |  3 ++-
 kernel/reboot.c        | 53 +++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 50 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 67fc8fcdc4b0..a7ff409f386d 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -70,7 +70,8 @@ void ctrl_alt_del(void);
 #define POWEROFF_CMD_PATH_LEN	256
 extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN];
 
-extern int orderly_poweroff(bool force);
+extern void orderly_poweroff(bool force);
+extern void orderly_reboot(void);
 
 /*
  * Emergency restart, callable from an interrupt handler.
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 5925f5ae8dff..d20c85d9f8c0 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -387,8 +387,9 @@ void ctrl_alt_del(void)
 }
 
 char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
+static const char reboot_cmd[] = "/sbin/reboot";
 
-static int __orderly_poweroff(bool force)
+static int run_cmd(const char *cmd)
 {
 	char **argv;
 	static char *envp[] = {
@@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force)
 		NULL
 	};
 	int ret;
-
-	argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+	argv = argv_split(GFP_KERNEL, cmd, NULL);
 	if (argv) {
 		ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
 		argv_free(argv);
@@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force)
 		ret = -ENOMEM;
 	}
 
+	return ret;
+}
+
+static int __orderly_reboot(void)
+{
+	int ret;
+
+	ret = run_cmd(reboot_cmd);
+
+	if (ret) {
+		pr_warn("Failed to start orderly reboot: forcing the issue\n");
+		emergency_sync();
+		kernel_restart(NULL);
+	}
+
+	return ret;
+}
+
+static int __orderly_poweroff(bool force)
+{
+	int ret;
+
+	ret = run_cmd(poweroff_cmd);
+
 	if (ret && force) {
 		pr_warn("Failed to start orderly shutdown: forcing the issue\n");
+
 		/*
 		 * I guess this should try to kick off some daemon to sync and
 		 * poweroff asap.  Or not even bother syncing if we're doing an
@@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func);
  * This may be called from any context to trigger a system shutdown.
  * If the orderly shutdown fails, it will force an immediate shutdown.
  */
-int orderly_poweroff(bool force)
+void orderly_poweroff(bool force)
 {
 	if (force) /* do not override the pending "true" */
 		poweroff_force = true;
 	schedule_work(&poweroff_work);
-	return 0;
 }
 EXPORT_SYMBOL_GPL(orderly_poweroff);
 
+static void reboot_work_func(struct work_struct *work)
+{
+	__orderly_reboot();
+}
+
+static DECLARE_WORK(reboot_work, reboot_work_func);
+
+/**
+ * orderly_reboot - Trigger an orderly system reboot
+ *
+ * This may be called from any context to trigger a system reboot.
+ * If the orderly reboot fails, it will force an immediate reboot.
+ */
+void orderly_reboot(void)
+{
+	schedule_work(&reboot_work);
+}
+EXPORT_SYMBOL_GPL(orderly_reboot);
+
 static int __init reboot_setup(char *str)
 {
 	for (;;) {
-- 
cgit v1.3-6-gb490


From 7b1460eccad0621e0c48f52bbedeb7adddc55ac9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 15 Apr 2015 16:16:59 -0700
Subject: printk: comment pr_cont() stating it is only to continue a line

KERN_CONT is nicely commented in kern_levels.h, but pr_cont() is now used
more often, and it lacks the comment stating what it is used for.  It can
be confused as continuing the log level, but that is not its purpose.  Its
purpose is to continue a line that had no newline enclosed.  This should
be documented by pr_cont() as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index baa3f97d8ce8..9b30871c9149 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -255,6 +255,11 @@ extern asmlinkage void dump_stack(void) __cold;
 	printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_info(fmt, ...) \
 	printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
+/*
+ * Like KERN_CONT, pr_cont() should only be used when continuing
+ * a line with no newline ('\n') enclosed. Otherwise it defaults
+ * back to KERN_DEFAULT.
+ */
 #define pr_cont(fmt, ...) \
 	printk(KERN_CONT fmt, ##__VA_ARGS__)
 
-- 
cgit v1.3-6-gb490


From 41416f2330112d29f2cfa337bfc7e672bf0c2768 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 15 Apr 2015 16:17:28 -0700
Subject: lib/string_helpers.c: change semantics of string_escape_mem

The current semantics of string_escape_mem are inadequate for one of its
current users, vsnprintf().  If that is to honour its contract, it must
know how much space would be needed for the entire escaped buffer, and
string_escape_mem provides no way of obtaining that (short of allocating a
large enough buffer (~4 times input string) to let it play with, and
that's definitely a big no-no inside vsnprintf).

So change the semantics for string_escape_mem to be more snprintf-like:
Return the size of the output that would be generated if the destination
buffer was big enough, but of course still only write to the part of dst
it is allowed to, and (contrary to snprintf) don't do '\0'-termination.
It is then up to the caller to detect whether output was truncated and to
append a '\0' if desired.  Also, we must output partial escape sequences,
otherwise a call such as snprintf(buf, 3, "%1pE", "\123") would cause
printf to write a \0 to buf[2] but leaving buf[0] and buf[1] with whatever
they previously contained.

This also fixes a bug in the escaped_string() helper function, which used
to unconditionally pass a length of "end-buf" to string_escape_mem();
since the latter doesn't check osz for being insanely large, it would
happily write to dst.  For example, kasprintf(GFP_KERNEL, "something and
then %pE", ...); is an easy way to trigger an oops.

In test-string_helpers.c, the -ENOMEM test is replaced with testing for
getting the expected return value even if the buffer is too small.  We
also ensure that nothing is written (by relying on a NULL pointer deref)
if the output size is 0 by passing NULL - this has to work for
kasprintf("%pE") to work.

In net/sunrpc/cache.c, I think qword_add still has the same semantics.
Someone should definitely double-check this.

In fs/proc/array.c, I made the minimum possible change, but longer-term it
should stop poking around in seq_file internals.

[andriy.shevchenko@linux.intel.com: simplify qword_add]
[andriy.shevchenko@linux.intel.com: add missed curly braces]
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/array.c                |  4 ++--
 include/linux/string_helpers.h |  8 +++----
 lib/string_helpers.c           | 49 ++++++------------------------------------
 lib/test-string_helpers.c      | 40 +++++++++++++++++-----------------
 lib/vsprintf.c                 |  8 +++++--
 net/sunrpc/cache.c             |  8 ++++---
 6 files changed, 44 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index a4490c0a4644..13f047ad08e4 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -99,8 +99,8 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
 	buf = m->buf + m->count;
 
 	/* Ignore error for now */
-	string_escape_str(tcomm, &buf, m->size - m->count,
-			  ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+	buf += string_escape_str(tcomm, buf, m->size - m->count,
+				 ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
 
 	m->count = buf - m->buf;
 	seq_putc(m, '\n');
diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h
index 657571817260..0991913f4953 100644
--- a/include/linux/string_helpers.h
+++ b/include/linux/string_helpers.h
@@ -47,22 +47,22 @@ static inline int string_unescape_any_inplace(char *buf)
 #define ESCAPE_ANY_NP		(ESCAPE_ANY | ESCAPE_NP)
 #define ESCAPE_HEX		0x20
 
-int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz,
+int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		unsigned int flags, const char *esc);
 
 static inline int string_escape_mem_any_np(const char *src, size_t isz,
-		char **dst, size_t osz, const char *esc)
+		char *dst, size_t osz, const char *esc)
 {
 	return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc);
 }
 
-static inline int string_escape_str(const char *src, char **dst, size_t sz,
+static inline int string_escape_str(const char *src, char *dst, size_t sz,
 		unsigned int flags, const char *esc)
 {
 	return string_escape_mem(src, strlen(src), dst, sz, flags, esc);
 }
 
-static inline int string_escape_str_any_np(const char *src, char **dst,
+static inline int string_escape_str_any_np(const char *src, char *dst,
 		size_t sz, const char *esc)
 {
 	return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc);
diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 9c48ddad0f0d..1826c7407258 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -274,11 +274,6 @@ static bool escape_space(unsigned char c, char **dst, char *end)
 		return false;
 	}
 
-	if (out + 2 > end) {
-		*dst = out + 2;
-		return true;
-	}
-
 	if (out < end)
 		*out = '\\';
 	++out;
@@ -309,11 +304,6 @@ static bool escape_special(unsigned char c, char **dst, char *end)
 		return false;
 	}
 
-	if (out + 2 > end) {
-		*dst = out + 2;
-		return true;
-	}
-
 	if (out < end)
 		*out = '\\';
 	++out;
@@ -332,11 +322,6 @@ static bool escape_null(unsigned char c, char **dst, char *end)
 	if (c)
 		return false;
 
-	if (out + 2 > end) {
-		*dst = out + 2;
-		return true;
-	}
-
 	if (out < end)
 		*out = '\\';
 	++out;
@@ -352,11 +337,6 @@ static bool escape_octal(unsigned char c, char **dst, char *end)
 {
 	char *out = *dst;
 
-	if (out + 4 > end) {
-		*dst = out + 4;
-		return true;
-	}
-
 	if (out < end)
 		*out = '\\';
 	++out;
@@ -378,11 +358,6 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
 {
 	char *out = *dst;
 
-	if (out + 4 > end) {
-		*dst = out + 4;
-		return true;
-	}
-
 	if (out < end)
 		*out = '\\';
 	++out;
@@ -449,20 +424,17 @@ static bool escape_hex(unsigned char c, char **dst, char *end)
  * it if needs.
  *
  * Return:
- * The amount of the characters processed to the destination buffer, or
- * %-ENOMEM if the size of buffer is not enough to put an escaped character is
- * returned.
- *
- * Even in the case of error @dst pointer will be updated to point to the byte
- * after the last processed character.
+ * The total size of the escaped output that would be generated for
+ * the given input and flags. To check whether the output was
+ * truncated, compare the return value to osz. There is room left in
+ * dst for a '\0' terminator if and only if ret < osz.
  */
-int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz,
+int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz,
 		      unsigned int flags, const char *esc)
 {
-	char *p = *dst;
+	char *p = dst;
 	char *end = p + osz;
 	bool is_dict = esc && *esc;
-	int ret;
 
 	while (isz--) {
 		unsigned char c = *src++;
@@ -502,13 +474,6 @@ int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz,
 		escape_passthrough(c, &p, end);
 	}
 
-	if (p > end) {
-		*dst = end;
-		return -ENOMEM;
-	}
-
-	ret = p - *dst;
-	*dst = p;
-	return ret;
+	return p - dst;
 }
 EXPORT_SYMBOL(string_escape_mem);
diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c
index ab0d30e1e18f..8e376efd88a4 100644
--- a/lib/test-string_helpers.c
+++ b/lib/test-string_helpers.c
@@ -260,16 +260,28 @@ static __init const char *test_string_find_match(const struct test_string_2 *s2,
 	return NULL;
 }
 
+static __init void
+test_string_escape_overflow(const char *in, int p, unsigned int flags, const char *esc,
+			    int q_test, const char *name)
+{
+	int q_real;
+
+	q_real = string_escape_mem(in, p, NULL, 0, flags, esc);
+	if (q_real != q_test)
+		pr_warn("Test '%s' failed: flags = %u, osz = 0, expected %d, got %d\n",
+			name, flags, q_test, q_real);
+}
+
 static __init void test_string_escape(const char *name,
 				      const struct test_string_2 *s2,
 				      unsigned int flags, const char *esc)
 {
-	int q_real = 512;
-	char *out_test = kmalloc(q_real, GFP_KERNEL);
-	char *out_real = kmalloc(q_real, GFP_KERNEL);
+	size_t out_size = 512;
+	char *out_test = kmalloc(out_size, GFP_KERNEL);
+	char *out_real = kmalloc(out_size, GFP_KERNEL);
 	char *in = kmalloc(256, GFP_KERNEL);
-	char *buf = out_real;
 	int p = 0, q_test = 0;
+	int q_real;
 
 	if (!out_test || !out_real || !in)
 		goto out;
@@ -301,29 +313,19 @@ static __init void test_string_escape(const char *name,
 		q_test += len;
 	}
 
-	q_real = string_escape_mem(in, p, &buf, q_real, flags, esc);
+	q_real = string_escape_mem(in, p, out_real, out_size, flags, esc);
 
 	test_string_check_buf(name, flags, in, p, out_real, q_real, out_test,
 			      q_test);
+
+	test_string_escape_overflow(in, p, flags, esc, q_test, name);
+
 out:
 	kfree(in);
 	kfree(out_real);
 	kfree(out_test);
 }
 
-static __init void test_string_escape_nomem(void)
-{
-	char *in = "\eb \\C\007\"\x90\r]";
-	char out[64], *buf = out;
-	int rc = -ENOMEM, ret;
-
-	ret = string_escape_str_any_np(in, &buf, strlen(in), NULL);
-	if (ret == rc)
-		return;
-
-	pr_err("Test 'escape nomem' failed: got %d instead of %d\n", ret, rc);
-}
-
 static int __init test_string_helpers_init(void)
 {
 	unsigned int i;
@@ -342,8 +344,6 @@ static int __init test_string_helpers_init(void)
 	for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++)
 		test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1);
 
-	test_string_escape_nomem();
-
 	return -EINVAL;
 }
 module_init(test_string_helpers_init);
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 4da1e7aaf9d5..3a1e0843f9a2 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1235,8 +1235,12 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
 
 	len = spec.field_width < 0 ? 1 : spec.field_width;
 
-	/* Ignore the error. We print as many characters as we can */
-	string_escape_mem(addr, len, &buf, end - buf, flags, NULL);
+	/*
+	 * string_escape_mem() writes as many characters as it can to
+	 * the given buffer, and returns the total size of the output
+	 * had the buffer been big enough.
+	 */
+	buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL);
 
 	return buf;
 }
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 5199bb1a017e..2928afffbb81 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1072,10 +1072,12 @@ void qword_add(char **bpp, int *lp, char *str)
 
 	if (len < 0) return;
 
-	ret = string_escape_str(str, &bp, len, ESCAPE_OCTAL, "\\ \n\t");
-	if (ret < 0 || ret == len)
+	ret = string_escape_str(str, bp, len, ESCAPE_OCTAL, "\\ \n\t");
+	if (ret >= len) {
+		bp += len;
 		len = -1;
-	else {
+	} else {
+		bp += ret;
 		len -= ret;
 		*bp++ = ' ';
 		len--;
-- 
cgit v1.3-6-gb490


From 89c1e79eb302349fcaf0697bc9116a4ff16bfeb0 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Wed, 15 Apr 2015 16:17:42 -0700
Subject: linux/bitmap.h: improve BITMAP_{LAST,FIRST}_WORD_MASK

The macro BITMAP_LAST_WORD_MASK can be implemented without a conditional,
which will generally lead to slightly better generated code (221 bytes
saved for allmodconfig-GCOV_KERNEL, ~2k with GCOV_KERNEL).  As a small
bonus, this also ensures that the nbits parameter is expanded exactly
once.

In BITMAP_FIRST_WORD_MASK, if start is signed gcc is technically allowed
to assume it is positive (or divisible by BITS_PER_LONG), and hence just
do the simple mask.  It doesn't seem to use this, and even on an
architecture like x86 where the shift only depends on the lower 5 or 6
bits, and these bits are not affected by the signedness of the expression,
gcc still generates code to compute the C99 mandated value of start %
BITS_PER_LONG.  So just use a mask explicitly, also for consistency with
BITMAP_LAST_WORD_MASK.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Tejun Heo <tj@kernel.org>
Reviewed-by: George Spelvin <linux@horizon.com>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitmap.h | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index dbfbf4990005..be4fa5ddf36c 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -172,12 +172,8 @@ extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int
 extern int bitmap_print_to_pagebuf(bool list, char *buf,
 				   const unsigned long *maskp, int nmaskbits);
 
-#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
-#define BITMAP_LAST_WORD_MASK(nbits)					\
-(									\
-	((nbits) % BITS_PER_LONG) ?					\
-		(1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL		\
-)
+#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
+#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
 #define small_const_nbits(nbits) \
 	(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
-- 
cgit v1.3-6-gb490