From f2b91d8d385d2cef3a1e3b3846f2dde4a6720c43 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 15 Apr 2015 16:12:51 -0700 Subject: vfs: delete vfs_readdir function declaration vfs_readdir() was replaced by iterate_dir() in commit 5c0ba4e0762e ("[readdir] introduce iterate_dir() and dir_context"). Signed-off-by: Zhang Zhen Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index d502e5436c84..60733bdc74b4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2684,7 +2684,6 @@ void inode_sub_bytes(struct inode *inode, loff_t bytes); loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); -extern int vfs_readdir(struct file *, filldir_t, void *); extern int iterate_dir(struct file *, struct dir_context *); extern int vfs_stat(const char __user *, struct kstat *); -- cgit v1.3-6-gb490 From d7e4a2ea51c1b0ac0b2d53f5ab4a2e878b1c4aec Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 15 Apr 2015 16:12:57 -0700 Subject: mm: refactor zone_movable_is_highmem() All callers of zone_movable_is_highmem are under #ifdef CONFIG_HIGHMEM, so the else branch return 0 is not needed. Signed-off-by: Zhang Zhen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2782df47101e..54d74f6eb233 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -842,16 +842,16 @@ static inline int populated_zone(struct zone *zone) extern int movable_zone; +#ifdef CONFIG_HIGHMEM static inline int zone_movable_is_highmem(void) { -#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP return movable_zone == ZONE_HIGHMEM; -#elif defined(CONFIG_HIGHMEM) - return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #else - return 0; + return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #endif } +#endif static inline int is_highmem_idx(enum zone_type idx) { -- cgit v1.3-6-gb490 From e8c6158fef15a1532bd5242a0cd88565eedabe61 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Apr 2015 16:13:08 -0700 Subject: mm: consolidate all page-flags helpers in Currently we take a naive approach to page flags on compound pages - we set the flag on the page without consideration if the flag makes sense for tail page or for compound page in general. This patchset try to sort this out by defining per-flag policy on what need to be done if page-flag helper operate on compound page. The last patch in the patchset also sanitizes usege of page->mapping for tail pages. We don't define the meaning of page->mapping for tail pages. Currently it's always NULL, which can be inconsistent with head page and potentially lead to problems. For now I caught one case of illegal usage of page flags or ->mapping: sound subsystem allocates pages with __GFP_COMP and maps them with PTEs. It leads to setting dirty bit on tail pages and access to tail_page's ->mapping. I don't see any bad behaviour caused by this, but worth fixing anyway. This patchset makes more sense if you take my THP refcounting into account: we will see more compound pages mapped with PTEs and we need to define behaviour of flags on compound pages to avoid bugs. This patch (of 16): We have page-flags helper function declarations/definitions spread over several header files. Let's consolidate them in . Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Acked-by: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Naoya Horiguchi Cc: Steve Capper Cc: "Aneesh Kumar K.V" Cc: Johannes Weiner Cc: Michal Hocko Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 7 ---- include/linux/ksm.h | 17 -------- include/linux/mm.h | 81 -------------------------------------- include/linux/page-flags.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 96 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7b5785032049..1a782733a420 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -41,8 +41,6 @@ extern int hugetlb_max_hstate __read_mostly; struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); void hugepage_put_subpool(struct hugepage_subpool *spool); -int PageHuge(struct page *page); - void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); @@ -109,11 +107,6 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, #else /* !CONFIG_HUGETLB_PAGE */ -static inline int PageHuge(struct page *page) -{ - return 0; -} - static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { } diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 3be6bb18562d..7ae216a39c9e 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -35,18 +35,6 @@ static inline void ksm_exit(struct mm_struct *mm) __ksm_exit(mm); } -/* - * A KSM page is one of those write-protected "shared pages" or "merged pages" - * which KSM maps into multiple mms, wherever identical anonymous page content - * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any - * anon_vma, but to that page's node of the stable tree. - */ -static inline int PageKsm(struct page *page) -{ - return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == - (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); -} - static inline struct stable_node *page_stable_node(struct page *page) { return PageKsm(page) ? page_rmapping(page) : NULL; @@ -87,11 +75,6 @@ static inline void ksm_exit(struct mm_struct *mm) { } -static inline int PageKsm(struct page *page) -{ - return 0; -} - #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6571dd78e984..fb1fc38b01ce 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -494,15 +494,6 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -#ifdef CONFIG_HUGETLB_PAGE -extern int PageHeadHuge(struct page *page_head); -#else /* CONFIG_HUGETLB_PAGE */ -static inline int PageHeadHuge(struct page *page_head) -{ - return 0; -} -#endif /* CONFIG_HUGETLB_PAGE */ - static inline bool __compound_tail_refcounted(struct page *page) { return !PageSlab(page) && !PageHeadHuge(page); @@ -571,53 +562,6 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } -/* - * PageBuddy() indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * - * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to - * -2 so that an underflow of the page_mapcount() won't be mistaken - * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very - * efficiently by most CPU architectures. - */ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) - -static inline int PageBuddy(struct page *page) -{ - return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; -} - -static inline void __SetPageBuddy(struct page *page) -{ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); - atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); -} - -static inline void __ClearPageBuddy(struct page *page) -{ - VM_BUG_ON_PAGE(!PageBuddy(page), page); - atomic_set(&page->_mapcount, -1); -} - -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) - -static inline int PageBalloon(struct page *page) -{ - return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE; -} - -static inline void __SetPageBalloon(struct page *page) -{ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); - atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE); -} - -static inline void __ClearPageBalloon(struct page *page) -{ - VM_BUG_ON_PAGE(!PageBalloon(page), page); - atomic_set(&page->_mapcount, -1); -} - void put_page(struct page *page); void put_pages_list(struct list_head *pages); @@ -1006,26 +950,6 @@ void page_address_init(void); #define page_address_init() do { } while(0) #endif -/* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space; - * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. - * - * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, - * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; - * and then page->mapping points, not to an anon_vma, but to a private - * structure which KSM associates with that merged page. See ksm.h. - * - * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. - * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. - */ -#define PAGE_MAPPING_ANON 1 -#define PAGE_MAPPING_KSM 2 -#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) - extern struct address_space *page_mapping(struct page *page); /* Neutral page->mapping pointer to address_space or anon_vma or other */ @@ -1045,11 +969,6 @@ struct address_space *page_file_mapping(struct page *page) return page->mapping; } -static inline int PageAnon(struct page *page) -{ - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; -} - /* * Return the pagecache index of the passed page. Regular pagecache pages * use ->index whereas swapcache pages use ->private diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c851ff92d5b3..84d10b65cec6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -289,6 +289,47 @@ PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif +/* + * On an anonymous page mapped into a user virtual memory area, + * page->mapping points to its anon_vma, not to a struct address_space; + * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. + * + * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, + * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; + * and then page->mapping points, not to an anon_vma, but to a private + * structure which KSM associates with that merged page. See ksm.h. + * + * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. + * + * Please note that, confusingly, "page_mapping" refers to the inode + * address_space which maps the page from disk; whereas "page_mapped" + * refers to user virtual address space into which the page is mapped. + */ +#define PAGE_MAPPING_ANON 1 +#define PAGE_MAPPING_KSM 2 +#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) + +static inline int PageAnon(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; +} + +#ifdef CONFIG_KSM +/* + * A KSM page is one of those write-protected "shared pages" or "merged pages" + * which KSM maps into multiple mms, wherever identical anonymous page content + * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any + * anon_vma, but to that page's node of the stable tree. + */ +static inline int PageKsm(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); +} +#else +TESTPAGEFLAG_FALSE(Ksm) +#endif + u64 stable_page_flags(struct page *page); static inline int PageUptodate(struct page *page) @@ -426,6 +467,14 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ +#ifdef CONFIG_HUGETLB_PAGE +int PageHuge(struct page *page); +int PageHeadHuge(struct page *page); +#else +TESTPAGEFLAG_FALSE(Huge) +TESTPAGEFLAG_FALSE(HeadHuge) +#endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for @@ -479,6 +528,53 @@ static inline int PageTransTail(struct page *page) } #endif +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + * + * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to + * -2 so that an underflow of the page_mapcount() won't be mistaken + * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very + * efficiently by most CPU architectures. + */ +#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) + +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); + atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON_PAGE(!PageBuddy(page), page); + atomic_set(&page->_mapcount, -1); +} + +#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) + +static inline int PageBalloon(struct page *page) +{ + return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE; +} + +static inline void __SetPageBalloon(struct page *page) +{ + VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); + atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE); +} + +static inline void __ClearPageBalloon(struct page *page) +{ + VM_BUG_ON_PAGE(!PageBalloon(page), page); + atomic_set(&page->_mapcount, -1); +} + /* * If network-based swap is enabled, sl*b must keep track of whether pages * were allocated from pfmemalloc reserves. -- cgit v1.3-6-gb490 From 8d63d99a5dfbdb997d12dd3c07b2070ca723db3b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Apr 2015 16:13:12 -0700 Subject: mm: avoid tail page refcounting on non-THP compound pages THP uses tail page refcounting to be able to split huge pages at any time. Tail page refcounting is not needed for other users of compound pages and it's harmful because of overhead. We try to exclude non-THP pages from tail page refcounting using __compound_tail_refcounted() check. It excludes most common non-THP compound pages: SL*B and hugetlb, but it doesn't catch rest of __GFP_COMP users -- drivers. And it's not only about overhead. Drivers might want to use compound pages to get refcounting semantics suitable for mapping high-order pages to userspace. But tail page refcounting breaks it. Tail page refcounting uses ->_mapcount in tail pages to store GUP pins on them. It means GUP pins would affect page_mapcount() for tail pages. It's not a problem for THP, because it never maps tail pages. But unlike THP, drivers map parts of compound pages with PTEs and it makes page_mapcount() be called for tail pages. In particular, GUP pins would shift PSS up and affect /proc/kpagecount for such pages. But, I'm not aware about anything which can lead to crash or other serious misbehaviour. Since currently all THP pages are anonymous and all drivers pages are not, we can fix the __compound_tail_refcounted() check by requiring PageAnon() to enable tail page refcounting. Signed-off-by: Kirill A. Shutemov Acked-by: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fb1fc38b01ce..515ec5045b60 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -496,7 +496,7 @@ static inline int page_count(struct page *page) static inline bool __compound_tail_refcounted(struct page *page) { - return !PageSlab(page) && !PageHeadHuge(page); + return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); } /* -- cgit v1.3-6-gb490 From 5bbe3547aa3ba5242366a322a28996872301b703 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Wed, 15 Apr 2015 16:13:20 -0700 Subject: mm: allow compaction of unevictable pages Currently, pages which are marked as unevictable are protected from compaction, but not from other types of migration. The POSIX real time extension explicitly states that mlock() will prevent a major page fault, but the spirit of this is that mlock() should give a process the ability to control sources of latency, including minor page faults. However, the mlock manpage only explicitly says that a locked page will not be written to swap and this can cause some confusion. The compaction code today does not give a developer who wants to avoid swap but wants to have large contiguous areas available any method to achieve this state. This patch introduces a sysctl for controlling compaction behavior with respect to the unevictable lru. Users who demand no page faults after a page is present can set compact_unevictable_allowed to 0 and users who need the large contiguous areas can enable compaction on locked memory by leaving the default value of 1. To illustrate this problem I wrote a quick test program that mmaps a large number of 1MB files filled with random data. These maps are created locked and read only. Then every other mmap is unmapped and I attempt to allocate huge pages to the static huge page pool. When the compact_unevictable_allowed sysctl is 0, I cannot allocate hugepages after fragmenting memory. When the value is set to 1, allocations succeed. Signed-off-by: Eric B Munson Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: David Rientjes Acked-by: Rik van Riel Cc: Vlastimil Babka Cc: Thomas Gleixner Cc: Christoph Lameter Cc: Peter Zijlstra Cc: Mel Gorman Cc: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 11 +++++++++++ include/linux/compaction.h | 1 + kernel/sysctl.c | 9 +++++++++ mm/compaction.c | 7 +++++++ 4 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 902b4574acfb..9832ec52f859 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -21,6 +21,7 @@ Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes - block_dump - compact_memory +- compact_unevictable_allowed - dirty_background_bytes - dirty_background_ratio - dirty_bytes @@ -106,6 +107,16 @@ huge pages although processes will also directly compact memory as required. ============================================================== +compact_unevictable_allowed + +Available only when CONFIG_COMPACTION is set. When set to 1, compaction is +allowed to examine the unevictable lru (mlocked pages) for pages to compact. +This should be used on systems where stalls for minor page faults are an +acceptable trade for large contiguous free memory. Set to 0 to prevent +compaction from moving pages that are unevictable. Default value is 1. + +============================================================== + dirty_background_bytes Contains the amount of dirty memory at which the background kernel diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a014559e4a49..aa8f61cf3a19 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -34,6 +34,7 @@ extern int sysctl_compaction_handler(struct ctl_table *table, int write, extern int sysctl_extfrag_threshold; extern int sysctl_extfrag_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); +extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c0eabd41886..42b7fc2860c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1335,6 +1335,15 @@ static struct ctl_table vm_table[] = { .extra1 = &min_extfrag_threshold, .extra2 = &max_extfrag_threshold, }, + { + .procname = "compact_unevictable_allowed", + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + .extra2 = &one, + }, #endif /* CONFIG_COMPACTION */ { diff --git a/mm/compaction.c b/mm/compaction.c index a18201a8124e..570426edcadf 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1046,6 +1046,12 @@ typedef enum { ISOLATE_SUCCESS, /* Pages isolated, migrate */ } isolate_migrate_t; +/* + * Allow userspace to control policy on scanning the unevictable LRU for + * compactable pages. + */ +int sysctl_compact_unevictable_allowed __read_mostly = 1; + /* * Isolate all pages that can be migrated from the first suitable block, * starting at the block pointed to by the migrate scanner pfn within @@ -1057,6 +1063,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long low_pfn, end_pfn; struct page *page; const isolate_mode_t isolate_mode = + (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); /* -- cgit v1.3-6-gb490 From cc5993bd7b8cff4a3e37042ee1358d1d5eafa70c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:13:26 -0700 Subject: mm: rename deactivate_page to deactivate_file_page "deactivate_page" was created for file invalidation so it has too specific logic for file-backed pages. So, let's change the name of the function and date to a file-specific one and yield the generic name. Signed-off-by: Minchan Kim Cc: Michal Hocko Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Shaohua Li Cc: Wang, Yalin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/swap.c | 24 ++++++++++++------------ mm/truncate.c | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7067eca501e2..cee108cbe2d5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,7 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); -extern void deactivate_page(struct page *page); +extern void deactivate_file_page(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); diff --git a/mm/swap.c b/mm/swap.c index cd3a5e64cea9..e3a4feac9b0e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -42,7 +42,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -743,7 +743,7 @@ void lru_cache_add_active_or_unevictable(struct page *page, * be write it out by flusher threads as this is much more effective * than the single-page writeout from reclaim. */ -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, void *arg) { int lru, file; @@ -811,36 +811,36 @@ void lru_add_drain_cpu(int cpu) local_irq_restore(flags); } - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); activate_page_drain(cpu); } /** - * deactivate_page - forcefully deactivate a page + * deactivate_file_page - forcefully deactivate a file page * @page: page to deactivate * * This function hints the VM that @page is a good reclaim candidate, * for example if its invalidation fails due to the page being dirty * or under writeback. */ -void deactivate_page(struct page *page) +void deactivate_file_page(struct page *page) { /* - * In a workload with many unevictable page such as mprotect, unevictable - * page deactivation for accelerating reclaim is pointless. + * In a workload with many unevictable page such as mprotect, + * unevictable page deactivation for accelerating reclaim is pointless. */ if (PageUnevictable(page)) return; if (likely(get_page_unless_zero(page))) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); if (!pagevec_add(pvec, page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + put_cpu_var(lru_deactivate_file_pvecs); } } @@ -872,7 +872,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); diff --git a/mm/truncate.c b/mm/truncate.c index 7a9d8a3cb143..66af9031fae8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -490,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, * of interest and try to speed up its reclaim. */ if (!ret) - deactivate_page(page); + deactivate_file_page(page); count += ret; } pagevec_remove_exceptionals(&pvec); -- cgit v1.3-6-gb490 From c6a918200c4f4ebf74b7e1ae4fea9115c7b297f8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Apr 2015 16:13:36 -0700 Subject: hugetlbfs: add minimum size tracking fields to subpool structure hugetlbfs allocates huge pages from the global pool as needed. Even if the global pool contains a sufficient number pages for the filesystem size at mount time, those global pages could be grabbed for some other use. As a result, filesystem huge page allocations may fail due to lack of pages. Applications such as a database want to use huge pages for performance reasons. hugetlbfs filesystem semantics with ownership and modes work well to manage access to a pool of huge pages. However, the application would like some reasonable assurance that allocations will not fail due to a lack of huge pages. At application startup time, the application would like to configure itself to use a specific number of huge pages. Before starting, the application can check to make sure that enough huge pages exist in the system global pools. However, there are no guarantees that those pages will be available when needed by the application. What the application wants is exclusive use of a subset of huge pages. Add a new hugetlbfs mount option 'min_size=' to indicate that the specified number of pages will be available for use by the filesystem. At mount time, this number of huge pages will be reserved for exclusive use of the filesystem. If there is not a sufficient number of free pages, the mount will fail. As pages are allocated to and freeed from the filesystem, the number of reserved pages is adjusted so that the specified minimum is maintained. This patch (of 4): Add a field to the subpool structure to indicate the minimimum number of huge pages to always be used by this subpool. This minimum count includes allocated pages as well as reserved pages. If the minimum number of pages for the subpool have not been allocated, pages are reserved up to this minimum. An additional field (rsv_hpages) is used to track the number of pages reserved to meet this minimum size. The hstate pointer in the subpool is convenient to have when reserving and unreserving the pages. Signed-off-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 +++++++- mm/hugetlb.c | 3 +-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 1a782733a420..d1a77b87408d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -22,7 +22,13 @@ struct mmu_gather; struct hugepage_subpool { spinlock_t lock; long count; - long max_hpages, used_hpages; + long max_hpages; /* Maximum huge pages or -1 if no maximum. */ + long used_hpages; /* Used count against maximum, includes */ + /* both alloced and reserved pages. */ + struct hstate *hstate; + long min_hpages; /* Minimum huge pages or -1 if no minimum. */ + long rsv_hpages; /* Pages reserved against global pool to */ + /* sasitfy minimum size. */ }; struct resv_map { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8874c8ad55aa..4494976c2042 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -77,14 +77,13 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) { struct hugepage_subpool *spool; - spool = kmalloc(sizeof(*spool), GFP_KERNEL); + spool = kzalloc(sizeof(*spool), GFP_KERNEL); if (!spool) return NULL; spin_lock_init(&spool->lock); spool->count = 1; spool->max_hpages = nr_blocks; - spool->used_hpages = 0; return spool; } -- cgit v1.3-6-gb490 From 7ca02d0ae586fe7df59632966a64f3f1a756ef05 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 15 Apr 2015 16:13:42 -0700 Subject: hugetlbfs: accept subpool min_size mount option and setup accordingly Make 'min_size=' be an option when mounting a hugetlbfs. This option takes the same value as the 'size' option. min_size can be specified without specifying size. If both are specified, min_size must be less that or equal to size else the mount will fail. If min_size is specified, then at mount time an attempt is made to reserve min_size pages. If the reservation fails, the mount fails. At umount time, the reserved pages are released. Signed-off-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Aneesh Kumar Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 90 ++++++++++++++++++++++++++++++++++++++----------- include/linux/hugetlb.h | 3 +- mm/hugetlb.c | 25 +++++++++++--- 3 files changed, 94 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index db76cec3ce21..3a8f12762821 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -47,9 +47,10 @@ struct hugetlbfs_config { kuid_t uid; kgid_t gid; umode_t mode; - long nr_blocks; + long max_hpages; long nr_inodes; struct hstate *hstate; + long min_hpages; }; struct hugetlbfs_inode_info { @@ -67,7 +68,7 @@ int sysctl_hugetlb_shm_group; enum { Opt_size, Opt_nr_inodes, Opt_mode, Opt_uid, Opt_gid, - Opt_pagesize, + Opt_pagesize, Opt_min_size, Opt_err, }; @@ -78,6 +79,7 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_pagesize, "pagesize=%s"}, + {Opt_min_size, "min_size=%s"}, {Opt_err, NULL}, }; @@ -754,14 +756,38 @@ static const struct super_operations hugetlbfs_ops = { .show_options = generic_show_options, }; +enum { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +/* + * Convert size option passed from command line to number of huge pages + * in the pool specified by hstate. Size option could be in bytes + * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). + */ +static long long +hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, + int val_type) +{ + if (val_type == NO_SIZE) + return -1; + + if (val_type == SIZE_PERCENT) { + size_opt <<= huge_page_shift(h); + size_opt *= h->max_huge_pages; + do_div(size_opt, 100); + } + + size_opt >>= huge_page_shift(h); + return size_opt; +} + static int hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) { char *p, *rest; substring_t args[MAX_OPT_ARGS]; int option; - unsigned long long size = 0; - enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; + unsigned long long max_size_opt = 0, min_size_opt = 0; + int max_val_type = NO_SIZE, min_val_type = NO_SIZE; if (!options) return 0; @@ -799,10 +825,10 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) /* memparse() will accept a K/M/G without a digit */ if (!isdigit(*args[0].from)) goto bad_val; - size = memparse(args[0].from, &rest); - setsize = SIZE_STD; + max_size_opt = memparse(args[0].from, &rest); + max_val_type = SIZE_STD; if (*rest == '%') - setsize = SIZE_PERCENT; + max_val_type = SIZE_PERCENT; break; } @@ -825,6 +851,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; } + case Opt_min_size: { + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + min_size_opt = memparse(args[0].from, &rest); + min_val_type = SIZE_STD; + if (*rest == '%') + min_val_type = SIZE_PERCENT; + break; + } + default: pr_err("Bad mount option: \"%s\"\n", p); return -EINVAL; @@ -832,15 +869,22 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) } } - /* Do size after hstate is set up */ - if (setsize > NO_SIZE) { - struct hstate *h = pconfig->hstate; - if (setsize == SIZE_PERCENT) { - size <<= huge_page_shift(h); - size *= h->max_huge_pages; - do_div(size, 100); - } - pconfig->nr_blocks = (size >> huge_page_shift(h)); + /* + * Use huge page pool size (in hstate) to convert the size + * options to number of huge pages. If NO_SIZE, -1 is returned. + */ + pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + max_size_opt, max_val_type); + pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + min_size_opt, min_val_type); + + /* + * If max_size was specified, then min_size must be smaller + */ + if (max_val_type > NO_SIZE && + pconfig->min_hpages > pconfig->max_hpages) { + pr_err("minimum size can not be greater than maximum size\n"); + return -EINVAL; } return 0; @@ -859,12 +903,13 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) save_mount_options(sb, data); - config.nr_blocks = -1; /* No limit on size by default */ + config.max_hpages = -1; /* No limit on size by default */ config.nr_inodes = -1; /* No limit on number of inodes by default */ config.uid = current_fsuid(); config.gid = current_fsgid(); config.mode = 0755; config.hstate = &default_hstate; + config.min_hpages = -1; /* No default minimum size */ ret = hugetlbfs_parse_options(data, &config); if (ret) return ret; @@ -878,8 +923,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; sbinfo->spool = NULL; - if (config.nr_blocks != -1) { - sbinfo->spool = hugepage_new_subpool(config.nr_blocks); + /* + * Allocate and initialize subpool if maximum or minimum size is + * specified. Any needed reservations (for minimim size) are taken + * taken when the subpool is created. + */ + if (config.max_hpages != -1 || config.min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(config.hstate, + config.max_hpages, + config.min_hpages); if (!sbinfo->spool) goto out_free; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1a77b87408d..5713c49a4a5c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -44,7 +44,8 @@ extern int hugetlb_max_hstate __read_mostly; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) -struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); +struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, + long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 499cb72c74b1..995c8d65a95c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; +/* Forward declaration */ +static int hugetlb_acct_memory(struct hstate *h, long delta); + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { bool free = (spool->count == 0) && (spool->used_hpages == 0); @@ -68,12 +71,18 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool - * remain, free the subpool the subpool remain */ - if (free) + * remain, give up any reservations mased on minimum size and + * free the subpool */ + if (free) { + if (spool->min_hpages != -1) + hugetlb_acct_memory(spool->hstate, + -spool->min_hpages); kfree(spool); + } } -struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) +struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, + long min_hpages) { struct hugepage_subpool *spool; @@ -83,7 +92,15 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) spin_lock_init(&spool->lock); spool->count = 1; - spool->max_hpages = nr_blocks; + spool->max_hpages = max_hpages; + spool->hstate = h; + spool->min_hpages = min_hpages; + + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + kfree(spool); + return NULL; + } + spool->rsv_hpages = min_hpages; return spool; } -- cgit v1.3-6-gb490 From e244c9e66f6197f55f6fbb2d5e70714e262cc595 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 15 Apr 2015 16:14:14 -0700 Subject: mm, mempool: disallow mempools based on slab caches with constructors All occurrences of mempools based on slab caches with object constructors have been removed from the tree, so disallow creating them. We can only dereference mem->ctor in mm/mempool.c without including mm/slab.h in include/linux/mempool.h. So simply note the restriction, just like the comment restricting usage of __GFP_ZERO, and warn on kernels with CONFIG_DEBUG_VM() if such a mempool is allocated from. We don't want to incur this check on every element allocation, so use VM_BUG_ON(). Signed-off-by: David Rientjes Cc: Dave Kleikamp Cc: Christoph Hellwig Cc: Sebastian Ott Cc: Mikulas Patocka Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempool.h | 3 ++- mm/mempool.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index b19b3023c880..69b6951e8fd2 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -36,7 +36,8 @@ extern void mempool_free(void *element, mempool_t *pool); /* * A mempool_alloc_t and mempool_free_t that get the memory from - * a slab that is passed in through pool_data. + * a slab cache that is passed in through pool_data. + * Note: the slab cache may not have a ctor function. */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); void mempool_free_slab(void *element, void *pool_data); diff --git a/mm/mempool.c b/mm/mempool.c index 949970db2874..b60fb85526ed 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -15,6 +15,7 @@ #include #include #include +#include "slab.h" static void add_element(mempool_t *pool, void *element) { @@ -334,6 +335,7 @@ EXPORT_SYMBOL(mempool_free); void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; + VM_BUG_ON(mem->ctor); return kmem_cache_alloc(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); -- cgit v1.3-6-gb490 From 7e1f049efb86bd86c06b80eeac0ef80cdeb8c0e7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 15 Apr 2015 16:14:41 -0700 Subject: mm: hugetlb: cleanup using paeg_huge_active() Now we have an easy access to hugepages' activeness, so existing helpers to get the information can be cleaned up. [akpm@linux-foundation.org: s/PageHugeActive/page_huge_active/] Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Reviewed-by: Michal Hocko Cc: Mel Gorman Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 -- include/linux/page-flags.h | 7 +++++++ mm/hugetlb.c | 42 +++++------------------------------------- mm/memory_hotplug.c | 2 +- 4 files changed, 13 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5713c49a4a5c..205026175c42 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -84,7 +84,6 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); -bool is_hugepage_active(struct page *page); void free_huge_page(struct page *page); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -152,7 +151,6 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list) return false; } #define putback_active_hugepage(p) do {} while (0) -#define is_hugepage_active(x) false static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 84d10b65cec6..f34e040b34e9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -470,11 +470,18 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); int PageHeadHuge(struct page *page); +bool page_huge_active(struct page *page); #else TESTPAGEFLAG_FALSE(Huge) TESTPAGEFLAG_FALSE(HeadHuge) + +static inline bool page_huge_active(struct page *page) +{ + return 0; +} #endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 05407831016b..271e4432734c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3896,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, #ifdef CONFIG_MEMORY_FAILURE -/* Should be called in hugetlb_lock */ -static int is_hugepage_on_freelist(struct page *hpage) -{ - struct page *page; - struct page *tmp; - struct hstate *h = page_hstate(hpage); - int nid = page_to_nid(hpage); - - list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) - if (page == hpage) - return 1; - return 0; -} - /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. @@ -3921,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) int ret = -EBUSY; spin_lock(&hugetlb_lock); - if (is_hugepage_on_freelist(hpage)) { + /* + * Just checking !page_huge_active is not enough, because that could be + * an isolated/hwpoisoned hugepage (which have >0 refcount). + */ + if (!page_huge_active(hpage) && !page_count(hpage)) { /* * Hwpoisoned hugepage isn't linked to activelist or freelist, * but dangling hpage->lru can trigger list-debug warnings @@ -3965,25 +3955,3 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } - -bool is_hugepage_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHuge(page), page); - /* - * This function can be called for a tail page because the caller, - * scan_movable_pages, scans through a given pfn-range which typically - * covers one memory block. In systems using gigantic hugepage (1GB - * for x86_64,) a hugepage is larger than a memory block, and we don't - * support migrating such large hugepages for now, so return false - * when called for tail pages. - */ - if (PageTail(page)) - return false; - /* - * Refcount of a hwpoisoned hugepages is 1, but they are not active, - * so we should return false for them. - */ - if (unlikely(PageHWPoison(page))) - return false; - return page_count(page) > 0; -} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e2e8014fb755..457bde530cbe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1373,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) if (PageLRU(page)) return pfn; if (PageHuge(page)) { - if (is_hugepage_active(page)) + if (page_huge_active(page)) return pfn; else pfn = round_up(pfn + 1, -- cgit v1.3-6-gb490 From cdd7875e0c4db5c41e28b645d3bf7d41bd2cbb45 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 15 Apr 2015 16:14:47 -0700 Subject: include/linux/mm.h: simplify flag check Flip the flag test so that it is the simplest. No functional change, just a small readability improvement: No code changed: # arch/x86/kernel/sys_x86_64.o: text data bss dec hex filename 1551 24 0 1575 627 sys_x86_64.o.before 1551 24 0 1575 627 sys_x86_64.o.after md5: 70708d1b1ad35cc891118a69dc1a63f9 sys_x86_64.o.before.asm 70708d1b1ad35cc891118a69dc1a63f9 sys_x86_64.o.after.asm Signed-off-by: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 515ec5045b60..68c21b2374c4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1894,10 +1894,10 @@ extern unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); static inline unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info) { - if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN)) - return unmapped_area(info); - else + if (info->flags & VM_UNMAPPED_AREA_TOPDOWN) return unmapped_area_topdown(info); + else + return unmapped_area(info); } /* truncate.c */ -- cgit v1.3-6-gb490 From e39155ea11eac6da056b04669d7c9fc612e2065a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Apr 2015 16:14:53 -0700 Subject: mm: uninline and cleanup page-mapping related helpers Most-used page->mapping helper -- page_mapping() -- has already uninlined. Let's uninline also page_rmapping() and page_anon_vma(). It saves us depending on configuration around 400 bytes in text: text data bss dec hex filename 660318 99254 410000 1169572 11d8a4 mm/built-in.o-before 659854 99254 410000 1169108 11d6d4 mm/built-in.o I also tried to make code a bit more clean. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Konstantin Khlebnikov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++------ include/linux/rmap.h | 8 -------- mm/util.c | 41 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 38 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 68c21b2374c4..0e7bb2194da5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -950,14 +950,10 @@ void page_address_init(void); #define page_address_init() do { } while(0) #endif +extern void *page_rmapping(struct page *page); +extern struct anon_vma *page_anon_vma(struct page *page); extern struct address_space *page_mapping(struct page *page); -/* Neutral page->mapping pointer to address_space or anon_vma or other */ -static inline void *page_rmapping(struct page *page) -{ - return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); -} - extern struct address_space *__page_file_mapping(struct page *); static inline diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c4c559a45dc8..c89c53a113a8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -105,14 +105,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma) __put_anon_vma(anon_vma); } -static inline struct anon_vma *page_anon_vma(struct page *page) -{ - if (((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != - PAGE_MAPPING_ANON) - return NULL; - return page_rmapping(page); -} - static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; diff --git a/mm/util.c b/mm/util.c index 3981ae9d1b15..68ff8a5361e7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -325,9 +325,37 @@ void kvfree(const void *addr) } EXPORT_SYMBOL(kvfree); +static inline void *__page_rmapping(struct page *page) +{ + unsigned long mapping; + + mapping = (unsigned long)page->mapping; + mapping &= ~PAGE_MAPPING_FLAGS; + + return (void *)mapping; +} + +/* Neutral page->mapping pointer to address_space or anon_vma or other */ +void *page_rmapping(struct page *page) +{ + page = compound_head(page); + return __page_rmapping(page); +} + +struct anon_vma *page_anon_vma(struct page *page) +{ + unsigned long mapping; + + page = compound_head(page); + mapping = (unsigned long)page->mapping; + if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + return NULL; + return __page_rmapping(page); +} + struct address_space *page_mapping(struct page *page) { - struct address_space *mapping = page->mapping; + unsigned long mapping; /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page) swp_entry_t entry; entry.val = page_private(page); - mapping = swap_address_space(entry); - } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) - mapping = NULL; - return mapping; + return swap_address_space(entry); + } + + mapping = (unsigned long)page->mapping; + if (mapping & PAGE_MAPPING_FLAGS) + return NULL; + return page->mapping; } int overcommit_ratio_handler(struct ctl_table *table, int write, -- cgit v1.3-6-gb490 From 923936157b158f36bd6a3d86496dce82b1a957de Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Apr 2015 16:15:05 -0700 Subject: mm/mempool.c: kasan: poison mempool elements Mempools keep allocated objects in reserved for situations when ordinary allocation may not be possible to satisfy. These objects shouldn't be accessed before they leave the pool. This patch poison elements when get into the pool and unpoison when they leave it. This will let KASan to detect use-after-free of mempool's elements. Signed-off-by: Andrey Ryabinin Tested-by: David Rientjes Cc: Catalin Marinas Cc: Dmitry Chernenkov Cc: Dmitry Vyukov Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 2 ++ mm/kasan/kasan.c | 13 +++++++++++++ mm/mempool.c | 23 +++++++++++++++++++++++ 3 files changed, 38 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5bb074431eb0..5486d777b706 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -44,6 +44,7 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object); void kasan_kmalloc_large(const void *ptr, size_t size); void kasan_kfree_large(const void *ptr); +void kasan_kfree(void *ptr); void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size); void kasan_krealloc(const void *object, size_t new_size); @@ -71,6 +72,7 @@ static inline void kasan_poison_object_data(struct kmem_cache *cache, static inline void kasan_kmalloc_large(void *ptr, size_t size) {} static inline void kasan_kfree_large(const void *ptr) {} +static inline void kasan_kfree(void *ptr) {} static inline void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size) {} static inline void kasan_krealloc(const void *object, size_t new_size) {} diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 936d81661c47..6c513a63ea84 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size) kasan_kmalloc(page->slab_cache, object, size); } +void kasan_kfree(void *ptr) +{ + struct page *page; + + page = virt_to_head_page(ptr); + + if (unlikely(!PageSlab(page))) + kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), + KASAN_FREE_PAGE); + else + kasan_slab_free(page->slab_cache, ptr); +} + void kasan_kfree_large(const void *ptr) { struct page *page = virt_to_page(ptr); diff --git a/mm/mempool.c b/mm/mempool.c index 2884d5bad77e..2cc08de8b1db 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -101,10 +102,31 @@ static inline void poison_element(mempool_t *pool, void *element) } #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ +static void kasan_poison_element(mempool_t *pool, void *element) +{ + if (pool->alloc == mempool_alloc_slab) + kasan_slab_free(pool->pool_data, element); + if (pool->alloc == mempool_kmalloc) + kasan_kfree(element); + if (pool->alloc == mempool_alloc_pages) + kasan_free_pages(element, (unsigned long)pool->pool_data); +} + +static void kasan_unpoison_element(mempool_t *pool, void *element) +{ + if (pool->alloc == mempool_alloc_slab) + kasan_slab_alloc(pool->pool_data, element); + if (pool->alloc == mempool_kmalloc) + kasan_krealloc(element, (size_t)pool->pool_data); + if (pool->alloc == mempool_alloc_pages) + kasan_alloc_pages(element, (unsigned long)pool->pool_data); +} + static void add_element(mempool_t *pool, void *element) { BUG_ON(pool->curr_nr >= pool->min_nr); poison_element(pool, element); + kasan_poison_element(pool, element); pool->elements[pool->curr_nr++] = element; } @@ -114,6 +136,7 @@ static void *remove_element(mempool_t *pool) BUG_ON(pool->curr_nr < 0); check_element(pool, element); + kasan_unpoison_element(pool, element); return element; } -- cgit v1.3-6-gb490 From dd9061846a3ba01b0fa45423aaa087e4a69187fa Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 15 Apr 2015 16:15:11 -0700 Subject: mm: new pfn_mkwrite same as page_mkwrite for VM_PFNMAP This will allow FS that uses VM_PFNMAP | VM_MIXEDMAP (no page structs) to get notified when access is a write to a read-only PFN. This can happen if we mmap() a file then first mmap-read from it to page-in a read-only PFN, than we mmap-write to the same page. We need this functionality to fix a DAX bug, where in the scenario above we fail to set ctime/mtime though we modified the file. An xfstest is attached to this patchset that shows the failure and the fix. (A DAX patch will follow) This functionality is extra important for us, because upon dirtying of a pmem page we also want to RDMA the page to a remote cluster node. We define a new pfn_mkwrite and do not reuse page_mkwrite because 1 - The name ;-) 2 - But mainly because it would take a very long and tedious audit of all page_mkwrite functions of VM_MIXEDMAP/VM_PFNMAP users. To make sure they do not now CRASH. For example current DAX code (which this is for) would crash. If we would want to reuse page_mkwrite, We will need to first patch all users, so to not-crash-on-no-page. Then enable this patch. But even if I did that I would not sleep so well at night. Adding a new vector is the safest thing to do, and is not that expensive. an extra pointer at a static function vector per driver. Also the new vector is better for performance, because else we Will call all current Kernel vectors, so to: check-ha-no-page-do-nothing and return. No need to call it from do_shared_fault because do_wp_page is called to change pte permissions anyway. Signed-off-by: Yigal Korman Signed-off-by: Boaz Harrosh Acked-by: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Jan Kara Cc: Hugh Dickins Cc: Mel Gorman Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 8 ++++++++ include/linux/mm.h | 3 +++ mm/memory.c | 43 +++++++++++++++++++++++++++++++++++---- 3 files changed, 50 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f91926f2f482..8bb8a7ee0f99 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -525,6 +525,7 @@ prototypes: void (*close)(struct vm_area_struct*); int (*fault)(struct vm_area_struct*, struct vm_fault *); int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); + int (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: @@ -534,6 +535,7 @@ close: yes fault: yes can return with page locked map_pages: yes page_mkwrite: yes can return with page locked +pfn_mkwrite: yes access: yes ->fault() is called when a previously not present pte is about @@ -560,6 +562,12 @@ the page has been truncated, the filesystem should not look up a new page like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which will cause the VM to retry the fault. + ->pfn_mkwrite() is the same as page_mkwrite but when the pte is +VM_PFNMAP or VM_MIXEDMAP with a page-less entry. Expected return is +VM_FAULT_NOPAGE. Or one of the VM_FAULT_ERROR types. The default behavior +after this call is to make the pte read-write, unless pfn_mkwrite returns +an error. + ->access() is called when get_user_pages() fails in access_process_vm(), typically used to debug a process through /proc/pid/mem or ptrace. This function is needed only for diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e7bb2194da5..8b086070c3a5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -251,6 +251,9 @@ struct vm_operations_struct { * writable, if an error is returned it will cause a SIGBUS */ int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ + int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf); + /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware */ diff --git a/mm/memory.c b/mm/memory.c index f9628e568c58..22e037e3364e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2180,6 +2180,42 @@ oom: return VM_FAULT_OOM; } +/* + * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED + * mapping + */ +static int wp_pfn_shared(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, + pmd_t *pmd) +{ + if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { + struct vm_fault vmf = { + .page = NULL, + .pgoff = linear_page_index(vma, address), + .virtual_address = (void __user *)(address & PAGE_MASK), + .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, + }; + int ret; + + pte_unmap_unlock(page_table, ptl); + ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); + if (ret & VM_FAULT_ERROR) + return ret; + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* + * We might have raced with another page fault while we + * released the pte_offset_map_lock. + */ + if (!pte_same(*page_table, orig_pte)) { + pte_unmap_unlock(page_table, ptl); + return 0; + } + } + return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, + NULL, 0, 0); +} + static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, @@ -2258,13 +2294,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. - * Just mark the pages writable as we can't do any dirty - * accounting on raw pfn maps. + * Just mark the pages writable and/or call ops->pfn_mkwrite. */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, 0, 0); + return wp_pfn_shared(mm, vma, address, page_table, ptl, + orig_pte, pmd); pte_unmap_unlock(page_table, ptl); return wp_page_copy(mm, vma, address, page_table, pmd, -- cgit v1.3-6-gb490 From 0e3b210ce1722168227cb3bc7746256d0c0afece Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 15 Apr 2015 16:15:14 -0700 Subject: dax: use pfn_mkwrite to update c/mtime + freeze protection From: Yigal Korman [v1] Without this patch, c/mtime is not updated correctly when mmap'ed page is first read from and then written to. A new xfstest is submitted for testing this (generic/080) [v2] Jan Kara has pointed out that if we add the sb_start/end_pagefault pair in the new pfn_mkwrite we are then fixing another bug where: A user could start writing to the page while filesystem is frozen. Signed-off-by: Yigal Korman Signed-off-by: Boaz Harrosh Reviewed-by: Jan Kara Cc: Matthew Wilcox Cc: Dave Chinner Cc: Hugh Dickins Cc: Mel Gorman Cc: Kirill A. Shutemov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 17 +++++++++++++++++ fs/ext2/file.c | 1 + fs/ext4/file.c | 1 + include/linux/fs.h | 1 + 4 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/fs/dax.c b/fs/dax.c index ed1619ec6537..d0bd1f4f81b3 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -463,6 +463,23 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL_GPL(dax_fault); +/** + * dax_pfn_mkwrite - handle first write to DAX page + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * + */ +int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + sb_end_pagefault(sb); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); + /** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated diff --git a/fs/ext2/file.c b/fs/ext2/file.c index e31701713516..866a3ce3f864 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -39,6 +39,7 @@ static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, .page_mkwrite = ext2_dax_mkwrite, + .pfn_mkwrite = dax_pfn_mkwrite, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 598abbbe6786..aa78c70553f4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -206,6 +206,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .page_mkwrite = ext4_dax_mkwrite, + .pfn_mkwrite = dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops diff --git a/include/linux/fs.h b/include/linux/fs.h index 60733bdc74b4..0f696328f218 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2620,6 +2620,7 @@ int dax_clear_blocks(struct inode *, sector_t block, long size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) #ifdef CONFIG_BLOCK -- cgit v1.3-6-gb490 From 312fcae227037619dc858c9ccd362c7b847730a2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Apr 2015 16:15:30 -0700 Subject: zsmalloc: support compaction This patch provides core functions for migration of zsmalloc. Migraion policy is simple as follows. for each size class { while { src_page = get zs_page from ZS_ALMOST_EMPTY if (!src_page) break; dst_page = get zs_page from ZS_ALMOST_FULL if (!dst_page) dst_page = get zs_page from ZS_ALMOST_EMPTY if (!dst_page) break; migrate(from src_page, to dst_page); } } For migration, we need to identify which objects in zspage are allocated to migrate them out. We could know it by iterating of freed objects in a zspage because first_page of zspage keeps free objects singly-linked list but it's not efficient. Instead, this patch adds a tag(ie, OBJ_ALLOCATED_TAG) in header of each object(ie, handle) so we could check whether the object is allocated easily. This patch adds another status bit in handle to synchronize between user access through zs_map_object and migration. During migration, we cannot move objects user are using due to data coherency between old object and new object. [akpm@linux-foundation.org: zsmalloc.c needs sched.h for cond_resched()] Signed-off-by: Minchan Kim Cc: Juneho Choi Cc: Gunho Lee Cc: Luigi Semenzato Cc: Dan Streetman Cc: Seth Jennings Cc: Nitin Gupta Cc: Jerome Marchand Cc: Sergey Senozhatsky Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zsmalloc.h | 1 + mm/zsmalloc.c | 378 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 360 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 3283c6a55425..1338190b5478 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -47,5 +47,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, void zs_unmap_object(struct zs_pool *pool, unsigned long handle); unsigned long zs_get_total_pages(struct zs_pool *pool); +unsigned long zs_compact(struct zs_pool *pool); #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 55b171016f4f..c4ae608dc725 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -78,6 +78,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,26 @@ #endif #endif #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) -#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) + +/* + * Memory for allocating for handle keeps object position by + * encoding and the encoded value has a room + * in least bit(ie, look at obj_to_location). + * We use the bit to synchronize between object access by + * user and migration. + */ +#define HANDLE_PIN_BIT 0 + +/* + * Head in allocated object should have OBJ_ALLOCATED_TAG + * to identify the object was allocated or not. + * It's okay to add the status bit in the least bit because + * header keeps handle which is 4byte-aligned address so we + * have room for two bit at least. + */ +#define OBJ_ALLOCATED_TAG 1 +#define OBJ_TAG_BITS 1 +#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define MAX(a, b) ((a) >= (b) ? (a) : (b)) @@ -610,35 +630,35 @@ static struct page *get_next_page(struct page *page) /* * Encode as a single handle value. - * On hardware platforms with physical memory starting at 0x0 the pfn - * could be 0 so we ensure that the handle will never be 0 by adjusting the - * encoded obj_idx value before encoding. + * We use the least bit of handle for tagging. */ -static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) +static void *location_to_obj(struct page *page, unsigned long obj_idx) { - unsigned long handle; + unsigned long obj; if (!page) { BUG_ON(obj_idx); return NULL; } - handle = page_to_pfn(page) << OBJ_INDEX_BITS; - handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); + obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj |= ((obj_idx) & OBJ_INDEX_MASK); + obj <<= OBJ_TAG_BITS; - return (void *)handle; + return (void *)obj; } /* * Decode pair from the given object handle. We adjust the * decoded obj_idx back to its original value since it was adjusted in - * obj_location_to_handle(). + * location_to_obj(). */ -static void obj_to_location(unsigned long handle, struct page **page, +static void obj_to_location(unsigned long obj, struct page **page, unsigned long *obj_idx) { - *page = pfn_to_page(handle >> OBJ_INDEX_BITS); - *obj_idx = (handle & OBJ_INDEX_MASK) - 1; + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *obj_idx = (obj & OBJ_INDEX_MASK); } static unsigned long handle_to_obj(unsigned long handle) @@ -646,6 +666,11 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } +unsigned long obj_to_head(void *obj) +{ + return *(unsigned long *)obj; +} + static unsigned long obj_idx_to_offset(struct page *page, unsigned long obj_idx, int class_size) { @@ -657,6 +682,25 @@ static unsigned long obj_idx_to_offset(struct page *page, return off + obj_idx * class_size; } +static inline int trypin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); +} + +static void pin_tag(unsigned long handle) +{ + while (!trypin_tag(handle)); +} + +static void unpin_tag(unsigned long handle) +{ + unsigned long *ptr = (unsigned long *)handle; + + clear_bit_unlock(HANDLE_PIN_BIT, ptr); +} + static void reset_page(struct page *page) { clear_bit(PG_private, &page->flags); @@ -718,7 +762,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { - link->next = obj_location_to_handle(page, i++); + link->next = location_to_obj(page, i++); link += class->size / sizeof(*link); } @@ -728,7 +772,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) * page (if present) */ next_page = get_next_page(page); - link->next = obj_location_to_handle(next_page, 0); + link->next = location_to_obj(next_page, 0); kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; @@ -782,7 +826,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) init_zspage(first_page, class); - first_page->freelist = obj_location_to_handle(first_page, 0); + first_page->freelist = location_to_obj(first_page, 0); /* Maximum number of objects we can store in this zspage */ first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; @@ -1017,6 +1061,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } +static bool zspage_full(struct page *page) +{ + BUG_ON(!is_first_page(page)); + + return page->inuse == page->objects; +} + #ifdef CONFIG_ZSMALLOC_STAT static inline void zs_stat_inc(struct size_class *class, @@ -1219,6 +1270,9 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); + /* From now on, migration cannot move the object */ + pin_tag(handle); + obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); @@ -1276,6 +1330,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } put_cpu_var(zs_map_area); + unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1289,6 +1344,7 @@ static unsigned long obj_malloc(struct page *first_page, unsigned long m_objidx, m_offset; void *vaddr; + handle |= OBJ_ALLOCATED_TAG; obj = (unsigned long)first_page->freelist; obj_to_location(obj, &m_page, &m_objidx); m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); @@ -1374,6 +1430,7 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, BUG_ON(!obj); + obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); @@ -1402,8 +1459,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) if (unlikely(!handle)) return; + pin_tag(handle); obj = handle_to_obj(handle); - free_handle(pool, handle); obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); @@ -1413,18 +1470,301 @@ void zs_free(struct zs_pool *pool, unsigned long handle) spin_lock(&class->lock); obj_free(pool, class, obj); fullness = fix_fullness_group(class, first_page); - if (fullness == ZS_EMPTY) + if (fullness == ZS_EMPTY) { zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( class->size, class->pages_per_zspage)); + atomic_long_sub(class->pages_per_zspage, + &pool->pages_allocated); + free_zspage(first_page); + } spin_unlock(&class->lock); + unpin_tag(handle); + + free_handle(pool, handle); +} +EXPORT_SYMBOL_GPL(zs_free); + +static void zs_object_copy(unsigned long src, unsigned long dst, + struct size_class *class) +{ + struct page *s_page, *d_page; + unsigned long s_objidx, d_objidx; + unsigned long s_off, d_off; + void *s_addr, *d_addr; + int s_size, d_size, size; + int written = 0; + + s_size = d_size = class->size; + + obj_to_location(src, &s_page, &s_objidx); + obj_to_location(dst, &d_page, &d_objidx); + + s_off = obj_idx_to_offset(s_page, s_objidx, class->size); + d_off = obj_idx_to_offset(d_page, d_objidx, class->size); + + if (s_off + class->size > PAGE_SIZE) + s_size = PAGE_SIZE - s_off; + + if (d_off + class->size > PAGE_SIZE) + d_size = PAGE_SIZE - d_off; + + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + + while (1) { + size = min(s_size, d_size); + memcpy(d_addr + d_off, s_addr + s_off, size); + written += size; + + if (written == class->size) + break; + + if (s_off + size >= PAGE_SIZE) { + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); + s_page = get_next_page(s_page); + BUG_ON(!s_page); + s_addr = kmap_atomic(s_page); + d_addr = kmap_atomic(d_page); + s_size = class->size - written; + s_off = 0; + } else { + s_off += size; + s_size -= size; + } + + if (d_off + size >= PAGE_SIZE) { + kunmap_atomic(d_addr); + d_page = get_next_page(d_page); + BUG_ON(!d_page); + d_addr = kmap_atomic(d_page); + d_size = class->size - written; + d_off = 0; + } else { + d_off += size; + d_size -= size; + } + } + + kunmap_atomic(d_addr); + kunmap_atomic(s_addr); +} + +/* + * Find alloced object in zspage from index object and + * return handle. + */ +static unsigned long find_alloced_obj(struct page *page, int index, + struct size_class *class) +{ + unsigned long head; + int offset = 0; + unsigned long handle = 0; + void *addr = kmap_atomic(page); + + if (!is_first_page(page)) + offset = page->index; + offset += class->size * index; + + while (offset < PAGE_SIZE) { + head = obj_to_head(addr + offset); + if (head & OBJ_ALLOCATED_TAG) { + handle = head & ~OBJ_ALLOCATED_TAG; + if (trypin_tag(handle)) + break; + handle = 0; + } + + offset += class->size; + index++; + } + + kunmap_atomic(addr); + return handle; +} + +struct zs_compact_control { + /* Source page for migration which could be a subpage of zspage. */ + struct page *s_page; + /* Destination page for migration which should be a first page + * of zspage. */ + struct page *d_page; + /* Starting object index within @s_page which used for live object + * in the subpage. */ + int index; + /* how many of objects are migrated */ + int nr_migrated; +}; + +static int migrate_zspage(struct zs_pool *pool, struct size_class *class, + struct zs_compact_control *cc) +{ + unsigned long used_obj, free_obj; + unsigned long handle; + struct page *s_page = cc->s_page; + struct page *d_page = cc->d_page; + unsigned long index = cc->index; + int nr_migrated = 0; + int ret = 0; + + while (1) { + handle = find_alloced_obj(s_page, index, class); + if (!handle) { + s_page = get_next_page(s_page); + if (!s_page) + break; + index = 0; + continue; + } + + /* Stop if there is no more space */ + if (zspage_full(d_page)) { + unpin_tag(handle); + ret = -ENOMEM; + break; + } + used_obj = handle_to_obj(handle); + free_obj = obj_malloc(d_page, class, handle); + zs_object_copy(used_obj, free_obj, class); + index++; + record_obj(handle, free_obj); + unpin_tag(handle); + obj_free(pool, class, used_obj); + nr_migrated++; + } + + /* Remember last position in this iteration */ + cc->s_page = s_page; + cc->index = index; + cc->nr_migrated = nr_migrated; + + return ret; +} + +static struct page *alloc_target_page(struct size_class *class) +{ + int i; + struct page *page; + + for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { + page = class->fullness_list[i]; + if (page) { + remove_zspage(page, class, i); + break; + } + } + + return page; +} + +static void putback_zspage(struct zs_pool *pool, struct size_class *class, + struct page *first_page) +{ + int class_idx; + enum fullness_group fullness; + + BUG_ON(!is_first_page(first_page)); + + get_zspage_mapping(first_page, &class_idx, &fullness); + insert_zspage(first_page, class, fullness); + fullness = fix_fullness_group(class, first_page); if (fullness == ZS_EMPTY) { + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( + class->size, class->pages_per_zspage)); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); + free_zspage(first_page); } } -EXPORT_SYMBOL_GPL(zs_free); + +static struct page *isolate_source_page(struct size_class *class) +{ + struct page *page; + + page = class->fullness_list[ZS_ALMOST_EMPTY]; + if (page) + remove_zspage(page, class, ZS_ALMOST_EMPTY); + + return page; +} + +static unsigned long __zs_compact(struct zs_pool *pool, + struct size_class *class) +{ + int nr_to_migrate; + struct zs_compact_control cc; + struct page *src_page; + struct page *dst_page = NULL; + unsigned long nr_total_migrated = 0; + + cond_resched(); + + spin_lock(&class->lock); + while ((src_page = isolate_source_page(class))) { + + BUG_ON(!is_first_page(src_page)); + + /* The goal is to migrate all live objects in source page */ + nr_to_migrate = src_page->inuse; + cc.index = 0; + cc.s_page = src_page; + + while ((dst_page = alloc_target_page(class))) { + cc.d_page = dst_page; + /* + * If there is no more space in dst_page, try to + * allocate another zspage. + */ + if (!migrate_zspage(pool, class, &cc)) + break; + + putback_zspage(pool, class, dst_page); + nr_total_migrated += cc.nr_migrated; + nr_to_migrate -= cc.nr_migrated; + } + + /* Stop if we couldn't find slot */ + if (dst_page == NULL) + break; + + putback_zspage(pool, class, dst_page); + putback_zspage(pool, class, src_page); + spin_unlock(&class->lock); + nr_total_migrated += cc.nr_migrated; + cond_resched(); + spin_lock(&class->lock); + } + + if (src_page) + putback_zspage(pool, class, src_page); + + spin_unlock(&class->lock); + + return nr_total_migrated; +} + +unsigned long zs_compact(struct zs_pool *pool) +{ + int i; + unsigned long nr_migrated = 0; + struct size_class *class; + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + nr_migrated += __zs_compact(pool, class); + } + + synchronize_rcu(); + + return nr_migrated; +} +EXPORT_SYMBOL_GPL(zs_compact); /** * zs_create_pool - Creates an allocation pool to work from. -- cgit v1.3-6-gb490 From 23f40a94d860449f39f00c3350bf850d15983e63 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:16:33 -0700 Subject: include/linux: remove empty conditionals Commit 607ca46e97a1 ("UAPI: (Scripted) Disintegrate include/linux") left behind some empty conditional blocks. Since they are useless and may cause a reader to wonder whether something is missing, remove them. Signed-off-by: Rasmus Villemoes Cc: Geert Uytterhoeven Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/a.out.h | 67 --------------------------------------------------- include/linux/types.h | 6 ----- 2 files changed, 73 deletions(-) (limited to 'include/linux') diff --git a/include/linux/a.out.h b/include/linux/a.out.h index 220f14338895..ee884168989f 100644 --- a/include/linux/a.out.h +++ b/include/linux/a.out.h @@ -4,44 +4,6 @@ #include #ifndef __ASSEMBLY__ -#if defined (M_OLDSUN2) -#else -#endif -#if defined (M_68010) -#else -#endif -#if defined (M_68020) -#else -#endif -#if defined (M_SPARC) -#else -#endif -#if !defined (N_MAGIC) -#endif -#if !defined (N_BADMAG) -#endif -#if !defined (N_TXTOFF) -#endif -#if !defined (N_DATOFF) -#endif -#if !defined (N_TRELOFF) -#endif -#if !defined (N_DRELOFF) -#endif -#if !defined (N_SYMOFF) -#endif -#if !defined (N_STROFF) -#endif -#if !defined (N_TXTADDR) -#endif -#if defined(vax) || defined(hp300) || defined(pyr) -#endif -#ifdef sony -#endif /* Sony. */ -#ifdef is68k -#endif -#if defined(m68k) && defined(PORTAR) -#endif #ifdef linux #include #if defined(__i386__) || defined(__mc68000__) @@ -51,34 +13,5 @@ #endif #endif #endif -#ifndef N_DATADDR -#endif -#if !defined (N_BSSADDR) -#endif -#if !defined (N_NLIST_DECLARED) -#endif /* no N_NLIST_DECLARED. */ -#if !defined (N_UNDF) -#endif -#if !defined (N_ABS) -#endif -#if !defined (N_TEXT) -#endif -#if !defined (N_DATA) -#endif -#if !defined (N_BSS) -#endif -#if !defined (N_FN) -#endif -#if !defined (N_EXT) -#endif -#if !defined (N_TYPE) -#endif -#if !defined (N_STAB) -#endif -#if !defined (N_RELOCATION_INFO_DECLARED) -#ifdef NS32K -#else -#endif -#endif /* no N_RELOCATION_INFO_DECLARED. */ #endif /*__ASSEMBLY__ */ #endif /* __A_OUT_GNU_H__ */ diff --git a/include/linux/types.h b/include/linux/types.h index 6747247e3f9f..59698be03490 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -146,12 +146,6 @@ typedef u64 dma_addr_t; typedef u32 dma_addr_t; #endif /* dma_addr_t */ -#ifdef __CHECKER__ -#else -#endif -#ifdef __CHECK_ENDIAN__ -#else -#endif typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; typedef unsigned __bitwise__ oom_flags_t; -- cgit v1.3-6-gb490 From 2813893f8b197a14f1e1ddb04d99bce46817c84a Mon Sep 17 00:00:00 2001 From: Iulia Manda Date: Wed, 15 Apr 2015 16:16:41 -0700 Subject: kernel: conditionally support non-root users, groups and capabilities There are a lot of embedded systems that run most or all of their functionality in init, running as root:root. For these systems, supporting multiple users is not necessary. This patch adds a new symbol, CONFIG_MULTIUSER, that makes support for non-root users, non-root groups, and capabilities optional. It is enabled under CONFIG_EXPERT menu. When this symbol is not defined, UID and GID are zero in any possible case and processes always have all capabilities. The following syscalls are compiled out: setuid, setregid, setgid, setreuid, setresuid, getresuid, setresgid, getresgid, setgroups, getgroups, setfsuid, setfsgid, capget, capset. Also, groups.c is compiled out completely. In kernel/capability.c, capable function was moved in order to avoid adding two ifdef blocks. This change saves about 25 KB on a defconfig build. The most minimal kernels have total text sizes in the high hundreds of kB rather than low MB. (The 25k goes down a bit with allnoconfig, but not that much. The kernel was booted in Qemu. All the common functionalities work. Adding users/groups is not possible, failing with -ENOSYS. Bloat-o-meter output: add/remove: 7/87 grow/shrink: 19/397 up/down: 1675/-26325 (-24650) [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Iulia Manda Reviewed-by: Josh Triplett Acked-by: Geert Uytterhoeven Tested-by: Paul E. McKenney Reviewed-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 1 + drivers/staging/lustre/lustre/Kconfig | 1 + fs/nfs/Kconfig | 2 +- fs/nfsd/Kconfig | 1 + include/linux/capability.h | 29 +++++++++++++++++++++++++++++ include/linux/cred.h | 23 +++++++++++++++++++---- include/linux/uidgid.h | 12 ++++++++++++ init/Kconfig | 19 ++++++++++++++++++- kernel/Makefile | 4 +++- kernel/capability.c | 35 +++++++++++++++++++---------------- kernel/cred.c | 3 +++ kernel/groups.c | 3 --- kernel/sys.c | 2 ++ kernel/sys_ni.c | 14 ++++++++++++++ net/sunrpc/Kconfig | 2 ++ security/Kconfig | 1 + 16 files changed, 126 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a5ced5c3c1e0..de2726a487b0 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -328,6 +328,7 @@ config COMPAT select COMPAT_BINFMT_ELF if BINFMT_ELF select ARCH_WANT_OLD_COMPAT_IPC select COMPAT_OLD_SIGACTION + depends on MULTIUSER help Select this option if you want to enable your system kernel to handle system-calls from ELF binaries for 31 bit ESA. This option diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig index 6725467ef4d0..62c7bba75274 100644 --- a/drivers/staging/lustre/lustre/Kconfig +++ b/drivers/staging/lustre/lustre/Kconfig @@ -10,6 +10,7 @@ config LUSTRE_FS select CRYPTO_SHA1 select CRYPTO_SHA256 select CRYPTO_SHA512 + depends on MULTIUSER help This option enables Lustre file system client support. Choose Y here if you want to access a Lustre file system cluster. To compile diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index c7abc10279af..f31fd0dd92c6 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -1,6 +1,6 @@ config NFS_FS tristate "NFS client support" - depends on INET && FILE_LOCKING + depends on INET && FILE_LOCKING && MULTIUSER select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 683bf718aead..fc2d108f5272 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -6,6 +6,7 @@ config NFSD select SUNRPC select EXPORTFS select NFS_ACL_SUPPORT if NFSD_V2_ACL + depends on MULTIUSER help Choose Y here if you want to allow other computers to access files residing on this system using Sun's Network File System diff --git a/include/linux/capability.h b/include/linux/capability.h index aa93e5ef594c..af9f0b9e80e6 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -205,6 +205,7 @@ static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a, cap_intersect(permitted, __cap_nfsd_set)); } +#ifdef CONFIG_MULTIUSER extern bool has_capability(struct task_struct *t, int cap); extern bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap); @@ -213,6 +214,34 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); +#else +static inline bool has_capability(struct task_struct *t, int cap) +{ + return true; +} +static inline bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + return true; +} +static inline bool has_capability_noaudit(struct task_struct *t, int cap) +{ + return true; +} +static inline bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + return true; +} +static inline bool capable(int cap) +{ + return true; +} +static inline bool ns_capable(struct user_namespace *ns, int cap) +{ + return true; +} +#endif /* CONFIG_MULTIUSER */ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); diff --git a/include/linux/cred.h b/include/linux/cred.h index 2fb2ca2127ed..8b6c083e68a7 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -62,9 +62,27 @@ do { \ groups_free(group_info); \ } while (0) -extern struct group_info *groups_alloc(int); extern struct group_info init_groups; +#ifdef CONFIG_MULTIUSER +extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); + +extern int in_group_p(kgid_t); +extern int in_egroup_p(kgid_t); +#else +static inline void groups_free(struct group_info *group_info) +{ +} + +static inline int in_group_p(kgid_t grp) +{ + return 1; +} +static inline int in_egroup_p(kgid_t grp) +{ + return 1; +} +#endif extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); @@ -74,9 +92,6 @@ extern bool may_setgroups(void); #define GROUP_AT(gi, i) \ ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) -extern int in_group_p(kgid_t); -extern int in_egroup_p(kgid_t); - /* * The security context of a task * diff --git a/include/linux/uidgid.h b/include/linux/uidgid.h index 2d1f9b627f91..0ee05da38899 100644 --- a/include/linux/uidgid.h +++ b/include/linux/uidgid.h @@ -29,6 +29,7 @@ typedef struct { #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } +#ifdef CONFIG_MULTIUSER static inline uid_t __kuid_val(kuid_t uid) { return uid.val; @@ -38,6 +39,17 @@ static inline gid_t __kgid_val(kgid_t gid) { return gid.val; } +#else +static inline uid_t __kuid_val(kuid_t uid) +{ + return 0; +} + +static inline gid_t __kgid_val(kgid_t gid) +{ + return 0; +} +#endif #define GLOBAL_ROOT_UID KUIDT_INIT(0) #define GLOBAL_ROOT_GID KGIDT_INIT(0) diff --git a/init/Kconfig b/init/Kconfig index a905b7301e10..3b9df1aa35db 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -394,6 +394,7 @@ endchoice config BSD_PROCESS_ACCT bool "BSD Process Accounting" + depends on MULTIUSER help If you say Y here, a user level program will be able to instruct the kernel (via a special system call) to write process accounting @@ -420,6 +421,7 @@ config BSD_PROCESS_ACCT_V3 config TASKSTATS bool "Export task/process statistics through netlink" depends on NET + depends on MULTIUSER default n help Export selected statistics for tasks/processes through the @@ -1160,6 +1162,7 @@ config CHECKPOINT_RESTORE menuconfig NAMESPACES bool "Namespaces support" if EXPERT + depends on MULTIUSER default !EXPERT help Provides the way to make tasks work with different objects using @@ -1356,11 +1359,25 @@ menuconfig EXPERT config UID16 bool "Enable 16-bit UID system calls" if EXPERT - depends on HAVE_UID16 + depends on HAVE_UID16 && MULTIUSER default y help This enables the legacy 16-bit UID syscall wrappers. +config MULTIUSER + bool "Multiple users, groups and capabilities support" if EXPERT + default y + help + This option enables support for non-root users, groups and + capabilities. + + If you say N here, all processes will run with UID 0, GID 0, and all + possible capabilities. Saying N here also compiles out support for + system calls related to UIDs, GIDs, and capabilities, such as setuid, + setgid, and capset. + + If unsure, say Y here. + config SGETMASK_SYSCALL bool "sgetmask/ssetmask syscalls support" if EXPERT def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH diff --git a/kernel/Makefile b/kernel/Makefile index 1408b3353a3c..0f8f8b0bc1bf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,9 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o groups.o smpboot.o + async.o range.o smpboot.o + +obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files diff --git a/kernel/capability.c b/kernel/capability.c index 989f5bfc57dc..45432b54d5c6 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -35,6 +35,7 @@ static int __init file_caps_disable(char *str) } __setup("no_file_caps", file_caps_disable); +#ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * @@ -386,6 +387,24 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); + +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); +#endif /* CONFIG_MULTIUSER */ + /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check @@ -411,22 +430,6 @@ bool file_ns_capable(const struct file *file, struct user_namespace *ns, } EXPORT_SYMBOL(file_ns_capable); -/** - * capable - Determine if the current task has a superior capability in effect - * @cap: The capability to be tested for - * - * Return true if the current task has the given superior capability currently - * available for use, false if not. - * - * This sets PF_SUPERPRIV on the task if the capability is available on the - * assumption that it's about to be used. - */ -bool capable(int cap) -{ - return ns_capable(&init_user_ns, cap); -} -EXPORT_SYMBOL(capable); - /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question diff --git a/kernel/cred.c b/kernel/cred.c index e0573a43c7df..ec1c07667ec1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -29,6 +29,9 @@ static struct kmem_cache *cred_jar; +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + /* * The initial credentials for the initial task */ diff --git a/kernel/groups.c b/kernel/groups.c index 664411f171b5..74d431d25251 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -9,9 +9,6 @@ #include #include -/* init to 2 - one for init_task, one to ensure it is never freed */ -struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; - struct group_info *groups_alloc(int gidsetsize) { struct group_info *group_info; diff --git a/kernel/sys.c b/kernel/sys.c index a03d9cd23ed7..3be344902316 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -325,6 +325,7 @@ out_unlock: * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned). */ +#ifdef CONFIG_MULTIUSER SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { struct user_namespace *ns = current_user_ns(); @@ -815,6 +816,7 @@ change_okay: commit_creds(new); return old_fsgid; } +#endif /* CONFIG_MULTIUSER */ /** * sys_getpid - return the thread group id of the current process diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5adcb0ae3a58..7995ef5868d8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -159,6 +159,20 @@ cond_syscall(sys_uselib); cond_syscall(sys_fadvise64); cond_syscall(sys_fadvise64_64); cond_syscall(sys_madvise); +cond_syscall(sys_setuid); +cond_syscall(sys_setregid); +cond_syscall(sys_setgid); +cond_syscall(sys_setreuid); +cond_syscall(sys_setresuid); +cond_syscall(sys_getresuid); +cond_syscall(sys_setresgid); +cond_syscall(sys_getresgid); +cond_syscall(sys_setgroups); +cond_syscall(sys_getgroups); +cond_syscall(sys_setfsuid); +cond_syscall(sys_setfsgid); +cond_syscall(sys_capget); +cond_syscall(sys_capset); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index fb78117b896c..9068e72aa73c 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -1,9 +1,11 @@ config SUNRPC tristate + depends on MULTIUSER config SUNRPC_GSS tristate select OID_REGISTRY + depends on MULTIUSER config SUNRPC_BACKCHANNEL bool diff --git a/security/Kconfig b/security/Kconfig index beb86b500adf..bf4ec46474b6 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -21,6 +21,7 @@ config SECURITY_DMESG_RESTRICT config SECURITY bool "Enable different security models" depends on SYSFS + depends on MULTIUSER help This allows you to choose different security modules to be configured into your kernel. -- cgit v1.3-6-gb490 From 96831c0a6738f88f89e7012f4df0a747514af0a0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 15 Apr 2015 16:16:44 -0700 Subject: kernel/resource.c: remove deprecated __check_region() and friends All users of __check_region(), check_region(), and check_mem_region() are gone. We got rid of the last user in v4.0-rc1. Remove them. bloat-o-meter on x86_64 shows: add/remove: 0/3 grow/shrink: 0/0 up/down: 0/-102 (-102) function old new delta __kstrtab___check_region 15 - -15 __ksymtab___check_region 16 - -16 __check_region 71 - -71 Signed-off-by: Jakub Sitnicki Cc: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 8 -------- kernel/resource.c | 32 -------------------------------- 2 files changed, 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 2c5250222278..388e3ae94f7a 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -196,10 +196,8 @@ extern struct resource * __request_region(struct resource *, /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) -#define check_mem_region(start,n) __check_region(&iomem_resource, (start), (n)) #define release_mem_region(start,n) __release_region(&iomem_resource, (start), (n)) -extern int __check_region(struct resource *, resource_size_t, resource_size_t); extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE @@ -207,12 +205,6 @@ extern int release_mem_region_adjustable(struct resource *, resource_size_t, resource_size_t); #endif -static inline int __deprecated check_region(resource_size_t s, - resource_size_t n) -{ - return __check_region(&ioport_resource, s, n); -} - /* Wrappers for managed devices */ struct device; diff --git a/kernel/resource.c b/kernel/resource.c index 19f2357dfda3..90552aab5f2d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1034,8 +1034,6 @@ resource_size_t resource_alignment(struct resource *res) * * request_region creates a new busy region. * - * check_region returns non-zero if the area is already busy. - * * release_region releases a matching busy region. */ @@ -1097,36 +1095,6 @@ struct resource * __request_region(struct resource *parent, } EXPORT_SYMBOL(__request_region); -/** - * __check_region - check if a resource region is busy or free - * @parent: parent resource descriptor - * @start: resource start address - * @n: resource region size - * - * Returns 0 if the region is free at the moment it is checked, - * returns %-EBUSY if the region is busy. - * - * NOTE: - * This function is deprecated because its use is racy. - * Even if it returns 0, a subsequent call to request_region() - * may fail because another driver etc. just allocated the region. - * Do NOT use it. It will be removed from the kernel. - */ -int __check_region(struct resource *parent, resource_size_t start, - resource_size_t n) -{ - struct resource * res; - - res = __request_region(parent, start, n, "check-region", 0); - if (!res) - return -EBUSY; - - release_resource(res); - free_resource(res); - return 0; -} -EXPORT_SYMBOL(__check_region); - /** * __release_region - release a previously reserved resource region * @parent: parent resource descriptor -- cgit v1.3-6-gb490 From 7a54f46b301cfab8a0d7365aa186545f8b98f22e Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 15 Apr 2015 16:16:53 -0700 Subject: kernel/reboot.c: add orderly_reboot for graceful reboot The kernel has orderly_poweroff which allows the kernel to initiate a graceful shutdown of userspace, by running /sbin/poweroff. This adds orderly_reboot that will cause userspace to shut itself down by calling /sbin/reboot. This will be used for shutdown initiated by a system controller on platforms that do not use ACPI. orderly_reboot() should be used when the system wants to allow userspace to gracefully shut itself down. For cases where the system may imminently catch on fire, the existing emergency_restart() provides an immediate reboot without involving userspace. Signed-off-by: Joel Stanley Cc: Fabian Frederick Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Rusty Russell Cc: Jeremy Kerr Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 3 ++- kernel/reboot.c | 53 +++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 67fc8fcdc4b0..a7ff409f386d 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -70,7 +70,8 @@ void ctrl_alt_del(void); #define POWEROFF_CMD_PATH_LEN 256 extern char poweroff_cmd[POWEROFF_CMD_PATH_LEN]; -extern int orderly_poweroff(bool force); +extern void orderly_poweroff(bool force); +extern void orderly_reboot(void); /* * Emergency restart, callable from an interrupt handler. diff --git a/kernel/reboot.c b/kernel/reboot.c index 5925f5ae8dff..d20c85d9f8c0 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -387,8 +387,9 @@ void ctrl_alt_del(void) } char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; +static const char reboot_cmd[] = "/sbin/reboot"; -static int __orderly_poweroff(bool force) +static int run_cmd(const char *cmd) { char **argv; static char *envp[] = { @@ -397,8 +398,7 @@ static int __orderly_poweroff(bool force) NULL }; int ret; - - argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + argv = argv_split(GFP_KERNEL, cmd, NULL); if (argv) { ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); argv_free(argv); @@ -406,8 +406,33 @@ static int __orderly_poweroff(bool force) ret = -ENOMEM; } + return ret; +} + +static int __orderly_reboot(void) +{ + int ret; + + ret = run_cmd(reboot_cmd); + + if (ret) { + pr_warn("Failed to start orderly reboot: forcing the issue\n"); + emergency_sync(); + kernel_restart(NULL); + } + + return ret; +} + +static int __orderly_poweroff(bool force) +{ + int ret; + + ret = run_cmd(poweroff_cmd); + if (ret && force) { pr_warn("Failed to start orderly shutdown: forcing the issue\n"); + /* * I guess this should try to kick off some daemon to sync and * poweroff asap. Or not even bother syncing if we're doing an @@ -436,15 +461,33 @@ static DECLARE_WORK(poweroff_work, poweroff_work_func); * This may be called from any context to trigger a system shutdown. * If the orderly shutdown fails, it will force an immediate shutdown. */ -int orderly_poweroff(bool force) +void orderly_poweroff(bool force) { if (force) /* do not override the pending "true" */ poweroff_force = true; schedule_work(&poweroff_work); - return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); +static void reboot_work_func(struct work_struct *work) +{ + __orderly_reboot(); +} + +static DECLARE_WORK(reboot_work, reboot_work_func); + +/** + * orderly_reboot - Trigger an orderly system reboot + * + * This may be called from any context to trigger a system reboot. + * If the orderly reboot fails, it will force an immediate reboot. + */ +void orderly_reboot(void) +{ + schedule_work(&reboot_work); +} +EXPORT_SYMBOL_GPL(orderly_reboot); + static int __init reboot_setup(char *str) { for (;;) { -- cgit v1.3-6-gb490 From 7b1460eccad0621e0c48f52bbedeb7adddc55ac9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2015 16:16:59 -0700 Subject: printk: comment pr_cont() stating it is only to continue a line KERN_CONT is nicely commented in kern_levels.h, but pr_cont() is now used more often, and it lacks the comment stating what it is used for. It can be confused as continuing the log level, but that is not its purpose. Its purpose is to continue a line that had no newline enclosed. This should be documented by pr_cont() as well. Signed-off-by: Steven Rostedt Acked-by: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index baa3f97d8ce8..9b30871c9149 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -255,6 +255,11 @@ extern asmlinkage void dump_stack(void) __cold; printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__) #define pr_info(fmt, ...) \ printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) +/* + * Like KERN_CONT, pr_cont() should only be used when continuing + * a line with no newline ('\n') enclosed. Otherwise it defaults + * back to KERN_DEFAULT. + */ #define pr_cont(fmt, ...) \ printk(KERN_CONT fmt, ##__VA_ARGS__) -- cgit v1.3-6-gb490 From 41416f2330112d29f2cfa337bfc7e672bf0c2768 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:28 -0700 Subject: lib/string_helpers.c: change semantics of string_escape_mem The current semantics of string_escape_mem are inadequate for one of its current users, vsnprintf(). If that is to honour its contract, it must know how much space would be needed for the entire escaped buffer, and string_escape_mem provides no way of obtaining that (short of allocating a large enough buffer (~4 times input string) to let it play with, and that's definitely a big no-no inside vsnprintf). So change the semantics for string_escape_mem to be more snprintf-like: Return the size of the output that would be generated if the destination buffer was big enough, but of course still only write to the part of dst it is allowed to, and (contrary to snprintf) don't do '\0'-termination. It is then up to the caller to detect whether output was truncated and to append a '\0' if desired. Also, we must output partial escape sequences, otherwise a call such as snprintf(buf, 3, "%1pE", "\123") would cause printf to write a \0 to buf[2] but leaving buf[0] and buf[1] with whatever they previously contained. This also fixes a bug in the escaped_string() helper function, which used to unconditionally pass a length of "end-buf" to string_escape_mem(); since the latter doesn't check osz for being insanely large, it would happily write to dst. For example, kasprintf(GFP_KERNEL, "something and then %pE", ...); is an easy way to trigger an oops. In test-string_helpers.c, the -ENOMEM test is replaced with testing for getting the expected return value even if the buffer is too small. We also ensure that nothing is written (by relying on a NULL pointer deref) if the output size is 0 by passing NULL - this has to work for kasprintf("%pE") to work. In net/sunrpc/cache.c, I think qword_add still has the same semantics. Someone should definitely double-check this. In fs/proc/array.c, I made the minimum possible change, but longer-term it should stop poking around in seq_file internals. [andriy.shevchenko@linux.intel.com: simplify qword_add] [andriy.shevchenko@linux.intel.com: add missed curly braces] Signed-off-by: Rasmus Villemoes Acked-by: Andy Shevchenko Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 4 ++-- include/linux/string_helpers.h | 8 +++---- lib/string_helpers.c | 49 ++++++------------------------------------ lib/test-string_helpers.c | 40 +++++++++++++++++----------------- lib/vsprintf.c | 8 +++++-- net/sunrpc/cache.c | 8 ++++--- 6 files changed, 44 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/array.c b/fs/proc/array.c index a4490c0a4644..13f047ad08e4 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -99,8 +99,8 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) buf = m->buf + m->count; /* Ignore error for now */ - string_escape_str(tcomm, &buf, m->size - m->count, - ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + buf += string_escape_str(tcomm, buf, m->size - m->count, + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); m->count = buf - m->buf; seq_putc(m, '\n'); diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index 657571817260..0991913f4953 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -47,22 +47,22 @@ static inline int string_unescape_any_inplace(char *buf) #define ESCAPE_ANY_NP (ESCAPE_ANY | ESCAPE_NP) #define ESCAPE_HEX 0x20 -int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, +int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, unsigned int flags, const char *esc); static inline int string_escape_mem_any_np(const char *src, size_t isz, - char **dst, size_t osz, const char *esc) + char *dst, size_t osz, const char *esc) { return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, esc); } -static inline int string_escape_str(const char *src, char **dst, size_t sz, +static inline int string_escape_str(const char *src, char *dst, size_t sz, unsigned int flags, const char *esc) { return string_escape_mem(src, strlen(src), dst, sz, flags, esc); } -static inline int string_escape_str_any_np(const char *src, char **dst, +static inline int string_escape_str_any_np(const char *src, char *dst, size_t sz, const char *esc) { return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, esc); diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 9c48ddad0f0d..1826c7407258 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -274,11 +274,6 @@ static bool escape_space(unsigned char c, char **dst, char *end) return false; } - if (out + 2 > end) { - *dst = out + 2; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -309,11 +304,6 @@ static bool escape_special(unsigned char c, char **dst, char *end) return false; } - if (out + 2 > end) { - *dst = out + 2; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -332,11 +322,6 @@ static bool escape_null(unsigned char c, char **dst, char *end) if (c) return false; - if (out + 2 > end) { - *dst = out + 2; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -352,11 +337,6 @@ static bool escape_octal(unsigned char c, char **dst, char *end) { char *out = *dst; - if (out + 4 > end) { - *dst = out + 4; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -378,11 +358,6 @@ static bool escape_hex(unsigned char c, char **dst, char *end) { char *out = *dst; - if (out + 4 > end) { - *dst = out + 4; - return true; - } - if (out < end) *out = '\\'; ++out; @@ -449,20 +424,17 @@ static bool escape_hex(unsigned char c, char **dst, char *end) * it if needs. * * Return: - * The amount of the characters processed to the destination buffer, or - * %-ENOMEM if the size of buffer is not enough to put an escaped character is - * returned. - * - * Even in the case of error @dst pointer will be updated to point to the byte - * after the last processed character. + * The total size of the escaped output that would be generated for + * the given input and flags. To check whether the output was + * truncated, compare the return value to osz. There is room left in + * dst for a '\0' terminator if and only if ret < osz. */ -int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, +int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, unsigned int flags, const char *esc) { - char *p = *dst; + char *p = dst; char *end = p + osz; bool is_dict = esc && *esc; - int ret; while (isz--) { unsigned char c = *src++; @@ -502,13 +474,6 @@ int string_escape_mem(const char *src, size_t isz, char **dst, size_t osz, escape_passthrough(c, &p, end); } - if (p > end) { - *dst = end; - return -ENOMEM; - } - - ret = p - *dst; - *dst = p; - return ret; + return p - dst; } EXPORT_SYMBOL(string_escape_mem); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index ab0d30e1e18f..8e376efd88a4 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -260,16 +260,28 @@ static __init const char *test_string_find_match(const struct test_string_2 *s2, return NULL; } +static __init void +test_string_escape_overflow(const char *in, int p, unsigned int flags, const char *esc, + int q_test, const char *name) +{ + int q_real; + + q_real = string_escape_mem(in, p, NULL, 0, flags, esc); + if (q_real != q_test) + pr_warn("Test '%s' failed: flags = %u, osz = 0, expected %d, got %d\n", + name, flags, q_test, q_real); +} + static __init void test_string_escape(const char *name, const struct test_string_2 *s2, unsigned int flags, const char *esc) { - int q_real = 512; - char *out_test = kmalloc(q_real, GFP_KERNEL); - char *out_real = kmalloc(q_real, GFP_KERNEL); + size_t out_size = 512; + char *out_test = kmalloc(out_size, GFP_KERNEL); + char *out_real = kmalloc(out_size, GFP_KERNEL); char *in = kmalloc(256, GFP_KERNEL); - char *buf = out_real; int p = 0, q_test = 0; + int q_real; if (!out_test || !out_real || !in) goto out; @@ -301,29 +313,19 @@ static __init void test_string_escape(const char *name, q_test += len; } - q_real = string_escape_mem(in, p, &buf, q_real, flags, esc); + q_real = string_escape_mem(in, p, out_real, out_size, flags, esc); test_string_check_buf(name, flags, in, p, out_real, q_real, out_test, q_test); + + test_string_escape_overflow(in, p, flags, esc, q_test, name); + out: kfree(in); kfree(out_real); kfree(out_test); } -static __init void test_string_escape_nomem(void) -{ - char *in = "\eb \\C\007\"\x90\r]"; - char out[64], *buf = out; - int rc = -ENOMEM, ret; - - ret = string_escape_str_any_np(in, &buf, strlen(in), NULL); - if (ret == rc) - return; - - pr_err("Test 'escape nomem' failed: got %d instead of %d\n", ret, rc); -} - static int __init test_string_helpers_init(void) { unsigned int i; @@ -342,8 +344,6 @@ static int __init test_string_helpers_init(void) for (i = 0; i < (ESCAPE_ANY_NP | ESCAPE_HEX) + 1; i++) test_string_escape("escape 1", escape1, i, TEST_STRING_2_DICT_1); - test_string_escape_nomem(); - return -EINVAL; } module_init(test_string_helpers_init); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 4da1e7aaf9d5..3a1e0843f9a2 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1235,8 +1235,12 @@ char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec, len = spec.field_width < 0 ? 1 : spec.field_width; - /* Ignore the error. We print as many characters as we can */ - string_escape_mem(addr, len, &buf, end - buf, flags, NULL); + /* + * string_escape_mem() writes as many characters as it can to + * the given buffer, and returns the total size of the output + * had the buffer been big enough. + */ + buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL); return buf; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5199bb1a017e..2928afffbb81 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1072,10 +1072,12 @@ void qword_add(char **bpp, int *lp, char *str) if (len < 0) return; - ret = string_escape_str(str, &bp, len, ESCAPE_OCTAL, "\\ \n\t"); - if (ret < 0 || ret == len) + ret = string_escape_str(str, bp, len, ESCAPE_OCTAL, "\\ \n\t"); + if (ret >= len) { + bp += len; len = -1; - else { + } else { + bp += ret; len -= ret; *bp++ = ' '; len--; -- cgit v1.3-6-gb490 From 89c1e79eb302349fcaf0697bc9116a4ff16bfeb0 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 15 Apr 2015 16:17:42 -0700 Subject: linux/bitmap.h: improve BITMAP_{LAST,FIRST}_WORD_MASK The macro BITMAP_LAST_WORD_MASK can be implemented without a conditional, which will generally lead to slightly better generated code (221 bytes saved for allmodconfig-GCOV_KERNEL, ~2k with GCOV_KERNEL). As a small bonus, this also ensures that the nbits parameter is expanded exactly once. In BITMAP_FIRST_WORD_MASK, if start is signed gcc is technically allowed to assume it is positive (or divisible by BITS_PER_LONG), and hence just do the simple mask. It doesn't seem to use this, and even on an architecture like x86 where the shift only depends on the lower 5 or 6 bits, and these bits are not affected by the signedness of the expression, gcc still generates code to compute the C99 mandated value of start % BITS_PER_LONG. So just use a mask explicitly, also for consistency with BITMAP_LAST_WORD_MASK. Signed-off-by: Rasmus Villemoes Cc: Tejun Heo Reviewed-by: George Spelvin Cc: Yury Norov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index dbfbf4990005..be4fa5ddf36c 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -172,12 +172,8 @@ extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int extern int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp, int nmaskbits); -#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG)) -#define BITMAP_LAST_WORD_MASK(nbits) \ -( \ - ((nbits) % BITS_PER_LONG) ? \ - (1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \ -) +#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) +#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) #define small_const_nbits(nbits) \ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) -- cgit v1.3-6-gb490