From 0bf5f9492389aa8df5c8e38fcb4488802d24504d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Jul 2019 16:26:27 -0700 Subject: mm: fix the MAP_UNINITIALIZED flag We can't expose UAPI symbols differently based on CONFIG_ symbols, as userspace won't have them available. Instead always define the flag, but only respect it based on the config option. Link: http://lkml.kernel.org/r/20190703122359.18200-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/mman-common.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index abd238d0f7a4..cb556b430e71 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -19,15 +19,13 @@ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ -#ifdef CONFIG_MMAP_ALLOW_UNINITIALIZED -# define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be uninitialized */ -#else -# define MAP_UNINITIALIZED 0x0 /* Don't support this flag */ -#endif /* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ +#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be + * uninitialized */ + /* * Flags for mlock */ -- cgit v1.2.3-59-g8ed1b From 89165b8b0ee97bd775ac4376b932fd030f7462bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Jul 2019 16:26:30 -0700 Subject: mm: provide a print_vma_addr stub for !CONFIG_MMU Link: http://lkml.kernel.org/r/20190703122359.18200-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0389c34ac529..74797ed20c2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2767,7 +2767,13 @@ extern int randomize_va_space; #endif const char * arch_vma_name(struct vm_area_struct *vma); +#ifdef CONFIG_MMU void print_vma_addr(char *prefix, unsigned long rip); +#else +static inline void print_vma_addr(char *prefix, unsigned long rip) +{ +} +#endif void *sparse_buffer_alloc(unsigned long size); struct page *sparse_mem_map_populate(unsigned long pnum, int nid, -- cgit v1.2.3-59-g8ed1b From 9b98fa22948551e20a15b0b9d22589e3724c361a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 16 Jul 2019 16:26:33 -0700 Subject: mm: stub out all of swapops.h for !CONFIG_MMU The whole header file deals with swap entries and PTEs, none of which can exist for nommu builds. The current nommu ports have lots of stubs to allow the inline functions in swapops.h to compile, but as none of this functionality is actually used there is no point in even providing it. This way we don't have to provide the stubs for the upcoming RISC-V nommu port, and can eventually remove it from the existing ports. Link: http://lkml.kernel.org/r/20190703122359.18200-4-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Vladimir Murzin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 15bdb6fe71e5..877fd239b6ff 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -6,6 +6,8 @@ #include #include +#ifdef CONFIG_MMU + /* * swapcache pages are stored in the swapper_space radix tree. We want to * get good packing density in that tree, so the index should be dense in @@ -50,13 +52,11 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } -#ifdef CONFIG_MMU /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { return !pte_none(pte) && !pte_present(pte); } -#endif /* * Convert the arch-dependent pte representation of a swp_entry_t into an @@ -360,4 +360,5 @@ static inline int non_swap_entry(swp_entry_t entry) } #endif +#endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ -- cgit v1.2.3-59-g8ed1b From ce251e0e3c0597ea8cab5787df579bd1f9c1aca1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jul 2019 16:26:42 -0700 Subject: include/linux/kernel.h: add typeof_member() macro Add typeof_member() macro so that types can be extracted without introducing dummy variables. Link: http://lkml.kernel.org/r/20190529190720.GA5703@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 0c9bc231107f..4fa360a13c1e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -88,6 +88,8 @@ */ #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) +#define typeof_member(T, m) typeof(((T*)0)->m) + #define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP #define DIV_ROUND_DOWN_ULL(ll, d) \ -- cgit v1.2.3-59-g8ed1b From 95b980d62d52c4c1768ee719e8db3efe27ef52b2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 16 Jul 2019 16:26:57 -0700 Subject: linux/bits.h: make BIT(), GENMASK(), and friends available in assembly BIT(), GENMASK(), etc. are useful to define register bits of hardware. However, low-level code is often written in assembly, where they are not available due to the hard-coded 1UL, 0UL. In fact, in-kernel headers such as arch/arm64/include/asm/sysreg.h use _BITUL() instead of BIT() so that the register bit macros are available in assembly. Using macros in include/uapi/linux/const.h have two reasons: [1] For use in uapi headers We should use underscore-prefixed variants for user-space. [2] For use in assembly code Since _BITUL() uses UL(1) instead of 1UL, it can be used as an alternative of BIT(). For [2], it is pretty easy to change BIT() etc. for use in assembly. This allows to replace _BUTUL() in kernel-space headers with BIT(). Link: http://lkml.kernel.org/r/20190609153941.17249-1-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bits.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bits.h b/include/linux/bits.h index 2b7b532c1d51..669d69441a62 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -1,13 +1,15 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BITS_H #define __LINUX_BITS_H + +#include #include -#define BIT(nr) (1UL << (nr)) -#define BIT_ULL(nr) (1ULL << (nr)) -#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT(nr) (UL(1) << (nr)) +#define BIT_ULL(nr) (ULL(1) << (nr)) +#define BIT_MASK(nr) (UL(1) << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) -#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) +#define BIT_ULL_MASK(nr) (ULL(1) << ((nr) % BITS_PER_LONG_LONG)) #define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) #define BITS_PER_BYTE 8 @@ -17,10 +19,11 @@ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ #define GENMASK(h, l) \ - (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) + (((~UL(0)) - (UL(1) << (l)) + 1) & \ + (~UL(0) >> (BITS_PER_LONG - 1 - (h)))) #define GENMASK_ULL(h, l) \ - (((~0ULL) - (1ULL << (l)) + 1) & \ - (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + (((~ULL(0)) - (ULL(1) << (l)) + 1) & \ + (~ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #endif /* __LINUX_BITS_H */ -- cgit v1.2.3-59-g8ed1b From c296d4dc13aefe96792538a949996b8938f28f13 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Tue, 16 Jul 2019 16:27:06 -0700 Subject: asm-generic: fix a compilation warning Fix this compilation warning on x86 by making flush_cache_vmap() inline. lib/ioremap.c: In function 'ioremap_page_range': lib/ioremap.c:214:16: warning: variable 'start' set but not used [-Wunused-but-set-variable] unsigned long start; ^~~~~ While at it, convert all other similar functions to inline for consistency. Link: http://lkml.kernel.org/r/1562594592-15228-1-git-send-email-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/cacheflush.h | 74 ++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 0dd47a6db2cf..a950a22c4890 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -5,24 +5,70 @@ /* Keep includes the same across arches. */ #include +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 + /* * The cache doesn't need to be flushed when TLB entries change when * the cache is mapped to physical memory, not virtual memory */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma,pg) do { } while (0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) +static inline void flush_cache_all(void) +{ +} + +static inline void flush_cache_mm(struct mm_struct *mm) +{ +} + +static inline void flush_cache_dup_mm(struct mm_struct *mm) +{ +} + +static inline void flush_cache_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ +} + +static inline void flush_cache_page(struct vm_area_struct *vma, + unsigned long vmaddr, + unsigned long pfn) +{ +} + +static inline void flush_dcache_page(struct page *page) +{ +} + +static inline void flush_dcache_mmap_lock(struct address_space *mapping) +{ +} + +static inline void flush_dcache_mmap_unlock(struct address_space *mapping) +{ +} + +static inline void flush_icache_range(unsigned long start, unsigned long end) +{ +} + +static inline void flush_icache_page(struct vm_area_struct *vma, + struct page *page) +{ +} + +static inline void flush_icache_user_range(struct vm_area_struct *vma, + struct page *page, + unsigned long addr, int len) +{ +} + +static inline void flush_cache_vmap(unsigned long start, unsigned long end) +{ +} + +static inline void flush_cache_vunmap(unsigned long start, unsigned long end) +{ +} #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ -- cgit v1.2.3-59-g8ed1b From 4c6080cd6f8baad9f7faa3deac9a90e59726b119 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jul 2019 16:27:12 -0700 Subject: lib/list: tweak LIST_POISON2 for better code generation on x86_64 list_del() poisoning can generate 2 64-bit immediate loads but it also can generate one 64-bit immediate load and an addition: 48 b8 00 01 00 00 00 00 ad de movabs rax,0xdead000000000100 48 89 47 58 mov QWORD PTR [rdi+0x58],rax 48 05 00 01 00 00 <=====> add rax,0x100 48 89 47 60 mov QWORD PTR [rdi+0x60],rax However on x86_64 not all constants are equal: those within [-128, 127] range can be added with shorter "add r64, imm32" instruction: 48 b8 00 01 00 00 00 00 ad de movabs rax,0xdead000000000100 48 89 47 58 mov QWORD PTR [rdi+0x58],rax 48 83 c0 22 <======> add rax,0x22 48 89 47 60 mov QWORD PTR [rdi+0x60],rax Patch saves 2 bytes per some LIST_POISON2 usage. (Slightly disappointing) space savings on F29 x86_64 config: add/remove: 0/0 grow/shrink: 0/2164 up/down: 0/-5184 (-5184) Function old new delta zstd_get_workspace 548 546 -2 ... mlx4_delete_all_resources_for_slave 4826 4804 -22 Total: Before=83304131, After=83298947, chg -0.01% New constants are: 0xdead000000000100 0xdead000000000122 Note: LIST_POISON1 can't be changed to ...11 because something in page allocator requires low bit unset. Link: http://lkml.kernel.org/r/20190513191502.GA8492@avx2 Signed-off-by: Alexey Dobriyan Cc: Vasiliy Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/poison.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/poison.h b/include/linux/poison.h index d6d980a681c7..df34330b4e34 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -21,7 +21,7 @@ * non-initialized list entries. */ #define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) -#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x122 + POISON_POINTER_DELTA) /********** include/linux/timer.h **********/ /* -- cgit v1.2.3-59-g8ed1b From 0f472d04f59ff89d15b2a1c4eafde7317ddd67a2 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 16 Jul 2019 16:27:33 -0700 Subject: mm/ioremap: probe platform for p4d huge map support Finish up what commit c2febafc6773 ("mm: convert generic code to 5-level paging") started while levelling up P4D huge mapping support at par with PUD and PMD. A new arch call back arch_ioremap_p4d_supported() is added which just maintains status quo (P4D huge map not supported) on x86, arm64 and powerpc. When HAVE_ARCH_HUGE_VMAP is enabled its just a simple check from the arch about the support, hence runtime effects are minimal. Link: http://lkml.kernel.org/r/1561699231-20991-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Thomas Gleixner Acked-by: Michael Ellerman (powerpc) Cc: Catalin Marinas Cc: Will Deacon Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 5 +++++ arch/powerpc/mm/book3s64/radix_pgtable.c | 5 +++++ arch/x86/mm/ioremap.c | 5 +++++ include/linux/io.h | 1 + lib/ioremap.c | 2 ++ 5 files changed, 18 insertions(+) (limited to 'include') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1b49c08dfa2b..e661469cabdd 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -942,6 +942,11 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys) return dt_virt; } +int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} + int __init arch_ioremap_pud_supported(void) { /* diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 65c2ba1e1783..b4ca9e95e678 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1237,3 +1237,8 @@ int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size, return 0; } } + +int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index e500f1df1140..63e99f15d7cf 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -459,6 +459,11 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +int __init arch_ioremap_p4d_supported(void) +{ + return 0; +} + int __init arch_ioremap_pud_supported(void) { #ifdef CONFIG_X86_64 diff --git a/include/linux/io.h b/include/linux/io.h index 9876e5801a9d..accac822336a 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -33,6 +33,7 @@ static inline int ioremap_page_range(unsigned long addr, unsigned long end, #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP void __init ioremap_huge_init(void); +int arch_ioremap_p4d_supported(void); int arch_ioremap_pud_supported(void); int arch_ioremap_pmd_supported(void); #else diff --git a/lib/ioremap.c b/lib/ioremap.c index a95161d9c883..0a2ffadc6d71 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -30,6 +30,8 @@ early_param("nohugeiomap", set_nohugeiomap); void __init ioremap_huge_init(void) { if (!ioremap_huge_disabled) { + if (arch_ioremap_p4d_supported()) + ioremap_p4d_capable = 1; if (arch_ioremap_pud_supported()) ioremap_pud_capable = 1; if (arch_ioremap_pmd_supported()) -- cgit v1.2.3-59-g8ed1b From 9f973cb38088e0cf42e0bae97ff140813e623f13 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 16 Jul 2019 16:27:45 -0700 Subject: lib/rbtree: avoid generating code twice for the cached versions As was already noted in rbtree.h, the logic to cache rb_first (or rb_last) can easily be implemented externally to the core rbtree api. Change the implementation to do just that. Previously the update of rb_leftmost was wired deeper into the implmentation, but there were some disadvantages to that - mostly, lib/rbtree.c had separate instantiations for rb_insert_color() vs rb_insert_color_cached(), as well as rb_erase() vs rb_erase_cached(), which were doing exactly the same thing save for the rb_leftmost update at the start of either function. text data bss dec hex filename 5405 120 0 5525 1595 lib/rbtree.o-vanilla 3827 96 0 3923 f53 lib/rbtree.o-patch [dave@stgolabs.net: changelog addition] Link: http://lkml.kernel.org/r/20190628171416.by5gdizl3rcxk5h5@linux-r8p5 [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20190628045008.39926-1-walken@google.com Signed-off-by: Michel Lespinasse Acked-by: Davidlohr Bueso Acked-by: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 70 ++++++++++++++++++++++++++-------------- include/linux/rbtree_augmented.h | 27 ++++++---------- lib/rbtree.c | 40 ++--------------------- 3 files changed, 59 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e6337fce08f2..1fd61a9af45c 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -32,25 +32,9 @@ struct rb_root { struct rb_node *rb_node; }; -/* - * Leftmost-cached rbtrees. - * - * We do not cache the rightmost node based on footprint - * size vs number of potential users that could benefit - * from O(1) rb_last(). Just not worth it, users that want - * this feature can always implement the logic explicitly. - * Furthermore, users that want to cache both pointers may - * find it a bit asymmetric, but that's ok. - */ -struct rb_root_cached { - struct rb_root rb_root; - struct rb_node *rb_leftmost; -}; - #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } -#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) @@ -72,12 +56,6 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); -extern void rb_insert_color_cached(struct rb_node *, - struct rb_root_cached *, bool); -extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); -/* Same as rb_first(), but O(1) */ -#define rb_first_cached(root) (root)->rb_leftmost - /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); @@ -87,8 +65,6 @@ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); -extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, - struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) @@ -136,4 +112,50 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent typeof(*pos), field); 1; }); \ pos = n) +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. + */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; + +#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } + +/* Same as rb_first(), but O(1) */ +#define rb_first_cached(root) (root)->rb_leftmost + +static inline void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, + bool leftmost) +{ + if (leftmost) + root->rb_leftmost = node; + rb_insert_color(node, &root->rb_root); +} + +static inline void rb_erase_cached(struct rb_node *node, + struct rb_root_cached *root) +{ + if (root->rb_leftmost == node) + root->rb_leftmost = rb_next(node); + rb_erase(node, &root->rb_root); +} + +static inline void rb_replace_node_cached(struct rb_node *victim, + struct rb_node *new, + struct rb_root_cached *root) +{ + if (root->rb_leftmost == victim) + root->rb_leftmost = new; + rb_replace_node(victim, new, &root->rb_root); +} + #endif /* _LINUX_RBTREE_H */ diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 0f902ccb48b0..179faab29f52 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -30,10 +30,9 @@ struct rb_augment_callbacks { void (*rotate)(struct rb_node *old, struct rb_node *new); }; -extern void __rb_insert_augmented(struct rb_node *node, - struct rb_root *root, - bool newleft, struct rb_node **leftmost, +extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); + /* * Fixup the rbtree and update the augmented information when rebalancing. * @@ -48,7 +47,7 @@ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, root, false, NULL, augment->rotate); + __rb_insert_augmented(node, root, augment->rotate); } static inline void @@ -56,8 +55,9 @@ rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, &root->rb_root, - newleft, &root->rb_leftmost, augment->rotate); + if (newleft) + root->rb_leftmost = node; + rb_insert_augmented(node, &root->rb_root, augment); } #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ @@ -150,7 +150,6 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, - struct rb_node **leftmost, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; @@ -158,9 +157,6 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, struct rb_node *parent, *rebalance; unsigned long pc; - if (leftmost && node == *leftmost) - *leftmost = rb_next(node); - if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) @@ -260,8 +256,7 @@ static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, root, - NULL, augment); + struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } @@ -270,11 +265,9 @@ static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, - &root->rb_leftmost, - augment); - if (rebalance) - __rb_erase_color(rebalance, &root->rb_root, augment->rotate); + if (root->rb_leftmost == node) + root->rb_leftmost = rb_next(node); + rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */ diff --git a/lib/rbtree.c b/lib/rbtree.c index 1ef6e25d031c..abc86c6a3177 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -83,14 +83,10 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, - bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; - if (newleft) - *leftmost = node; - while (true) { /* * Loop invariant: node is red. @@ -437,38 +433,19 @@ static const struct rb_augment_callbacks dummy_callbacks = { void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, false, NULL, dummy_rotate); + __rb_insert(node, root, dummy_rotate); } EXPORT_SYMBOL(rb_insert_color); void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, - NULL, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } EXPORT_SYMBOL(rb_erase); -void rb_insert_color_cached(struct rb_node *node, - struct rb_root_cached *root, bool leftmost) -{ - __rb_insert(node, &root->rb_root, leftmost, - &root->rb_leftmost, dummy_rotate); -} -EXPORT_SYMBOL(rb_insert_color_cached); - -void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) -{ - struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, &root->rb_root, - &root->rb_leftmost, &dummy_callbacks); - if (rebalance) - ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); -} -EXPORT_SYMBOL(rb_erase_cached); - /* * Augmented rbtree manipulation functions. * @@ -477,10 +454,9 @@ EXPORT_SYMBOL(rb_erase_cached); */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, - bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, newleft, leftmost, augment_rotate); + __rb_insert(node, root, augment_rotate); } EXPORT_SYMBOL(__rb_insert_augmented); @@ -591,16 +567,6 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, } EXPORT_SYMBOL(rb_replace_node); -void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, - struct rb_root_cached *root) -{ - rb_replace_node(victim, new, &root->rb_root); - - if (root->rb_leftmost == victim) - root->rb_leftmost = new; -} -EXPORT_SYMBOL(rb_replace_node_cached); - void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root) { -- cgit v1.2.3-59-g8ed1b From b98cca444d287a63dd96df04af7fb9793567599e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 16 Jul 2019 16:28:00 -0700 Subject: mm, kprobes: generalize and rename notify_page_fault() as kprobe_page_fault() Architectures which support kprobes have very similar boilerplate around calling kprobe_fault_handler(). Use a helper function in kprobes.h to unify them, based on the x86 code. This changes the behaviour for other architectures when preemption is enabled. Previously, they would have disabled preemption while calling the kprobe handler. However, preemption would be disabled if this fault was due to a kprobe, so we know the fault was not due to a kprobe handler and can simply return failure. This behaviour was introduced in commit a980c0ef9f6d ("x86/kprobes: Refactor kprobes_fault() like kprobe_exceptions_notify()") [anshuman.khandual@arm.com: export kprobe_fault_handler()] Link: http://lkml.kernel.org/r/1561133358-8876-1-git-send-email-anshuman.khandual@arm.com Link: http://lkml.kernel.org/r/1560420444-25737-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Dave Hansen Cc: Michal Hocko Cc: Matthew Wilcox Cc: Mark Rutland Cc: Christophe Leroy Cc: Stephen Rothwell Cc: Andrey Konovalov Cc: Michael Ellerman Cc: Paul Mackerras Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Andy Lutomirski Cc: Vineet Gupta Cc: James Hogan Cc: Paul Burton Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/fault.c | 24 +----------------------- arch/arm64/mm/fault.c | 24 +----------------------- arch/ia64/mm/fault.c | 24 +----------------------- arch/mips/include/asm/kprobes.h | 1 + arch/mips/kernel/kprobes.c | 2 +- arch/powerpc/mm/fault.c | 23 ++--------------------- arch/s390/mm/fault.c | 16 +--------------- arch/sh/mm/fault.c | 18 ++---------------- arch/sparc/mm/fault_64.c | 16 +--------------- arch/x86/mm/fault.c | 21 ++------------------- include/linux/kprobes.h | 19 +++++++++++++++++++ 11 files changed, 32 insertions(+), 156 deletions(-) (limited to 'include') diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 0e417233dad7..890eeaac3cbb 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -27,28 +27,6 @@ #ifdef CONFIG_MMU -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr) -{ - int ret = 0; - - if (!user_mode(regs)) { - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, fsr)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs, unsigned int fsr) -{ - return 0; -} -#endif - /* * This is useful to dump out the page tables associated with * 'addr' in mm 'mm'. @@ -265,7 +243,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) vm_fault_t fault; unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (notify_page_fault(regs, fsr)) + if (kprobe_page_fault(regs, fsr)) return 0; tsk = current; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c8c61b1eb479..9568c116ac7f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -59,28 +59,6 @@ static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr) return debug_fault_info + DBG_ESR_EVT(esr); } -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, esr)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr) -{ - return 0; -} -#endif - static void data_abort_decode(unsigned int esr) { pr_alert("Data abort info:\n"); @@ -434,7 +412,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, unsigned long vm_flags = VM_READ | VM_WRITE; unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; - if (notify_page_fault(regs, esr)) + if (kprobe_page_fault(regs, esr)) return 0; /* diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 3c3a283d3172..c2f299fe9e04 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -21,28 +21,6 @@ extern int die(char *, struct pt_regs *, long); -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, int trap) -{ - int ret = 0; - - if (!user_mode(regs)) { - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, trap)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs, int trap) -{ - return 0; -} -#endif - /* * Return TRUE if ADDRESS points at a page in the kernel's mapped segment * (inside region 5, on ia64) and that page is present. @@ -116,7 +94,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re /* * This is to handle the kprobes on user space access instructions */ - if (notify_page_fault(regs, TRAP_BRKPT)) + if (kprobe_page_fault(regs, TRAP_BRKPT)) return; if (user_mode(regs)) diff --git a/arch/mips/include/asm/kprobes.h b/arch/mips/include/asm/kprobes.h index 3cf8e4d5fa28..68b1e5d458cf 100644 --- a/arch/mips/include/asm/kprobes.h +++ b/arch/mips/include/asm/kprobes.h @@ -41,6 +41,7 @@ do { \ #define kretprobe_blacklist_size 0 void arch_remove_kprobe(struct kprobe *p); +int kprobe_fault_handler(struct pt_regs *regs, int trapnr); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c index 81ba1d3c367c..6cfae2411c04 100644 --- a/arch/mips/kernel/kprobes.c +++ b/arch/mips/kernel/kprobes.c @@ -398,7 +398,7 @@ out: return 1; } -static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr) +int kprobe_fault_handler(struct pt_regs *regs, int trapnr) { struct kprobe *cur = kprobe_running(); struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d989592b6fc8..8432c281de92 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -42,26 +42,6 @@ #include #include -static inline bool notify_page_fault(struct pt_regs *regs) -{ - bool ret = false; - -#ifdef CONFIG_KPROBES - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 11)) - ret = true; - preempt_enable(); - } -#endif /* CONFIG_KPROBES */ - - if (unlikely(debugger_fault_handler(regs))) - ret = true; - - return ret; -} - /* * Check whether the instruction inst is a store using * an update addressing form which will update r1. @@ -461,8 +441,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address, int is_write = page_fault_is_write(error_code); vm_fault_t fault, major = 0; bool must_retry = false; + bool kprobe_fault = kprobe_page_fault(regs, 11); - if (notify_page_fault(regs)) + if (unlikely(debugger_fault_handler(regs) || kprobe_fault)) return 0; if (unlikely(page_fault_is_bad(error_code))) { diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 0ba174f779da..63507662828f 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -67,20 +67,6 @@ static int __init fault_init(void) } early_initcall(fault_init); -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - return ret; -} - /* * Find out which address space caused the exception. */ @@ -412,7 +398,7 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) */ clear_pt_regs_flag(regs, PIF_PER_TRAP); - if (notify_page_fault(regs)) + if (kprobe_page_fault(regs, 14)) return 0; mm = tsk->mm; diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 3093bc372138..5f51456f4fc7 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -24,20 +24,6 @@ #include #include -static inline int notify_page_fault(struct pt_regs *regs, int trap) -{ - int ret = 0; - - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, trap)) - ret = 1; - preempt_enable(); - } - - return ret; -} - static void force_sig_info_fault(int si_signo, int si_code, unsigned long address) { @@ -412,14 +398,14 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if (unlikely(fault_in_kernel_space(address))) { if (vmalloc_fault(address) >= 0) return; - if (notify_page_fault(regs, vec)) + if (kprobe_page_fault(regs, vec)) return; bad_area_nosemaphore(regs, error_code, address); return; } - if (unlikely(notify_page_fault(regs, vec))) + if (unlikely(kprobe_page_fault(regs, vec))) return; /* Only enable interrupts if they were on before the fault */ diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 83fda4d9c3b2..2371fb6b97e4 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -38,20 +38,6 @@ int show_unhandled_signals = 1; -static inline __kprobes int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 0)) - ret = 1; - preempt_enable(); - } - return ret; -} - static void __kprobes unhandled_fault(unsigned long address, struct task_struct *tsk, struct pt_regs *regs) @@ -285,7 +271,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs) fault_code = get_thread_fault_code(); - if (notify_page_fault(regs)) + if (kprobe_page_fault(regs, 0)) goto exit_exception; si_code = SEGV_MAPERR; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 794f364cb882..d1634c59ed56 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -46,23 +46,6 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr) return 0; } -static nokprobe_inline int kprobes_fault(struct pt_regs *regs) -{ - if (!kprobes_built_in()) - return 0; - if (user_mode(regs)) - return 0; - /* - * To be potentially processing a kprobe fault and to be allowed to call - * kprobe_running(), we have to be non-preemptible. - */ - if (preemptible()) - return 0; - if (!kprobe_running()) - return 0; - return kprobe_fault_handler(regs, X86_TRAP_PF); -} - /* * Prefetch quirks: * @@ -1282,7 +1265,7 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, return; /* kprobes don't want to hook the spurious faults: */ - if (kprobes_fault(regs)) + if (kprobe_page_fault(regs, X86_TRAP_PF)) return; /* @@ -1313,7 +1296,7 @@ void do_user_addr_fault(struct pt_regs *regs, mm = tsk->mm; /* kprobes don't want to hook the spurious faults: */ - if (unlikely(kprobes_fault(regs))) + if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF))) return; /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 443d9800ca3f..04bdaf01112c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -458,4 +458,23 @@ static inline bool is_kprobe_optinsn_slot(unsigned long addr) } #endif +/* Returns true if kprobes handled the fault */ +static nokprobe_inline bool kprobe_page_fault(struct pt_regs *regs, + unsigned int trap) +{ + if (!kprobes_built_in()) + return false; + if (user_mode(regs)) + return false; + /* + * To be potentially processing a kprobe fault and to be allowed + * to call kprobe_running(), we have to be non-preemptible. + */ + if (preemptible()) + return false; + if (!kprobe_running()) + return false; + return kprobe_fault_handler(regs, trap); +} + #endif /* _LINUX_KPROBES_H */ -- cgit v1.2.3-59-g8ed1b From 694a58e29ef27c4c26f103a9decfd053f94dd34c Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Tue, 16 Jul 2019 16:28:07 -0700 Subject: uapi linux/coda.h: use __kernel_pid_t for userspace Part of a patch by Mikko Rapeli, as Arnd Bergman commented on the original patch. pid_t might differ between libc and the kernel, so the kernel interface has to use types that the kernel defines. Link: http://lkml.kernel.org/r/f374a71f4d351bc8c8b3ac18ad7765c88d806d10.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Mikko Rapeli Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/coda.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h index 695fade33c64..ed8cb263e482 100644 --- a/include/uapi/linux/coda.h +++ b/include/uapi/linux/coda.h @@ -295,8 +295,8 @@ struct coda_statfs { struct coda_in_hdr { u_int32_t opcode; u_int32_t unique; /* Keep multiple outstanding msgs distinct */ - pid_t pid; - pid_t pgid; + __kernel_pid_t pid; + __kernel_pid_t pgid; vuid_t uid; }; -- cgit v1.2.3-59-g8ed1b From f90fb3c7e2c13ae829db2274b88b845a75038b8a Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Tue, 16 Jul 2019 16:28:10 -0700 Subject: uapi linux/coda_psdev.h: move upc_req definition from uapi to kernel side headers Only users of upc_req in kernel side fs/coda/psdev.c and fs/coda/upcall.c already include linux/coda_psdev.h. Suggested by Jan Harkes in https://lore.kernel.org/lkml/20150531111913.GA23377@cs.cmu.edu/ Fixes these include/uapi/linux/coda_psdev.h compilation errors in userspace: linux/coda_psdev.h:12:19: error: field `uc_chain' has incomplete type struct list_head uc_chain; ^ linux/coda_psdev.h:13:2: error: unknown type name `caddr_t' caddr_t uc_data; ^ linux/coda_psdev.h:14:2: error: unknown type name `u_short' u_short uc_flags; ^ linux/coda_psdev.h:15:2: error: unknown type name `u_short' u_short uc_inSize; /* Size is at most 5000 bytes */ ^ linux/coda_psdev.h:16:2: error: unknown type name `u_short' u_short uc_outSize; ^ linux/coda_psdev.h:17:2: error: unknown type name `u_short' u_short uc_opcode; /* copied from data to save lookup */ ^ linux/coda_psdev.h:19:2: error: unknown type name `wait_queue_head_t' wait_queue_head_t uc_sleep; /* process' wait queue */ ^ Link: http://lkml.kernel.org/r/9f99f5ce6a0563d5266e6cf7aa9585aac2cae971.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Mikko Rapeli Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/coda_psdev.h | 11 +++++++++++ include/uapi/linux/coda_psdev.h | 13 ------------- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 15170954aa2b..57d2b2faf6a3 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -19,6 +19,17 @@ struct venus_comm { struct mutex vc_mutex; }; +/* messages between coda filesystem in kernel and Venus */ +struct upc_req { + struct list_head uc_chain; + caddr_t uc_data; + u_short uc_flags; + u_short uc_inSize; /* Size is at most 5000 bytes */ + u_short uc_outSize; + u_short uc_opcode; /* copied from data to save lookup */ + int uc_unique; + wait_queue_head_t uc_sleep; /* process' wait queue */ +}; static inline struct venus_comm *coda_vcp(struct super_block *sb) { diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h index aa6623efd2dd..d50d51a57fe4 100644 --- a/include/uapi/linux/coda_psdev.h +++ b/include/uapi/linux/coda_psdev.h @@ -7,19 +7,6 @@ #define CODA_PSDEV_MAJOR 67 #define MAX_CODADEVS 5 /* how many do we allow */ - -/* messages between coda filesystem in kernel and Venus */ -struct upc_req { - struct list_head uc_chain; - caddr_t uc_data; - u_short uc_flags; - u_short uc_inSize; /* Size is at most 5000 bytes */ - u_short uc_outSize; - u_short uc_opcode; /* copied from data to save lookup */ - int uc_unique; - wait_queue_head_t uc_sleep; /* process' wait queue */ -}; - #define CODA_REQ_ASYNC 0x1 #define CODA_REQ_READ 0x2 #define CODA_REQ_WRITE 0x4 -- cgit v1.2.3-59-g8ed1b From 6e51f8aa76b67d0a6eb168fd41a81e8478ae07a9 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Tue, 16 Jul 2019 16:28:16 -0700 Subject: coda: potential buffer overflow in coda_psdev_write() Add checks to make sure the downcall message we got from the Coda cache manager is large enough to contain the data it is supposed to have. i.e. when we get a CODA_ZAPDIR we can access &out->coda_zapdir.CodaFid. Link: http://lkml.kernel.org/r/894fb6b250add09e4e3935f14649f21284a5cb18.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Jan Harkes Reported-by: Dan Carpenter Cc: Arnd Bergmann Cc: Colin Ian King Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/psdev.c | 8 ++++++-- fs/coda/upcall.c | 34 +++++++++++++++++++++++++++++++++- include/linux/coda_psdev.h | 3 ++- 3 files changed, 41 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 241f7e04ad04..b4da2812499e 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -100,8 +100,12 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, ssize_t retval = 0, count = 0; int error; + /* make sure there is enough to copy out the (opcode, unique) values */ + if (nbytes < (2 * sizeof(u_int32_t))) + return -EINVAL; + /* Peek at the opcode, uniquefier */ - if (copy_from_user(&hdr, buf, 2 * sizeof(u_long))) + if (copy_from_user(&hdr, buf, 2 * sizeof(u_int32_t))) return -EFAULT; if (DOWNCALL(hdr.opcode)) { @@ -127,7 +131,7 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, } /* what downcall errors does Venus handle ? */ - error = coda_downcall(vcp, hdr.opcode, dcbuf); + error = coda_downcall(vcp, hdr.opcode, dcbuf, nbytes); CODA_FREE(dcbuf, nbytes); if (error) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 1175a1722411..cf1e662681a5 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -804,12 +804,44 @@ exit: * * CODA_REPLACE -- replace one CodaFid with another throughout the name cache */ -int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out, + size_t nbytes) { struct inode *inode = NULL; struct CodaFid *fid = NULL, *newfid; struct super_block *sb; + /* + * Make sure we have received enough data from the cache + * manager to populate the necessary fields in the buffer + */ + switch (opcode) { + case CODA_PURGEUSER: + if (nbytes < sizeof(struct coda_purgeuser_out)) + return -EINVAL; + break; + + case CODA_ZAPDIR: + if (nbytes < sizeof(struct coda_zapdir_out)) + return -EINVAL; + break; + + case CODA_ZAPFILE: + if (nbytes < sizeof(struct coda_zapfile_out)) + return -EINVAL; + break; + + case CODA_PURGEFID: + if (nbytes < sizeof(struct coda_purgefid_out)) + return -EINVAL; + break; + + case CODA_REPLACE: + if (nbytes < sizeof(struct coda_replace_out)) + return -EINVAL; + break; + } + /* Handle invalidation requests. */ mutex_lock(&vcp->vc_mutex); sb = vcp->vc_sb; diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 57d2b2faf6a3..d1672fd5e638 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -71,7 +71,8 @@ int venus_symlink(struct super_block *sb, struct CodaFid *fid, int venus_access(struct super_block *sb, struct CodaFid *fid, int mask); int venus_pioctl(struct super_block *sb, struct CodaFid *fid, unsigned int cmd, struct PioctlData *data); -int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out); +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out, + size_t nbytes); int venus_fsync(struct super_block *sb, struct CodaFid *fid); int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); -- cgit v1.2.3-59-g8ed1b From b2a57e334086602be56b74958d9f29b955cd157f Mon Sep 17 00:00:00 2001 From: Sam Protsenko Date: Tue, 16 Jul 2019 16:28:20 -0700 Subject: coda: fix build using bare-metal toolchain The kernel is self-contained project and can be built with bare-metal toolchain. But bare-metal toolchain doesn't define __linux__. Because of this u_quad_t type is not defined when using bare-metal toolchain and codafs build fails. This patch fixes it by defining u_quad_t type unconditionally. Link: http://lkml.kernel.org/r/3cbb40b0a57b6f9923a9d67b53473c0b691a3eaa.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Sam Protsenko Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/coda.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/coda.h b/include/linux/coda.h index d30209b9cef8..0ca0c83fdb1c 100644 --- a/include/linux/coda.h +++ b/include/linux/coda.h @@ -58,8 +58,7 @@ Mellon the rights to redistribute these changes without encumbrance. #ifndef _CODA_HEADER_ #define _CODA_HEADER_ -#if defined(__linux__) typedef unsigned long long u_quad_t; -#endif + #include #endif -- cgit v1.2.3-59-g8ed1b From 2fe7491d219428a32f09948e88bfaf8e71b9a66b Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Tue, 16 Jul 2019 16:28:26 -0700 Subject: uapi linux/coda_psdev.h: move CODA_REQ_ from uapi to kernel side headers These constants only used internally and not exposed to userspace. Link: http://lkml.kernel.org/r/baeafc30dad70d8b422ee679420099c2d8aa7da0.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/coda_psdev.h | 5 +++++ include/uapi/linux/coda_psdev.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index d1672fd5e638..9487f792770c 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -31,6 +31,11 @@ struct upc_req { wait_queue_head_t uc_sleep; /* process' wait queue */ }; +#define CODA_REQ_ASYNC 0x1 +#define CODA_REQ_READ 0x2 +#define CODA_REQ_WRITE 0x4 +#define CODA_REQ_ABORT 0x8 + static inline struct venus_comm *coda_vcp(struct super_block *sb) { return (struct venus_comm *)((sb)->s_fs_info); diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h index d50d51a57fe4..3dacb7fad66a 100644 --- a/include/uapi/linux/coda_psdev.h +++ b/include/uapi/linux/coda_psdev.h @@ -7,9 +7,4 @@ #define CODA_PSDEV_MAJOR 67 #define MAX_CODADEVS 5 /* how many do we allow */ -#define CODA_REQ_ASYNC 0x1 -#define CODA_REQ_READ 0x2 -#define CODA_REQ_WRITE 0x4 -#define CODA_REQ_ABORT 0x8 - #endif /* _UAPI__CODA_PSDEV_H */ -- cgit v1.2.3-59-g8ed1b From 6ced9aa7b56baeb241a715df4539e60d5e3118e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 Jul 2019 16:28:32 -0700 Subject: coda: stop using 'struct timespec' in user API We exchange file timestamps with user space using psdev device read/write operations with a fixed but architecture specific binary layout. On 32-bit systems, this uses a 'timespec' structure that is defined by the C library to contain two 32-bit values for seconds and nanoseconds. As we get ready for the year 2038 overflow of the 32-bit signed seconds, the kernel now uses 64-bit timestamps internally, and user space will do the same change by changing the 'timespec' definition in the future. Unfortunately, this breaks the layout of the coda_vattr structure, so we need to redefine that in terms of something that does not change. I'm introducing a new 'struct vtimespec' structure here that keeps the existing layout, and the same change has to be done in the coda user space copy of linux/coda.h before anyone can use that on a 32-bit architecture with 64-bit time_t. An open question is what should happen to actual times past y2038, as they are now truncated to the last valid date when sent to user space, and interpreted as pre-1970 times when a timestamp with the MSB set is read back into the kernel. Alternatively, we could change the new timespec64_to_coda()/coda_to_timespec64() functions to use a different interpretation and extend the available range further to the future by disallowing past timestamps. This would require more changes in the user space side though. Link: http://lkml.kernel.org/r/562b7324149461743e4fbe2fedbf7c242f7e274a.1558117389.git.jaharkes@cs.cmu.edu Link: https://patchwork.kernel.org/patch/10474735/ Signed-off-by: Arnd Bergmann Signed-off-by: Jan Harkes Acked-by: Jan Harkes Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/coda.txt | 11 ++++++--- fs/coda/coda_linux.c | 50 +++++++++++++++++++++++++++++--------- include/uapi/linux/coda.h | 20 ++++++++++++--- 3 files changed, 62 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt index 61311356025d..ea5969068895 100644 --- a/Documentation/filesystems/coda.txt +++ b/Documentation/filesystems/coda.txt @@ -481,7 +481,10 @@ kernel support. - + struct vtimespec { + long tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ + }; struct coda_vattr { enum coda_vtype va_type; /* vnode type (for create) */ @@ -493,9 +496,9 @@ kernel support. long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ + struct vtimespec va_atime; /* time of last access */ + struct vtimespec va_mtime; /* time of last modification */ + struct vtimespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device special file represents */ diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index f3d543dd9a98..8addcd166908 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -66,6 +66,32 @@ unsigned short coda_flags_to_cflags(unsigned short flags) return coda_flags; } +static struct timespec64 coda_to_timespec64(struct vtimespec ts) +{ + /* + * We interpret incoming timestamps as 'signed' to match traditional + * usage and support pre-1970 timestamps, but this breaks in y2038 + * on 32-bit machines. + */ + struct timespec64 ts64 = { + .tv_sec = ts.tv_sec, + .tv_nsec = ts.tv_nsec, + }; + + return ts64; +} + +static struct vtimespec timespec64_to_coda(struct timespec64 ts64) +{ + /* clamp the timestamps to the maximum range rather than wrapping */ + struct vtimespec ts = { + .tv_sec = lower_32_bits(clamp_t(time64_t, ts64.tv_sec, + LONG_MIN, LONG_MAX)), + .tv_nsec = ts64.tv_nsec, + }; + + return ts; +} /* utility functions below */ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) @@ -105,11 +131,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_size != -1) inode->i_blocks = (attr->va_size + 511) >> 9; if (attr->va_atime.tv_sec != -1) - inode->i_atime = timespec_to_timespec64(attr->va_atime); + inode->i_atime = coda_to_timespec64(attr->va_atime); if (attr->va_mtime.tv_sec != -1) - inode->i_mtime = timespec_to_timespec64(attr->va_mtime); + inode->i_mtime = coda_to_timespec64(attr->va_mtime); if (attr->va_ctime.tv_sec != -1) - inode->i_ctime = timespec_to_timespec64(attr->va_ctime); + inode->i_ctime = coda_to_timespec64(attr->va_ctime); } @@ -130,12 +156,12 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_uid = (vuid_t) -1; vattr->va_gid = (vgid_t) -1; vattr->va_size = (off_t) -1; - vattr->va_atime.tv_sec = (time_t) -1; - vattr->va_atime.tv_nsec = (time_t) -1; - vattr->va_mtime.tv_sec = (time_t) -1; - vattr->va_mtime.tv_nsec = (time_t) -1; - vattr->va_ctime.tv_sec = (time_t) -1; - vattr->va_ctime.tv_nsec = (time_t) -1; + vattr->va_atime.tv_sec = (long) -1; + vattr->va_atime.tv_nsec = (long) -1; + vattr->va_mtime.tv_sec = (long) -1; + vattr->va_mtime.tv_nsec = (long) -1; + vattr->va_ctime.tv_sec = (long) -1; + vattr->va_ctime.tv_nsec = (long) -1; vattr->va_type = C_VNON; vattr->va_fileid = -1; vattr->va_gen = -1; @@ -175,13 +201,13 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_size = iattr->ia_size; } if ( valid & ATTR_ATIME ) { - vattr->va_atime = timespec64_to_timespec(iattr->ia_atime); + vattr->va_atime = timespec64_to_coda(iattr->ia_atime); } if ( valid & ATTR_MTIME ) { - vattr->va_mtime = timespec64_to_timespec(iattr->ia_mtime); + vattr->va_mtime = timespec64_to_coda(iattr->ia_mtime); } if ( valid & ATTR_CTIME ) { - vattr->va_ctime = timespec64_to_timespec(iattr->ia_ctime); + vattr->va_ctime = timespec64_to_coda(iattr->ia_ctime); } } diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h index ed8cb263e482..fc5f7874208a 100644 --- a/include/uapi/linux/coda.h +++ b/include/uapi/linux/coda.h @@ -211,6 +211,20 @@ struct CodaFid { */ enum coda_vtype { C_VNON, C_VREG, C_VDIR, C_VBLK, C_VCHR, C_VLNK, C_VSOCK, C_VFIFO, C_VBAD }; +#ifdef __linux__ +/* + * This matches the traditional Linux 'timespec' structure binary layout, + * before using 64-bit time_t everywhere. Overflows in y2038 on 32-bit + * architectures. + */ +struct vtimespec { + long tv_sec; /* seconds */ + long tv_nsec; /* nanoseconds */ +}; +#else +#define vtimespec timespec +#endif + struct coda_vattr { long va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ @@ -220,9 +234,9 @@ struct coda_vattr { long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ + struct vtimespec va_atime; /* time of last access */ + struct vtimespec va_mtime; /* time of last modification */ + struct vtimespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ cdev_t va_rdev; /* device special file represents */ -- cgit v1.2.3-59-g8ed1b From 5e7c31dfe74703f428220384b2863525957cc160 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Tue, 16 Jul 2019 16:28:35 -0700 Subject: coda: change Coda's user api to use 64-bit time_t in timespec Move the 32-bit time_t problems to userspace. Link: http://lkml.kernel.org/r/8d089068823bfb292a4020f773922fbd82ffad39.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/coda.txt | 10 +++++----- fs/coda/coda_linux.c | 21 +++++++-------------- include/uapi/linux/coda.h | 33 +++++++-------------------------- 3 files changed, 19 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/coda.txt b/Documentation/filesystems/coda.txt index ea5969068895..545262c167c3 100644 --- a/Documentation/filesystems/coda.txt +++ b/Documentation/filesystems/coda.txt @@ -481,8 +481,8 @@ kernel support. - struct vtimespec { - long tv_sec; /* seconds */ + struct coda_timespec { + int64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; @@ -496,9 +496,9 @@ kernel support. long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct vtimespec va_atime; /* time of last access */ - struct vtimespec va_mtime; /* time of last modification */ - struct vtimespec va_ctime; /* time file changed */ + struct coda_timespec va_atime; /* time of last access */ + struct coda_timespec va_mtime; /* time of last modification */ + struct coda_timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device special file represents */ diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 8addcd166908..e4b5f02f0dd4 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -66,13 +66,8 @@ unsigned short coda_flags_to_cflags(unsigned short flags) return coda_flags; } -static struct timespec64 coda_to_timespec64(struct vtimespec ts) +static struct timespec64 coda_to_timespec64(struct coda_timespec ts) { - /* - * We interpret incoming timestamps as 'signed' to match traditional - * usage and support pre-1970 timestamps, but this breaks in y2038 - * on 32-bit machines. - */ struct timespec64 ts64 = { .tv_sec = ts.tv_sec, .tv_nsec = ts.tv_nsec, @@ -81,12 +76,10 @@ static struct timespec64 coda_to_timespec64(struct vtimespec ts) return ts64; } -static struct vtimespec timespec64_to_coda(struct timespec64 ts64) +static struct coda_timespec timespec64_to_coda(struct timespec64 ts64) { - /* clamp the timestamps to the maximum range rather than wrapping */ - struct vtimespec ts = { - .tv_sec = lower_32_bits(clamp_t(time64_t, ts64.tv_sec, - LONG_MIN, LONG_MAX)), + struct coda_timespec ts = { + .tv_sec = ts64.tv_sec, .tv_nsec = ts64.tv_nsec, }; @@ -156,11 +149,11 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) vattr->va_uid = (vuid_t) -1; vattr->va_gid = (vgid_t) -1; vattr->va_size = (off_t) -1; - vattr->va_atime.tv_sec = (long) -1; + vattr->va_atime.tv_sec = (int64_t) -1; vattr->va_atime.tv_nsec = (long) -1; - vattr->va_mtime.tv_sec = (long) -1; + vattr->va_mtime.tv_sec = (int64_t) -1; vattr->va_mtime.tv_nsec = (long) -1; - vattr->va_ctime.tv_sec = (long) -1; + vattr->va_ctime.tv_sec = (int64_t) -1; vattr->va_ctime.tv_nsec = (long) -1; vattr->va_type = C_VNON; vattr->va_fileid = -1; diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h index fc5f7874208a..5dba636b6e11 100644 --- a/include/uapi/linux/coda.h +++ b/include/uapi/linux/coda.h @@ -86,10 +86,6 @@ typedef unsigned long long u_quad_t; #define inline -struct timespec { - long ts_sec; - long ts_nsec; -}; #else /* DJGPP but not KERNEL */ #include typedef unsigned long long u_quad_t; @@ -110,13 +106,6 @@ typedef unsigned long long u_quad_t; #define cdev_t dev_t #endif -#ifdef __CYGWIN32__ -struct timespec { - time_t tv_sec; /* seconds */ - long tv_nsec; /* nanoseconds */ -}; -#endif - #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ typedef signed char int8_t; @@ -211,19 +200,10 @@ struct CodaFid { */ enum coda_vtype { C_VNON, C_VREG, C_VDIR, C_VBLK, C_VCHR, C_VLNK, C_VSOCK, C_VFIFO, C_VBAD }; -#ifdef __linux__ -/* - * This matches the traditional Linux 'timespec' structure binary layout, - * before using 64-bit time_t everywhere. Overflows in y2038 on 32-bit - * architectures. - */ -struct vtimespec { - long tv_sec; /* seconds */ +struct coda_timespec { + int64_t tv_sec; /* seconds */ long tv_nsec; /* nanoseconds */ }; -#else -#define vtimespec timespec -#endif struct coda_vattr { long va_type; /* vnode type (for create) */ @@ -234,9 +214,9 @@ struct coda_vattr { long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ - struct vtimespec va_atime; /* time of last access */ - struct vtimespec va_mtime; /* time of last modification */ - struct vtimespec va_ctime; /* time file changed */ + struct coda_timespec va_atime; /* time of last access */ + struct coda_timespec va_mtime; /* time of last modification */ + struct coda_timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ cdev_t va_rdev; /* device special file represents */ @@ -301,7 +281,8 @@ struct coda_statfs { #define CIOC_KERNEL_VERSION _IOWR('c', 10, size_t) -#define CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */ +// CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */ +#define CODA_KERNEL_VERSION 4 /* 64-bit timespec */ /* * Venus <-> Coda RPC arguments -- cgit v1.2.3-59-g8ed1b From 8fc8b9df831387e0d02c1d0f5bb53d327e0d477a Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Jul 2019 16:28:47 -0700 Subject: coda: move internal defs out of include/linux/ [ver #2] Move include/linux/coda_psdev.h to fs/coda/ as there's nothing else that uses it. Link: http://lkml.kernel.org/r/3ceeee0415a929b89fb02700b6b4b3a07938acb8.1558117389.git.jaharkes@cs.cmu.edu Link: https://patchwork.kernel.org/patch/10590257/ Signed-off-by: David Howells Signed-off-by: Jan Harkes Cc: Yann Droneaud Cc: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/cache.c | 2 +- fs/coda/cnode.c | 2 +- fs/coda/coda_linux.c | 2 +- fs/coda/coda_psdev.h | 89 ++++++++++++++++++++++++++++++++++++++++++++++ fs/coda/dir.c | 2 +- fs/coda/file.c | 3 +- fs/coda/inode.c | 2 +- fs/coda/pioctl.c | 3 +- fs/coda/psdev.c | 3 +- fs/coda/symlink.c | 3 +- fs/coda/upcall.c | 2 +- include/linux/coda_psdev.h | 89 ---------------------------------------------- 12 files changed, 99 insertions(+), 103 deletions(-) create mode 100644 fs/coda/coda_psdev.h delete mode 100644 include/linux/coda_psdev.h (limited to 'include') diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 201fc08a8b4f..3b8c4513118f 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -21,7 +21,7 @@ #include #include -#include +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_cache.h" diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 845b5a66952a..2e5badf67f98 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -8,8 +8,8 @@ #include #include -#include #include +#include "coda_psdev.h" #include "coda_linux.h" static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index e4b5f02f0dd4..2e1a5a192074 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -18,7 +18,7 @@ #include #include -#include +#include "coda_psdev.h" #include "coda_linux.h" /* initialize the debugging variables */ diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h new file mode 100644 index 000000000000..012e16f741a6 --- /dev/null +++ b/fs/coda/coda_psdev.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __CODA_PSDEV_H +#define __CODA_PSDEV_H + +#include +#include +#include + +struct kstatfs; + +/* messages between coda filesystem in kernel and Venus */ +struct upc_req { + struct list_head uc_chain; + caddr_t uc_data; + u_short uc_flags; + u_short uc_inSize; /* Size is at most 5000 bytes */ + u_short uc_outSize; + u_short uc_opcode; /* copied from data to save lookup */ + int uc_unique; + wait_queue_head_t uc_sleep; /* process' wait queue */ +}; + +#define CODA_REQ_ASYNC 0x1 +#define CODA_REQ_READ 0x2 +#define CODA_REQ_WRITE 0x4 +#define CODA_REQ_ABORT 0x8 + +/* communication pending/processing queues */ +struct venus_comm { + u_long vc_seq; + wait_queue_head_t vc_waitq; /* Venus wait queue */ + struct list_head vc_pending; + struct list_head vc_processing; + int vc_inuse; + struct super_block *vc_sb; + struct mutex vc_mutex; +}; + +static inline struct venus_comm *coda_vcp(struct super_block *sb) +{ + return (struct venus_comm *)((sb)->s_fs_info); +} + +/* upcalls */ +int venus_rootfid(struct super_block *sb, struct CodaFid *fidp); +int venus_getattr(struct super_block *sb, struct CodaFid *fid, + struct coda_vattr *attr); +int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *); +int venus_lookup(struct super_block *sb, struct CodaFid *fid, + const char *name, int length, int *type, + struct CodaFid *resfid); +int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, + kuid_t uid); +int venus_open(struct super_block *sb, struct CodaFid *fid, int flags, + struct file **f); +int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, + struct CodaFid *newfid, struct coda_vattr *attrs); +int venus_create(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length, int excl, int mode, + struct CodaFid *newfid, struct coda_vattr *attrs); +int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length); +int venus_remove(struct super_block *sb, struct CodaFid *dirfid, + const char *name, int length); +int venus_readlink(struct super_block *sb, struct CodaFid *fid, + char *buffer, int *length); +int venus_rename(struct super_block *sb, struct CodaFid *new_fid, + struct CodaFid *old_fid, size_t old_length, + size_t new_length, const char *old_name, + const char *new_name); +int venus_link(struct super_block *sb, struct CodaFid *fid, + struct CodaFid *dirfid, const char *name, int len ); +int venus_symlink(struct super_block *sb, struct CodaFid *fid, + const char *name, int len, const char *symname, int symlen); +int venus_access(struct super_block *sb, struct CodaFid *fid, int mask); +int venus_pioctl(struct super_block *sb, struct CodaFid *fid, + unsigned int cmd, struct PioctlData *data); +int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out, + size_t nbytes); +int venus_fsync(struct super_block *sb, struct CodaFid *fid); +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); + +/* + * Statistics + */ + +extern struct venus_comm coda_comms[]; +#endif diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 7e103eb8ffcd..716a0b932ec0 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -23,7 +23,7 @@ #include #include -#include +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_cache.h" diff --git a/fs/coda/file.c b/fs/coda/file.c index 43d371551d2b..a6b32c883a50 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -22,8 +22,7 @@ #include #include -#include - +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_int.h" diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 23f6ebd08e80..96d832ed23b5 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -27,7 +27,7 @@ #include #include -#include +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_cache.h" diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index e0c17b7dccce..644d48c12ce8 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -20,8 +20,7 @@ #include #include -#include - +#include "coda_psdev.h" #include "coda_linux.h" /* pioctl ops */ diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index e80bda1de6c5..0a61e949a430 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -38,8 +38,7 @@ #include #include -#include - +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_int.h" diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index 202297d156df..8907d0508198 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -17,8 +17,7 @@ #include #include -#include - +#include "coda_psdev.h" #include "coda_linux.h" static int coda_symlink_filler(struct file *file, struct page *page) diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 1e2f50722107..eb8cc30f2589 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -33,7 +33,7 @@ #include #include -#include +#include "coda_psdev.h" #include "coda_linux.h" #include "coda_cache.h" diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h deleted file mode 100644 index 9487f792770c..000000000000 --- a/include/linux/coda_psdev.h +++ /dev/null @@ -1,89 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __CODA_PSDEV_H -#define __CODA_PSDEV_H - -#include -#include -#include - -struct kstatfs; - -/* communication pending/processing queues */ -struct venus_comm { - u_long vc_seq; - wait_queue_head_t vc_waitq; /* Venus wait queue */ - struct list_head vc_pending; - struct list_head vc_processing; - int vc_inuse; - struct super_block *vc_sb; - struct mutex vc_mutex; -}; - -/* messages between coda filesystem in kernel and Venus */ -struct upc_req { - struct list_head uc_chain; - caddr_t uc_data; - u_short uc_flags; - u_short uc_inSize; /* Size is at most 5000 bytes */ - u_short uc_outSize; - u_short uc_opcode; /* copied from data to save lookup */ - int uc_unique; - wait_queue_head_t uc_sleep; /* process' wait queue */ -}; - -#define CODA_REQ_ASYNC 0x1 -#define CODA_REQ_READ 0x2 -#define CODA_REQ_WRITE 0x4 -#define CODA_REQ_ABORT 0x8 - -static inline struct venus_comm *coda_vcp(struct super_block *sb) -{ - return (struct venus_comm *)((sb)->s_fs_info); -} - -/* upcalls */ -int venus_rootfid(struct super_block *sb, struct CodaFid *fidp); -int venus_getattr(struct super_block *sb, struct CodaFid *fid, - struct coda_vattr *attr); -int venus_setattr(struct super_block *, struct CodaFid *, struct coda_vattr *); -int venus_lookup(struct super_block *sb, struct CodaFid *fid, - const char *name, int length, int *type, - struct CodaFid *resfid); -int venus_close(struct super_block *sb, struct CodaFid *fid, int flags, - kuid_t uid); -int venus_open(struct super_block *sb, struct CodaFid *fid, int flags, - struct file **f); -int venus_mkdir(struct super_block *sb, struct CodaFid *dirfid, - const char *name, int length, - struct CodaFid *newfid, struct coda_vattr *attrs); -int venus_create(struct super_block *sb, struct CodaFid *dirfid, - const char *name, int length, int excl, int mode, - struct CodaFid *newfid, struct coda_vattr *attrs) ; -int venus_rmdir(struct super_block *sb, struct CodaFid *dirfid, - const char *name, int length); -int venus_remove(struct super_block *sb, struct CodaFid *dirfid, - const char *name, int length); -int venus_readlink(struct super_block *sb, struct CodaFid *fid, - char *buffer, int *length); -int venus_rename(struct super_block *, struct CodaFid *new_fid, - struct CodaFid *old_fid, size_t old_length, - size_t new_length, const char *old_name, - const char *new_name); -int venus_link(struct super_block *sb, struct CodaFid *fid, - struct CodaFid *dirfid, const char *name, int len ); -int venus_symlink(struct super_block *sb, struct CodaFid *fid, - const char *name, int len, const char *symname, int symlen); -int venus_access(struct super_block *sb, struct CodaFid *fid, int mask); -int venus_pioctl(struct super_block *sb, struct CodaFid *fid, - unsigned int cmd, struct PioctlData *data); -int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out, - size_t nbytes); -int venus_fsync(struct super_block *sb, struct CodaFid *fid); -int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); - -/* - * Statistics - */ - -extern struct venus_comm coda_comms[]; -#endif -- cgit v1.2.3-59-g8ed1b From 6dc280ebeed2c96a2fb933103dafe655a922b9c1 Mon Sep 17 00:00:00 2001 From: Jan Harkes Date: Tue, 16 Jul 2019 16:28:51 -0700 Subject: coda: remove uapi/linux/coda_psdev.h Nothing is left in this header that is used by userspace. Link: http://lkml.kernel.org/r/bb11378cef94739f2cf89425dd6d302a52c64480.1558117389.git.jaharkes@cs.cmu.edu Signed-off-by: Jan Harkes Cc: Arnd Bergmann Cc: Colin Ian King Cc: Dan Carpenter Cc: David Howells Cc: Fabian Frederick Cc: Mikko Rapeli Cc: Sam Protsenko Cc: Yann Droneaud Cc: Zhouyang Jia Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/coda_psdev.h | 5 ++++- include/uapi/linux/coda_psdev.h | 10 ---------- 2 files changed, 4 insertions(+), 11 deletions(-) delete mode 100644 include/uapi/linux/coda_psdev.h (limited to 'include') diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h index 012e16f741a6..801423cbbdfc 100644 --- a/fs/coda/coda_psdev.h +++ b/fs/coda/coda_psdev.h @@ -3,8 +3,11 @@ #define __CODA_PSDEV_H #include +#include #include -#include + +#define CODA_PSDEV_MAJOR 67 +#define MAX_CODADEVS 5 /* how many do we allow */ struct kstatfs; diff --git a/include/uapi/linux/coda_psdev.h b/include/uapi/linux/coda_psdev.h deleted file mode 100644 index 3dacb7fad66a..000000000000 --- a/include/uapi/linux/coda_psdev.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI__CODA_PSDEV_H -#define _UAPI__CODA_PSDEV_H - -#include - -#define CODA_PSDEV_MAJOR 67 -#define MAX_CODADEVS 5 /* how many do we allow */ - -#endif /* _UAPI__CODA_PSDEV_H */ -- cgit v1.2.3-59-g8ed1b From a9fba24c6ac9b66c09dfc2a0e845ecace187e89c Mon Sep 17 00:00:00 2001 From: Pedro Cuadra Date: Tue, 16 Jul 2019 16:29:13 -0700 Subject: coda: add hinting support for partial file caching This adds support for partial file caching in Coda. Every read, write and mmap informs the userspace cache manager about what part of a file is about to be accessed so that the cache manager can ensure the relevant parts are available before the operation is allowed to proceed. When a read or write operation completes, this is also reported to allow the cache manager to track when partially cached content can be released. If the cache manager does not support partial file caching, or when the entire file has been fetched into the local cache, the cache manager may return an EOPNOTSUPP error to indicate that intent upcalls are no longer necessary until the file is closed. [akpm@linux-foundation.org: little whitespace fixup] Link: http://lkml.kernel.org/r/20190618181301.6960-1-jaharkes@cs.cmu.edu Signed-off-by: Pedro Cuadra Signed-off-by: Jan Harkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coda/coda_fs_i.h | 1 + fs/coda/coda_psdev.h | 3 ++ fs/coda/file.c | 61 +++++++++++++++++++++++++++++++++-------- fs/coda/psdev.c | 2 +- fs/coda/upcall.c | 70 +++++++++++++++++++++++++++++++++++++++-------- include/uapi/linux/coda.h | 29 ++++++++++++++++++-- 6 files changed, 139 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h index c99d574d1c43..1763ff95d865 100644 --- a/fs/coda/coda_fs_i.h +++ b/fs/coda/coda_fs_i.h @@ -40,6 +40,7 @@ struct coda_file_info { int cfi_magic; /* magic number */ struct file *cfi_container; /* container file for this cnode */ unsigned int cfi_mapcount; /* nr of times this file is mapped */ + bool cfi_access_intent; /* is access intent supported */ }; /* flags */ diff --git a/fs/coda/coda_psdev.h b/fs/coda/coda_psdev.h index 801423cbbdfc..52da08c770b0 100644 --- a/fs/coda/coda_psdev.h +++ b/fs/coda/coda_psdev.h @@ -83,6 +83,9 @@ int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out, size_t nbytes); int venus_fsync(struct super_block *sb, struct CodaFid *fid); int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); +int venus_access_intent(struct super_block *sb, struct CodaFid *fid, + bool *access_intent_supported, + size_t count, loff_t ppos, int type); /* * Statistics diff --git a/fs/coda/file.c b/fs/coda/file.c index 0dbd13ab72e3..128d63df5bfb 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "coda_psdev.h" @@ -37,9 +38,25 @@ static ssize_t coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *coda_file = iocb->ki_filp; + struct inode *coda_inode = file_inode(coda_file); struct coda_file_info *cfi = coda_ftoc(coda_file); + loff_t ki_pos = iocb->ki_pos; + size_t count = iov_iter_count(to); + ssize_t ret; + + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_READ); + if (ret) + goto finish_read; - return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); + ret = vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos, 0); + +finish_read: + venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_READ_FINISH); + return ret; } static ssize_t @@ -48,10 +65,17 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) struct file *coda_file = iocb->ki_filp; struct inode *coda_inode = file_inode(coda_file); struct coda_file_info *cfi = coda_ftoc(coda_file); - struct file *host_file; + struct file *host_file = cfi->cfi_container; + loff_t ki_pos = iocb->ki_pos; + size_t count = iov_iter_count(to); ssize_t ret; - host_file = cfi->cfi_container; + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_WRITE); + if (ret) + goto finish_write; + file_start_write(host_file); inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); @@ -60,6 +84,11 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); inode_unlock(coda_inode); file_end_write(host_file); + +finish_write: + venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ki_pos, CODA_ACCESS_TYPE_WRITE_FINISH); return ret; } @@ -94,29 +123,35 @@ coda_vm_close(struct vm_area_struct *vma) static int coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) { - struct coda_file_info *cfi; + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi = coda_ftoc(coda_file); + struct file *host_file = cfi->cfi_container; + struct inode *host_inode = file_inode(host_file); struct coda_inode_info *cii; - struct file *host_file; - struct inode *coda_inode, *host_inode; struct coda_vm_ops *cvm_ops; + loff_t ppos; + size_t count; int ret; - cfi = coda_ftoc(coda_file); - host_file = cfi->cfi_container; - if (!host_file->f_op->mmap) return -ENODEV; if (WARN_ON(coda_file != vma->vm_file)) return -EIO; + count = vma->vm_end - vma->vm_start; + ppos = vma->vm_pgoff * PAGE_SIZE; + + ret = venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode), + &cfi->cfi_access_intent, + count, ppos, CODA_ACCESS_TYPE_MMAP); + if (ret) + return ret; + cvm_ops = kmalloc(sizeof(struct coda_vm_ops), GFP_KERNEL); if (!cvm_ops) return -ENOMEM; - coda_inode = file_inode(coda_file); - host_inode = file_inode(host_file); - cii = ITOC(coda_inode); spin_lock(&cii->c_lock); coda_file->f_mapping = host_file->f_mapping; @@ -188,6 +223,8 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) cfi->cfi_magic = CODA_MAGIC; cfi->cfi_mapcount = 0; cfi->cfi_container = host_file; + /* assume access intents are supported unless we hear otherwise */ + cfi->cfi_access_intent = true; BUG_ON(coda_file->private_data != NULL); coda_file->private_data = cfi; diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index ebfbbea9fa48..240669f51eac 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -388,7 +388,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam"); MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); MODULE_LICENSE("GPL"); -MODULE_VERSION("6.11"); +MODULE_VERSION("7.0"); static int __init init_coda(void) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 15c0e4fdb0e3..eb3b1898da46 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -569,6 +569,47 @@ int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) return error; } +int venus_access_intent(struct super_block *sb, struct CodaFid *fid, + bool *access_intent_supported, + size_t count, loff_t ppos, int type) +{ + union inputArgs *inp; + union outputArgs *outp; + int insize, outsize, error; + bool finalizer = + type == CODA_ACCESS_TYPE_READ_FINISH || + type == CODA_ACCESS_TYPE_WRITE_FINISH; + + if (!*access_intent_supported && !finalizer) + return 0; + + insize = SIZE(access_intent); + UPARG(CODA_ACCESS_INTENT); + + inp->coda_access_intent.VFid = *fid; + inp->coda_access_intent.count = count; + inp->coda_access_intent.pos = ppos; + inp->coda_access_intent.type = type; + + error = coda_upcall(coda_vcp(sb), insize, + finalizer ? NULL : &outsize, inp); + + /* + * we have to free the request buffer for synchronous upcalls + * or when asynchronous upcalls fail, but not when asynchronous + * upcalls succeed + */ + if (!finalizer || error) + kvfree(inp); + + /* Chunked access is not supported or an old Coda client */ + if (error == -EOPNOTSUPP) { + *access_intent_supported = false; + error = 0; + } + return error; +} + /* * coda_upcall and coda_downcall routines. */ @@ -598,10 +639,12 @@ static void coda_unblock_signals(sigset_t *old) * has seen them, * - CODA_CLOSE or CODA_RELEASE upcall (to avoid reference count problems) * - CODA_STORE (to avoid data loss) + * - CODA_ACCESS_INTENT (to avoid reference count problems) */ #define CODA_INTERRUPTIBLE(r) (!coda_hard && \ (((r)->uc_opcode != CODA_CLOSE && \ (r)->uc_opcode != CODA_STORE && \ + (r)->uc_opcode != CODA_ACCESS_INTENT && \ (r)->uc_opcode != CODA_RELEASE) || \ (r)->uc_flags & CODA_REQ_READ)) @@ -687,21 +730,25 @@ static int coda_upcall(struct venus_comm *vcp, goto exit; } + buffer->ih.unique = ++vcp->vc_seq; + req->uc_data = (void *)buffer; - req->uc_flags = 0; + req->uc_flags = outSize ? 0 : CODA_REQ_ASYNC; req->uc_inSize = inSize; - req->uc_outSize = *outSize ? *outSize : inSize; - req->uc_opcode = ((union inputArgs *)buffer)->ih.opcode; - req->uc_unique = ++vcp->vc_seq; + req->uc_outSize = (outSize && *outSize) ? *outSize : inSize; + req->uc_opcode = buffer->ih.opcode; + req->uc_unique = buffer->ih.unique; init_waitqueue_head(&req->uc_sleep); - /* Fill in the common input args. */ - ((union inputArgs *)buffer)->ih.unique = req->uc_unique; - /* Append msg to pending queue and poke Venus. */ list_add_tail(&req->uc_chain, &vcp->vc_pending); - wake_up_interruptible(&vcp->vc_waitq); + + if (req->uc_flags & CODA_REQ_ASYNC) { + mutex_unlock(&vcp->vc_mutex); + return 0; + } + /* We can be interrupted while we wait for Venus to process * our request. If the interrupt occurs before Venus has read * the request, we dequeue and return. If it occurs after the @@ -743,20 +790,20 @@ static int coda_upcall(struct venus_comm *vcp, sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL); if (!sig_req) goto exit; - sig_req->uc_data = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL); - if (!sig_req->uc_data) { + sig_inputArgs = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL); + if (!sig_inputArgs) { kfree(sig_req); goto exit; } error = -EINTR; - sig_inputArgs = (union inputArgs *)sig_req->uc_data; sig_inputArgs->ih.opcode = CODA_SIGNAL; sig_inputArgs->ih.unique = req->uc_unique; sig_req->uc_flags = CODA_REQ_ASYNC; sig_req->uc_opcode = sig_inputArgs->ih.opcode; sig_req->uc_unique = sig_inputArgs->ih.unique; + sig_req->uc_data = (void *)sig_inputArgs; sig_req->uc_inSize = sizeof(struct coda_in_hdr); sig_req->uc_outSize = sizeof(struct coda_in_hdr); @@ -911,4 +958,3 @@ unlock_out: iput(inode); return 0; } - diff --git a/include/uapi/linux/coda.h b/include/uapi/linux/coda.h index 5dba636b6e11..aa34c2dcae8d 100644 --- a/include/uapi/linux/coda.h +++ b/include/uapi/linux/coda.h @@ -271,7 +271,8 @@ struct coda_statfs { #define CODA_STATFS 34 #define CODA_STORE 35 #define CODA_RELEASE 36 -#define CODA_NCALLS 37 +#define CODA_ACCESS_INTENT 37 +#define CODA_NCALLS 38 #define DOWNCALL(opcode) (opcode >= CODA_REPLACE && opcode <= CODA_PURGEFID) @@ -281,8 +282,12 @@ struct coda_statfs { #define CIOC_KERNEL_VERSION _IOWR('c', 10, size_t) +// CODA_KERNEL_VERSION 0 /* don't care about kernel version number */ +// CODA_KERNEL_VERSION 1 /* The old venus 4.6 compatible interface */ +// CODA_KERNEL_VERSION 2 /* venus_lookup gets an extra parameter */ // CODA_KERNEL_VERSION 3 /* 128-bit file identifiers */ -#define CODA_KERNEL_VERSION 4 /* 64-bit timespec */ +// CODA_KERNEL_VERSION 4 /* 64-bit timespec */ +#define CODA_KERNEL_VERSION 5 /* access intent support */ /* * Venus <-> Coda RPC arguments @@ -637,6 +642,25 @@ struct coda_statfs_out { struct coda_statfs stat; }; +#define CODA_ACCESS_TYPE_READ 1 +#define CODA_ACCESS_TYPE_WRITE 2 +#define CODA_ACCESS_TYPE_MMAP 3 +#define CODA_ACCESS_TYPE_READ_FINISH 4 +#define CODA_ACCESS_TYPE_WRITE_FINISH 5 + +/* coda_access_intent: NO_OUT */ +struct coda_access_intent_in { + struct coda_in_hdr ih; + struct CodaFid VFid; + int count; + int pos; + int type; +}; + +struct coda_access_intent_out { + struct coda_out_hdr out; +}; + /* * Occasionally, we don't cache the fid returned by CODA_LOOKUP. * For instance, if the fid is inconsistent. @@ -668,6 +692,7 @@ union inputArgs { struct coda_open_by_fd_in coda_open_by_fd; struct coda_open_by_path_in coda_open_by_path; struct coda_statfs_in coda_statfs; + struct coda_access_intent_in coda_access_intent; }; union outputArgs { -- cgit v1.2.3-59-g8ed1b From 201766a20e30f982ccfe36bebfad9602c3ff574a Mon Sep 17 00:00:00 2001 From: Elvira Khabirova Date: Tue, 16 Jul 2019 16:29:42 -0700 Subject: ptrace: add PTRACE_GET_SYSCALL_INFO request PTRACE_GET_SYSCALL_INFO is a generic ptrace API that lets ptracer obtain details of the syscall the tracee is blocked in. There are two reasons for a special syscall-related ptrace request. Firstly, with the current ptrace API there are cases when ptracer cannot retrieve necessary information about syscalls. Some examples include: * The notorious int-0x80-from-64-bit-task issue. See [1] for details. In short, if a 64-bit task performs a syscall through int 0x80, its tracer has no reliable means to find out that the syscall was, in fact, a compat syscall, and misidentifies it. * Syscall-enter-stop and syscall-exit-stop look the same for the tracer. Common practice is to keep track of the sequence of ptrace-stops in order not to mix the two syscall-stops up. But it is not as simple as it looks; for example, strace had a (just recently fixed) long-standing bug where attaching strace to a tracee that is performing the execve system call led to the tracer identifying the following syscall-exit-stop as syscall-enter-stop, which messed up all the state tracking. * Since the introduction of commit 84d77d3f06e7 ("ptrace: Don't allow accessing an undumpable mm"), both PTRACE_PEEKDATA and process_vm_readv become unavailable when the process dumpable flag is cleared. On such architectures as ia64 this results in all syscall arguments being unavailable for the tracer. Secondly, ptracers also have to support a lot of arch-specific code for obtaining information about the tracee. For some architectures, this requires a ptrace(PTRACE_PEEKUSER, ...) invocation for every syscall argument and return value. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_GET_SYSCALL_INFO Retrieve information about the syscall that caused the stop. The information is placed into the buffer pointed by "data" argument, which should be a pointer to a buffer of type "struct ptrace_syscall_info". The "addr" argument contains the size of the buffer pointed to by "data" argument (i.e., sizeof(struct ptrace_syscall_info)). The return value contains the number of bytes available to be written by the kernel. If the size of data to be written by the kernel exceeds the size specified by "addr" argument, the output is truncated. [ldv@altlinux.org: selftests/seccomp/seccomp_bpf: update for PTRACE_GET_SYSCALL_INFO] Link: http://lkml.kernel.org/r/20190708182904.GA12332@altlinux.org Link: http://lkml.kernel.org/r/20190510152842.GF28558@altlinux.org Signed-off-by: Elvira Khabirova Co-developed-by: Dmitry V. Levin Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Reviewed-by: Kees Cook Reviewed-by: Andy Lutomirski Cc: Eugene Syromyatnikov Cc: Benjamin Herrenschmidt Cc: Greentime Hu Cc: Helge Deller [parisc] Cc: James E.J. Bottomley Cc: James Hogan Cc: kbuild test robot Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Ralf Baechle Cc: Richard Kuo Cc: Shuah Khan Cc: Vincent Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 9 ++- include/uapi/linux/ptrace.h | 35 +++++++++ kernel/ptrace.c | 101 +++++++++++++++++++++++++- tools/testing/selftests/seccomp/seccomp_bpf.c | 13 +++- 4 files changed, 150 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 8446573cc682..36fb3bbed6b2 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -54,13 +54,15 @@ struct linux_binprm; /* * ptrace report for syscall entry and exit looks identical. */ -static inline int ptrace_report_syscall(struct pt_regs *regs) +static inline int ptrace_report_syscall(struct pt_regs *regs, + unsigned long message) { int ptrace = current->ptrace; if (!(ptrace & PT_PTRACED)) return 0; + current->ptrace_message = message; ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); /* @@ -73,6 +75,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) current->exit_code = 0; } + current->ptrace_message = 0; return fatal_signal_pending(current); } @@ -98,7 +101,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs) static inline __must_check int tracehook_report_syscall_entry( struct pt_regs *regs) { - return ptrace_report_syscall(regs); + return ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_ENTRY); } /** @@ -123,7 +126,7 @@ static inline void tracehook_report_syscall_exit(struct pt_regs *regs, int step) if (step) user_single_step_report(regs); else - ptrace_report_syscall(regs); + ptrace_report_syscall(regs, PTRACE_EVENTMSG_SYSCALL_EXIT); } /** diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index d5a1b8a492b9..a71b6e3b03eb 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -73,6 +73,41 @@ struct seccomp_metadata { __u64 flags; /* Output: filter's flags */ }; +#define PTRACE_GET_SYSCALL_INFO 0x420e +#define PTRACE_SYSCALL_INFO_NONE 0 +#define PTRACE_SYSCALL_INFO_ENTRY 1 +#define PTRACE_SYSCALL_INFO_EXIT 2 +#define PTRACE_SYSCALL_INFO_SECCOMP 3 + +struct ptrace_syscall_info { + __u8 op; /* PTRACE_SYSCALL_INFO_* */ + __u32 arch __attribute__((__aligned__(sizeof(__u32)))); + __u64 instruction_pointer; + __u64 stack_pointer; + union { + struct { + __u64 nr; + __u64 args[6]; + } entry; + struct { + __s64 rval; + __u8 is_error; + } exit; + struct { + __u64 nr; + __u64 args[6]; + __u32 ret_data; + } seccomp; + }; +}; + +/* + * These values are stored in task->ptrace_message + * by tracehook_report_syscall_* to describe the current syscall-stop. + */ +#define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 +#define PTRACE_EVENTMSG_SYSCALL_EXIT 2 + /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 83a531cea2f3..cb9ddcc08119 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -32,6 +32,8 @@ #include #include +#include /* for syscall_get_* */ + /* * Access another process' address space via ptrace. * Source/target buffer must be kernel space, @@ -897,7 +899,100 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type, * to ensure no machine forgets it. */ EXPORT_SYMBOL_GPL(task_user_regset_view); -#endif + +static unsigned long +ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int i; + + info->op = PTRACE_SYSCALL_INFO_ENTRY; + info->entry.nr = syscall_get_nr(child, regs); + syscall_get_arguments(child, regs, args); + for (i = 0; i < ARRAY_SIZE(args); i++) + info->entry.args[i] = args[i]; + + /* args is the last field in struct ptrace_syscall_info.entry */ + return offsetofend(struct ptrace_syscall_info, entry.args); +} + +static unsigned long +ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * As struct ptrace_syscall_info.entry is currently a subset + * of struct ptrace_syscall_info.seccomp, it makes sense to + * initialize that subset using ptrace_get_syscall_info_entry(). + * This can be reconsidered in the future if these structures + * diverge significantly enough. + */ + ptrace_get_syscall_info_entry(child, regs, info); + info->op = PTRACE_SYSCALL_INFO_SECCOMP; + info->seccomp.ret_data = child->ptrace_message; + + /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); +} + +static unsigned long +ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + info->op = PTRACE_SYSCALL_INFO_EXIT; + info->exit.rval = syscall_get_error(child, regs); + info->exit.is_error = !!info->exit.rval; + if (!info->exit.is_error) + info->exit.rval = syscall_get_return_value(child, regs); + + /* is_error is the last field in struct ptrace_syscall_info.exit */ + return offsetofend(struct ptrace_syscall_info, exit.is_error); +} + +static int +ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = PTRACE_SYSCALL_INFO_NONE, + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + /* + * This does not need lock_task_sighand() to access + * child->last_siginfo because ptrace_freeze_traced() + * called earlier by ptrace_check_attach() ensures that + * the tracee cannot go away and clear its last_siginfo. + */ + switch (child->last_siginfo ? child->last_siginfo->si_code : 0) { + case SIGTRAP | 0x80: + switch (child->ptrace_message) { + case PTRACE_EVENTMSG_SYSCALL_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, + &info); + break; + case PTRACE_EVENTMSG_SYSCALL_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, + &info); + break; + } + break; + case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): + actual_size = ptrace_get_syscall_info_seccomp(child, regs, + &info); + break; + } + + write_size = min(actual_size, user_size); + return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; +} +#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data) @@ -1114,6 +1209,10 @@ int ptrace_request(struct task_struct *child, long request, ret = __put_user(kiov.iov_len, &uiov->iov_len); break; } + + case PTRACE_GET_SYSCALL_INFO: + ret = ptrace_get_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index dc66fe852768..6ef7f16c4cf5 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1775,13 +1775,18 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee, unsigned long msg; static bool entry; - /* Make sure we got an empty message. */ + /* + * The traditional way to tell PTRACE_SYSCALL entry/exit + * is by counting. + */ + entry = !entry; + + /* Make sure we got an appropriate message. */ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg); EXPECT_EQ(0, ret); - EXPECT_EQ(0, msg); + EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY + : PTRACE_EVENTMSG_SYSCALL_EXIT, msg); - /* The only way to tell PTRACE_SYSCALL entry/exit is by counting. */ - entry = !entry; if (!entry) return; -- cgit v1.2.3-59-g8ed1b From e2d9018e81ba9357d3bb8bddc0ee58d460d092fe Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 16 Jul 2019 16:29:50 -0700 Subject: signal: reorder struct sighand_struct struct sighand_struct::siglock field is the most used field by far, put it first so that is can be accessed without IMM8 or IMM32 encoding on x86_64. Space savings (on trimmed down VM test config): add/remove: 0/0 grow/shrink: 8/68 up/down: 49/-1147 (-1098) Function old new delta complete_signal 512 533 +21 do_signalfd4 335 346 +11 __cleanup_sighand 39 43 +4 unhandled_signal 49 52 +3 prepare_signal 692 695 +3 ignore_signals 37 40 +3 __tty_check_change.part 248 251 +3 ksys_unshare 780 781 +1 sighand_ctor 33 29 -4 ptrace_trap_notify 60 56 -4 sigqueue_free 98 91 -7 run_posix_cpu_timers 1389 1382 -7 proc_pid_status 2448 2441 -7 proc_pid_limits 344 337 -7 posix_cpu_timer_rearm 222 215 -7 posix_cpu_timer_get 249 242 -7 kill_pid_info_as_cred 243 236 -7 freeze_task 197 190 -7 flush_old_exec 1873 1866 -7 do_task_stat 3363 3356 -7 do_send_sig_info 98 91 -7 do_group_exit 147 140 -7 init_sighand 2088 2080 -8 do_notify_parent_cldstop 399 391 -8 signalfd_cleanup 50 41 -9 do_notify_parent 557 545 -12 __send_signal 1029 1017 -12 ptrace_stop 590 577 -13 get_signal 1576 1563 -13 __lock_task_sighand 112 99 -13 zap_pid_ns_processes 391 377 -14 update_rlimit_cpu 78 64 -14 tty_signal_session_leader 413 399 -14 tty_open_proc_set_tty 149 135 -14 tty_jobctrl_ioctl 936 922 -14 set_cpu_itimer 339 325 -14 ptrace_resume 226 212 -14 ptrace_notify 110 96 -14 proc_clear_tty 81 67 -14 posix_cpu_timer_del 229 215 -14 kernel_sigaction 156 142 -14 getrusage 977 963 -14 get_current_tty 98 84 -14 force_sigsegv 89 75 -14 force_sig_info 205 191 -14 flush_signals 83 69 -14 flush_itimer_signals 85 71 -14 do_timer_create 1120 1106 -14 do_sigpending 88 74 -14 do_signal_stop 537 523 -14 cgroup_init_fs_context 644 630 -14 call_usermodehelper_exec_async 402 388 -14 calculate_sigpending 58 44 -14 __x64_sys_timer_delete 248 234 -14 __set_current_blocked 80 66 -14 __ptrace_unlink 310 296 -14 __ptrace_detach.part 187 173 -14 send_sigqueue 362 347 -15 get_cpu_itimer 214 199 -15 signalfd_poll 175 159 -16 dequeue_signal 340 323 -17 do_getitimer 192 174 -18 release_task.part 1060 1040 -20 ptrace_peek_siginfo 408 387 -21 posix_cpu_timer_set 827 806 -21 exit_signals 437 416 -21 do_sigaction 541 520 -21 do_setitimer 485 464 -21 disassociate_ctty.part 545 517 -28 __x64_sys_rt_sigtimedwait 721 679 -42 __x64_sys_ptrace 1319 1277 -42 ptrace_request 1828 1782 -46 signalfd_read 507 459 -48 wait_consider_task 2027 1971 -56 do_coredump 3672 3616 -56 copy_process.part 6936 6871 -65 Link: http://lkml.kernel.org/r/20190503192800.GA18004@avx2 Signed-off-by: Alexey Dobriyan Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/signal.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 532458698bde..01add55a609b 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -15,10 +15,10 @@ */ struct sighand_struct { - refcount_t count; - struct k_sigaction action[_NSIG]; spinlock_t siglock; + refcount_t count; wait_queue_head_t signalfd_wqh; + struct k_sigaction action[_NSIG]; }; /* -- cgit v1.2.3-59-g8ed1b From b772434be0891ed1081a08ae7cfd4666728f8e82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 16 Jul 2019 16:29:53 -0700 Subject: signal: simplify set_user_sigmask/restore_user_sigmask task->saved_sigmask and ->restore_sigmask are only used in the ret-from- syscall paths. This means that set_user_sigmask() can save ->blocked in ->saved_sigmask and do set_restore_sigmask() to indicate that ->blocked was modified. This way the callers do not need 2 sigset_t's passed to set/restore and restore_user_sigmask() renamed to restore_saved_sigmask_unless() turns into the trivial helper which just calls restore_saved_sigmask(). Link: http://lkml.kernel.org/r/20190606113206.GA9464@redhat.com Signed-off-by: Oleg Nesterov Cc: Deepa Dinamani Cc: Arnd Bergmann Cc: Jens Axboe Cc: Davidlohr Bueso Cc: Eric Wong Cc: Jason Baron Cc: Thomas Gleixner Cc: Al Viro Cc: Eric W. Biederman Cc: David Laight Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 20 +++++-------- fs/eventpoll.c | 12 +++----- fs/io_uring.c | 11 ++----- fs/select.c | 34 ++++++++-------------- include/linux/compat.h | 3 +- include/linux/sched/signal.h | 12 ++++++-- include/linux/signal.h | 4 --- kernel/signal.c | 69 ++++++++++++-------------------------------- 8 files changed, 57 insertions(+), 108 deletions(-) (limited to 'include') diff --git a/fs/aio.c b/fs/aio.c index 2d405733a8c6..8327db0c8e08 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2094,7 +2094,6 @@ SYSCALL_DEFINE6(io_pgetevents, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; - sigset_t ksigmask, sigsaved; struct timespec64 ts; bool interrupted; int ret; @@ -2105,14 +2104,14 @@ SYSCALL_DEFINE6(io_pgetevents, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); - restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted); + restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; @@ -2130,7 +2129,6 @@ SYSCALL_DEFINE6(io_pgetevents_time32, const struct __aio_sigset __user *, usig) { struct __aio_sigset ksig = { NULL, }; - sigset_t ksigmask, sigsaved; struct timespec64 ts; bool interrupted; int ret; @@ -2142,14 +2140,14 @@ SYSCALL_DEFINE6(io_pgetevents_time32, return -EFAULT; - ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); interrupted = signal_pending(current); - restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted); + restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; @@ -2198,7 +2196,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { NULL, }; - sigset_t ksigmask, sigsaved; struct timespec64 t; bool interrupted; int ret; @@ -2209,14 +2206,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); - restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted); + restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; @@ -2234,7 +2231,6 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, const struct __compat_aio_sigset __user *, usig) { struct __compat_aio_sigset ksig = { NULL, }; - sigset_t ksigmask, sigsaved; struct timespec64 t; bool interrupted; int ret; @@ -2245,14 +2241,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64, if (usig && copy_from_user(&ksig, usig, sizeof(ksig))) return -EFAULT; - ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize); + ret = set_compat_user_sigmask(ksig.sigmask, ksig.sigsetsize); if (ret) return ret; ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); interrupted = signal_pending(current); - restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted); + restore_saved_sigmask_unless(interrupted); if (interrupted && !ret) ret = -ERESTARTNOHAND; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4c74c768ae43..0f9c073d78d5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2313,19 +2313,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, size_t, sigsetsize) { int error; - sigset_t ksigmask, sigsaved; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, timeout); - - restore_user_sigmask(sigmask, &sigsaved, error == -EINTR); + restore_saved_sigmask_unless(error == -EINTR); return error; } @@ -2338,19 +2336,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, compat_size_t, sigsetsize) { long err; - sigset_t ksigmask, sigsaved; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ - err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); - - restore_user_sigmask(sigmask, &sigsaved, err == -EINTR); + restore_saved_sigmask_unless(err == -EINTR); return err; } diff --git a/fs/io_uring.c b/fs/io_uring.c index d682049c07b2..e2a66e12fbc6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2400,7 +2400,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { struct io_cq_ring *ring = ctx->cq_ring; - sigset_t ksigmask, sigsaved; int ret; if (io_cqring_events(ring) >= min_events) @@ -2410,21 +2409,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, #ifdef CONFIG_COMPAT if (in_compat_syscall()) ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig, - &ksigmask, &sigsaved, sigsz); + sigsz); else #endif - ret = set_user_sigmask(sig, &ksigmask, - &sigsaved, sigsz); + ret = set_user_sigmask(sig, sigsz); if (ret) return ret; } ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events); - - if (sig) - restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS); - + restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; diff --git a/fs/select.c b/fs/select.c index a4d8f6e8b63c..1fc1b247fede 100644 --- a/fs/select.c +++ b/fs/select.c @@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize, enum poll_time_type type) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -753,12 +752,12 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, return -EINVAL; } - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = core_sys_select(n, inp, outp, exp, to); - restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND); + restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); ret = poll_select_copy_remaining(&end_time, tsp, type, ret); return ret; @@ -1086,7 +1085,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1099,17 +1097,16 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return -EINVAL; } - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); + restore_saved_sigmask_unless(ret == -EINTR); /* We can restart this syscall, usually */ if (ret == -EINTR) ret = -ERESTARTNOHAND; - ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret); return ret; @@ -1121,7 +1118,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask, size_t, sigsetsize) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1134,17 +1130,16 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, return -EINVAL; } - ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); + restore_saved_sigmask_unless(ret == -EINTR); /* We can restart this syscall, usually */ if (ret == -EINTR) ret = -ERESTARTNOHAND; - ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret); return ret; @@ -1319,7 +1314,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, void __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize, enum poll_time_type type) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1342,12 +1336,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, return -EINVAL; } - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = compat_core_sys_select(n, inp, outp, exp, to); - restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND); + restore_saved_sigmask_unless(ret == -ERESTARTNOHAND); ret = poll_select_copy_remaining(&end_time, tsp, type, ret); return ret; @@ -1402,7 +1396,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds, struct old_timespec32 __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1415,17 +1408,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, return -EINVAL; } - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); + restore_saved_sigmask_unless(ret == -EINTR); /* We can restart this syscall, usually */ if (ret == -EINTR) ret = -ERESTARTNOHAND; - ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret); return ret; @@ -1437,7 +1429,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, unsigned int, nfds, struct __kernel_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - sigset_t ksigmask, sigsaved; struct timespec64 ts, end_time, *to = NULL; int ret; @@ -1450,17 +1441,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds, return -EINVAL; } - ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize); + ret = set_compat_user_sigmask(sigmask, sigsetsize); if (ret) return ret; ret = do_sys_poll(ufds, nfds, to); - restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR); + restore_saved_sigmask_unless(ret == -EINTR); /* We can restart this syscall, usually */ if (ret == -EINTR) ret = -ERESTARTNOHAND; - ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret); return ret; diff --git a/include/linux/compat.h b/include/linux/compat.h index ebddcb6cfcf8..16dafd9f4b86 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -138,8 +138,7 @@ typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; -int set_compat_user_sigmask(const compat_sigset_t __user *usigmask, - sigset_t *set, sigset_t *oldset, +int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize); struct compat_sigaction { diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 01add55a609b..efd8ce7675ed 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -420,7 +420,6 @@ void task_join_group_stop(struct task_struct *task); static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) @@ -451,7 +450,6 @@ static inline bool test_and_clear_restore_sigmask(void) static inline void set_restore_sigmask(void) { current->restore_sigmask = true; - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { @@ -484,6 +482,16 @@ static inline void restore_saved_sigmask(void) __set_current_blocked(¤t->saved_sigmask); } +extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); + +static inline void restore_saved_sigmask_unless(bool interrupted) +{ + if (interrupted) + WARN_ON(!test_thread_flag(TIF_SIGPENDING)); + else + restore_saved_sigmask(); +} + static inline sigset_t *sigmask_to_save(void) { sigset_t *res = ¤t->blocked; diff --git a/include/linux/signal.h b/include/linux/signal.h index 78c2bb376954..b5d99482d3fe 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -273,10 +273,6 @@ extern int group_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p, enum pid_type type); extern int __group_send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern int sigprocmask(int, sigset_t *, sigset_t *); -extern int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, - sigset_t *oldset, size_t sigsetsize); -extern void restore_user_sigmask(const void __user *usigmask, - sigset_t *sigsaved, bool interrupted); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; diff --git a/kernel/signal.c b/kernel/signal.c index dabe100d2091..91b789dd6e72 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2951,80 +2951,49 @@ EXPORT_SYMBOL(sigprocmask); * * This is useful for syscalls such as ppoll, pselect, io_pgetevents and * epoll_pwait where a new sigmask is passed from userland for the syscalls. + * + * Note that it does set_restore_sigmask() in advance, so it must be always + * paired with restore_saved_sigmask_unless() before return from syscall. */ -int set_user_sigmask(const sigset_t __user *usigmask, sigset_t *set, - sigset_t *oldset, size_t sigsetsize) +int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize) { - if (!usigmask) - return 0; + sigset_t kmask; + if (!umask) + return 0; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; - if (copy_from_user(set, usigmask, sizeof(sigset_t))) + if (copy_from_user(&kmask, umask, sizeof(sigset_t))) return -EFAULT; - *oldset = current->blocked; - set_current_blocked(set); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&kmask); return 0; } -EXPORT_SYMBOL(set_user_sigmask); #ifdef CONFIG_COMPAT -int set_compat_user_sigmask(const compat_sigset_t __user *usigmask, - sigset_t *set, sigset_t *oldset, +int set_compat_user_sigmask(const compat_sigset_t __user *umask, size_t sigsetsize) { - if (!usigmask) - return 0; + sigset_t kmask; + if (!umask) + return 0; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (get_compat_sigset(set, usigmask)) + if (get_compat_sigset(&kmask, umask)) return -EFAULT; - *oldset = current->blocked; - set_current_blocked(set); + set_restore_sigmask(); + current->saved_sigmask = current->blocked; + set_current_blocked(&kmask); return 0; } -EXPORT_SYMBOL(set_compat_user_sigmask); #endif -/* - * restore_user_sigmask: - * usigmask: sigmask passed in from userland. - * sigsaved: saved sigmask when the syscall started and changed the sigmask to - * usigmask. - * - * This is useful for syscalls such as ppoll, pselect, io_pgetevents and - * epoll_pwait where a new sigmask is passed in from userland for the syscalls. - */ -void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, - bool interrupted) -{ - - if (!usigmask) - return; - /* - * When signals are pending, do not restore them here. - * Restoring sigmask here can lead to delivering signals that the above - * syscalls are intended to block because of the sigmask passed in. - */ - if (interrupted) { - current->saved_sigmask = *sigsaved; - set_restore_sigmask(); - return; - } - - /* - * This is needed because the fast syscall return path does not restore - * saved_sigmask when signals are not pending. - */ - set_current_blocked(sigsaved); -} -EXPORT_SYMBOL(restore_user_sigmask); - /** * sys_rt_sigprocmask - change the list of currently blocked signals * @how: whether to add, remove, or set signals -- cgit v1.2.3-59-g8ed1b From f57e515a1b56325a28a0972c632a623a9c84590c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 16 Jul 2019 16:30:06 -0700 Subject: kernel/pid.c: convert struct pid count to refcount_t struct pid's count is an atomic_t field used as a refcount. Use refcount_t for it which is basically atomic_t but does additional checking to prevent use-after-free bugs. For memory ordering, the only change is with the following: - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { + if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); Here the change is from: Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst) This ACQUIRE should take care of making sure the free happens after the refcount_dec_and_test(). The above hunk also removes atomic_read() since it is not needed for the code to work and it is unclear how beneficial it is. The removal lets refcount_dec_and_test() check for cases where get_pid() happened before the object was freed. Link: http://lkml.kernel.org/r/20190701183826.191936-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Andrea Parri Reviewed-by: Kees Cook Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Peter Zijlstra Cc: Will Deacon Cc: Paul E. McKenney Cc: Elena Reshetova Cc: Jann Horn Cc: Eric W. Biederman Cc: KJ Tsanaktsidis Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid.h | 5 +++-- kernel/pid.c | 9 ++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/pid.h b/include/linux/pid.h index 1484db6ca8d1..2a83e434db9d 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -4,6 +4,7 @@ #include #include +#include enum pid_type { @@ -57,7 +58,7 @@ struct upid { struct pid { - atomic_t count; + refcount_t count; unsigned int level; /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; @@ -74,7 +75,7 @@ extern const struct file_operations pidfd_fops; static inline struct pid *get_pid(struct pid *pid) { if (pid) - atomic_inc(&pid->count); + refcount_inc(&pid->count); return pid; } diff --git a/kernel/pid.c b/kernel/pid.c index 16263b526560..0a9f2e437217 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -37,14 +37,14 @@ #include #include #include -#include +#include #include #include #include #include struct pid init_struct_pid = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, @@ -108,8 +108,7 @@ void put_pid(struct pid *pid) return; ns = pid->numbers[pid->level].ns; - if ((atomic_read(&pid->count) == 1) || - atomic_dec_and_test(&pid->count)) { + if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } @@ -212,7 +211,7 @@ struct pid *alloc_pid(struct pid_namespace *ns) } get_pid_ns(ns); - atomic_set(&pid->count, 1); + refcount_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); -- cgit v1.2.3-59-g8ed1b From 6b15f678fb7d5ef54e089e6ace72f007fe6e9895 Mon Sep 17 00:00:00 2001 From: Drew Davenport Date: Tue, 16 Jul 2019 16:30:18 -0700 Subject: include/asm-generic/bug.h: fix "cut here" for WARN_ON for __WARN_TAINT architectures For architectures using __WARN_TAINT, the WARN_ON macro did not print out the "cut here" string. The other WARN_XXX macros would print "cut here" inside __warn_printk, which is not called for WARN_ON since it doesn't have a message to print. Link: http://lkml.kernel.org/r/20190624154831.163888-1-ddavenport@chromium.org Fixes: a7bed27af194 ("bug: fix "cut here" location for __WARN_TAINT architectures") Signed-off-by: Drew Davenport Acked-by: Kees Cook Tested-by: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/bug.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 0e9bd9c83870..aa6c093d9ce9 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -104,8 +104,10 @@ extern void warn_slowpath_null(const char *file, const int line); warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg) #else extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#define __WARN() __WARN_TAINT(TAINT_WARN) -#define __WARN_printf(arg...) do { __warn_printk(arg); __WARN(); } while (0) +#define __WARN() do { \ + printk(KERN_WARNING CUT_HERE); __WARN_TAINT(TAINT_WARN); \ +} while (0) +#define __WARN_printf(arg...) __WARN_printf_taint(TAINT_WARN, arg) #define __WARN_printf_taint(taint, arg...) \ do { __warn_printk(arg); __WARN_TAINT(taint); } while (0) #endif -- cgit v1.2.3-59-g8ed1b From 97a0efea657e986322b09b99016b3f7d2ce37021 Mon Sep 17 00:00:00 2001 From: Tom Levy Date: Tue, 16 Jul 2019 16:30:24 -0700 Subject: include/linux/lz4.h: fix spelling and copy-paste errors in documentation Fix a few spelling and grammar errors, and two places where fast/safe in the documentation did not match the function. Link: http://lkml.kernel.org/r/20190321014452.13297-1-tomlevy93@gmail.com Signed-off-by: Tom Levy Reviewed-by: Andrew Morton Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 394e3d9213b8..b16e15b9587a 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -278,7 +278,7 @@ int LZ4_decompress_fast(const char *source, char *dest, int originalSize); * @compressedSize: is the precise full size of the compressed block * @maxDecompressedSize: is the size of 'dest' buffer * - * Decompresses data fom 'source' into 'dest'. + * Decompresses data from 'source' into 'dest'. * If the source stream is detected malformed, the function will * stop decoding and return a negative result. * This function is protected against buffer overflow exploits, @@ -522,7 +522,7 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, const char *dictionary, int dictSize); /** - * LZ4_decompress_fast_continue() - Decompress blocks in streaming mode + * LZ4_decompress_safe_continue() - Decompress blocks in streaming mode * @LZ4_streamDecode: the 'LZ4_streamDecode_t' structure * @source: source address of the compressed data * @dest: output buffer address of the uncompressed data @@ -530,7 +530,7 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, * @compressedSize: is the precise full size of the compressed block * @maxDecompressedSize: is the size of 'dest' buffer * - * These decoding function allows decompression of multiple blocks + * This decoding function allows decompression of multiple blocks * in "streaming" mode. * Previously decoded blocks *must* remain available at the memory position * where they were decoded (up to 64 KB) @@ -569,7 +569,7 @@ int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode, * which must be already allocated with 'originalSize' bytes * @originalSize: is the original and therefore uncompressed size * - * These decoding function allows decompression of multiple blocks + * This decoding function allows decompression of multiple blocks * in "streaming" mode. * Previously decoded blocks *must* remain available at the memory position * where they were decoded (up to 64 KB) @@ -610,10 +610,10 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode, * @dictStart: pointer to the start of the dictionary in memory * @dictSize: size of dictionary * - * These decoding function works the same as + * This decoding function works the same as * a combination of LZ4_setStreamDecode() followed by * LZ4_decompress_safe_continue() - * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure. + * It is stand-alone, and doesn't need an LZ4_streamDecode_t structure. * * Return: number of bytes decompressed into destination buffer * (necessarily <= maxDecompressedSize) @@ -633,10 +633,10 @@ int LZ4_decompress_safe_usingDict(const char *source, char *dest, * @dictStart: pointer to the start of the dictionary in memory * @dictSize: size of dictionary * - * These decoding function works the same as + * This decoding function works the same as * a combination of LZ4_setStreamDecode() followed by - * LZ4_decompress_safe_continue() - * It is stand-alone, and don'tn eed a LZ4_streamDecode_t structure. + * LZ4_decompress_fast_continue() + * It is stand-alone, and doesn't need an LZ4_streamDecode_t structure. * * Return: number of bytes decompressed into destination buffer * (necessarily <= maxDecompressedSize) -- cgit v1.2.3-59-g8ed1b From eca499ab3749a4537dee77ffead47a1a2c0dee19 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 16 Jul 2019 16:30:31 -0700 Subject: mm/hotplug: make remove_memory() interface usable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Presently the remove_memory() interface is inherently broken. It tries to remove memory but panics if some memory is not offline. The problem is that it is impossible to ensure that all memory blocks are offline as this function also takes lock_device_hotplug that is required to change memory state via sysfs. So, between calling this function and offlining all memory blocks there is always a window when lock_device_hotplug is released, and therefore, there is always a chance for a panic during this window. Make this interface to return an error if memory removal fails. This way it is safe to call this function without panicking machine, and also makes it symmetric to add_memory() which already returns an error. Link: http://lkml.kernel.org/r/20190517215438.6487-3-pasha.tatashin@soleen.com Signed-off-by: Pavel Tatashin Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Bjorn Helgaas Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: Dave Hansen Cc: Dave Jiang Cc: Fengguang Wu Cc: Huang Ying Cc: James Morris Cc: Jérôme Glisse Cc: Keith Busch Cc: Ross Zwisler Cc: Sasha Levin Cc: Takashi Iwai Cc: Tom Lendacky Cc: Vishal Verma Cc: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 8 ++++-- mm/memory_hotplug.c | 64 ++++++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ae892eef8b82..988fde33cd7f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -324,7 +324,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {} extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); -extern void remove_memory(int nid, u64 start, u64 size); +extern int remove_memory(int nid, u64 start, u64 size); extern void __remove_memory(int nid, u64 start, u64 size); #else @@ -341,7 +341,11 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) return -EINVAL; } -static inline void remove_memory(int nid, u64 start, u64 size) {} +static inline int remove_memory(int nid, u64 start, u64 size) +{ + return -EBUSY; +} + static inline void __remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6166ba5a15f3..4ebe696138e8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1734,9 +1734,10 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); - } - return ret; + return -EBUSY; + } + return 0; } static int check_cpu_on_node(pg_data_t *pgdat) @@ -1819,19 +1820,9 @@ static void __release_memory_resource(resource_size_t start, } } -/** - * remove_memory - * @nid: the node ID - * @start: physical address of the region to remove - * @size: size of the region to remove - * - * NOTE: The caller must call lock_device_hotplug() to serialize hotplug - * and online/offline operations before this call, as required by - * try_offline_node(). - */ -void __ref __remove_memory(int nid, u64 start, u64 size) +static int __ref try_remove_memory(int nid, u64 start, u64 size) { - int ret; + int rc = 0; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1839,13 +1830,13 @@ void __ref __remove_memory(int nid, u64 start, u64 size) /* * All memory blocks must be offlined before removing memory. Check - * whether all memory blocks in question are offline and trigger a BUG() + * whether all memory blocks in question are offline and return error * if this is not the case. */ - ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, - check_memblock_offlined_cb); - if (ret) - BUG(); + rc = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, + check_memblock_offlined_cb); + if (rc) + goto done; /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1857,14 +1848,45 @@ void __ref __remove_memory(int nid, u64 start, u64 size) try_offline_node(nid); +done: mem_hotplug_done(); + return rc; } -void remove_memory(int nid, u64 start, u64 size) +/** + * remove_memory + * @nid: the node ID + * @start: physical address of the region to remove + * @size: size of the region to remove + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call, as required by + * try_offline_node(). + */ +void __remove_memory(int nid, u64 start, u64 size) +{ + + /* + * trigger BUG() is some memory is not offlined prior to calling this + * function + */ + if (try_remove_memory(nid, start, size)) + BUG(); +} + +/* + * Remove memory if every memory block is offline, otherwise return -EBUSY is + * some memory is not offline + */ +int remove_memory(int nid, u64 start, u64 size) { + int rc; + lock_device_hotplug(); - __remove_memory(nid, start, size); + rc = try_remove_memory(nid, start, size); unlock_device_hotplug(); + + return rc; } EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3-59-g8ed1b From 22fcea6f85f2cc74e61bd8b3640faa8467553c24 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 16 Jul 2019 16:30:38 -0700 Subject: mm: move MAP_SYNC to asm-generic/mman-common.h This enables support for synchronous DAX fault on powerpc The generic changes are added as part of b6fb293f2497 ("mm: Define MAP_SYNC and VM_SYNC flags") Without this, mmap returns EOPNOTSUPP for MAP_SYNC with MAP_SHARED_VALIDATE Instead of adding MAP_SYNC with same value to arch/powerpc/include/uapi/asm/mman.h, I am moving the #define to asm-generic/mman-common.h. Two architectures using mman-common.h directly are sparc and powerpc. We should be able to consloidate more #defines to mman-common.h. That can be done as a separate patch. Link: http://lkml.kernel.org/r/20190528091120.13322-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Jan Kara Cc: Michael Ellerman Cc: Ross Zwisler Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/asm-generic/mman-common.h | 3 ++- include/uapi/asm-generic/mman.h | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index cb556b430e71..93a8b420ce1e 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -20,7 +20,8 @@ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ -/* 0x0100 - 0x80000 flags are defined in asm-generic/mman.h */ +/* 0x0100 - 0x40000 flags are defined in asm-generic/mman.h */ +#define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 653687d9771b..2dffcbf705b3 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -13,7 +13,6 @@ #define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ -#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */ /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ -- cgit v1.2.3-59-g8ed1b From 8aa3c927ec10d1230c3ace8357f624479665f701 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 16 Jul 2019 16:30:41 -0700 Subject: mm/mmap: move common defines to mman-common.h Two architecture that use arch specific MMAP flags are powerpc and sparc. We still have few flag values common across them and other architectures. Consolidate this in mman-common.h. Also update the comment to indicate where to find HugeTLB specific reserved values Link: http://lkml.kernel.org/r/20190604090950.31417-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/uapi/asm/mman.h | 6 +----- arch/sparc/include/uapi/asm/mman.h | 6 ------ include/uapi/asm-generic/mman-common.h | 6 +++++- include/uapi/asm-generic/mman.h | 9 ++++----- 4 files changed, 10 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index 65065ce32814..c0c737215b00 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -21,15 +21,11 @@ #define MAP_DENYWRITE 0x0800 /* ETXTBSY */ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ + #define MCL_CURRENT 0x2000 /* lock all currently mapped pages */ #define MCL_FUTURE 0x4000 /* lock all additions to address space */ #define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ -#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ - /* Override any generic PKEY permission defines */ #define PKEY_DISABLE_EXECUTE 0x4 #undef PKEY_ACCESS_MASK diff --git a/arch/sparc/include/uapi/asm/mman.h b/arch/sparc/include/uapi/asm/mman.h index f6f99ec65bb3..cec9f4109687 100644 --- a/arch/sparc/include/uapi/asm/mman.h +++ b/arch/sparc/include/uapi/asm/mman.h @@ -22,10 +22,4 @@ #define MCL_FUTURE 0x4000 /* lock all additions to address space */ #define MCL_ONFAULT 0x8000 /* lock all pages that are faulted in */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ -#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ - - #endif /* _UAPI__SPARC_MMAN_H__ */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 93a8b420ce1e..63b1f506ea67 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -20,7 +20,11 @@ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ -/* 0x0100 - 0x40000 flags are defined in asm-generic/mman.h */ +/* 0x0100 - 0x4000 flags are defined in asm-generic/mman.h */ +#define MAP_POPULATE 0x008000 /* populate (prefault) pagetables */ +#define MAP_NONBLOCK 0x010000 /* do not block on IO */ +#define MAP_STACK 0x020000 /* give out an address that is best suited for process/thread stacks */ +#define MAP_HUGETLB 0x040000 /* create a huge page mapping */ #define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h index 2dffcbf705b3..57e8195d0b53 100644 --- a/include/uapi/asm-generic/mman.h +++ b/include/uapi/asm-generic/mman.h @@ -9,12 +9,11 @@ #define MAP_EXECUTABLE 0x1000 /* mark it as an executable */ #define MAP_LOCKED 0x2000 /* pages are locked */ #define MAP_NORESERVE 0x4000 /* don't check for reservations */ -#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */ -#define MAP_NONBLOCK 0x10000 /* do not block on IO */ -#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ -#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ -/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ +/* + * Bits [26:31] are reserved, see asm-generic/hugetlb_encode.h + * for MAP_HUGETLB usage + */ #define MCL_CURRENT 1 /* lock all current mappings */ #define MCL_FUTURE 2 /* lock all future mappings */ -- cgit v1.2.3-59-g8ed1b From 7588adf8dff12c4b358557a13796a25fef796548 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 16 Jul 2019 16:30:44 -0700 Subject: mm: clean up is_device_*_page() definitions Refactor is_device_{public,private}_page() with is_pci_p2pdma_page() to make them all consistent in depending on their respective config options even when CONFIG_DEV_PAGEMAP_OPS is enabled for other reasons. This allows a little more compile-time optimisation as well as the conceptual and cosmetic cleanup. Link: http://lkml.kernel.org/r/187c2ab27dea70635d375a61b2f2076d26c032b0.1558547956.git.robin.murphy@arm.com Signed-off-by: Robin Murphy Suggested-by: Jerome Glisse Reviewed-by: Anshuman Khandual Cc: Catalin Marinas Cc: Dan Williams Cc: David Hildenbrand Cc: Ira Weiny Cc: Michael Ellerman Cc: Michal Hocko Cc: Oliver O'Halloran Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 74797ed20c2c..baa8b8761d8c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -956,41 +956,28 @@ static inline bool put_devmap_managed_page(struct page *page) return false; } -static inline bool is_device_private_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PRIVATE; -} - -#ifdef CONFIG_PCI_P2PDMA -static inline bool is_pci_p2pdma_page(const struct page *page) -{ - return is_zone_device_page(page) && - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; -} -#else /* CONFIG_PCI_P2PDMA */ -static inline bool is_pci_p2pdma_page(const struct page *page) -{ - return false; -} -#endif /* CONFIG_PCI_P2PDMA */ - #else /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool put_devmap_managed_page(struct page *page) { return false; } +#endif /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool is_device_private_page(const struct page *page) { - return false; + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_pci_p2pdma_page(const struct page *page) { - return false; + return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && + IS_ENABLED(CONFIG_PCI_P2PDMA) && + is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } -#endif /* CONFIG_DEV_PAGEMAP_OPS */ /* 127: arbitrary random number, small enough to assemble well */ #define page_ref_zero_or_close_to_overflow(page) \ -- cgit v1.2.3-59-g8ed1b From 175967318c3018d01931ac950c82adab5deb47ca Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Tue, 16 Jul 2019 16:30:47 -0700 Subject: mm: introduce ARCH_HAS_PTE_DEVMAP ARCH_HAS_ZONE_DEVICE is somewhat meaningless in itself, and combined with the long-out-of-date comment can lead to the impression than an architecture may just enable it (since __add_pages() now "comprehends device memory" for itself) and expect things to work. In practice, however, ZONE_DEVICE users have little chance of functioning correctly without __HAVE_ARCH_PTE_DEVMAP, so let's clean that up the same way as ARCH_HAS_PTE_SPECIAL and make it the proper dependency so the real situation is clearer. Link: http://lkml.kernel.org/r/87554aa78478a02a63f2c4cf60a847279ae3eb3b.1558547956.git.robin.murphy@arm.com Signed-off-by: Robin Murphy Acked-by: Dan Williams Reviewed-by: Ira Weiny Acked-by: Oliver O'Halloran Reviewed-by: Anshuman Khandual Cc: Michael Ellerman Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jerome Glisse Cc: Michal Hocko Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/x86/Kconfig | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable_types.h | 1 - include/linux/mm.h | 4 ++-- include/linux/pfn_t.h | 4 ++-- mm/Kconfig | 5 ++--- mm/gup.c | 2 +- 9 files changed, 11 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index f516796dd819..d8dcd8820369 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,6 +129,7 @@ config PPC select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC64 @@ -136,7 +137,6 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE if PPC64 select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 62e6ea0a7650..8308f32e9782 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -90,7 +90,6 @@ #define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ #define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ #define _PAGE_DEVMAP _RPAGE_SW1 /* software: ZONE_DEVICE page */ -#define __HAVE_ARCH_PTE_DEVMAP /* * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 879741336771..4a55bd01e918 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -70,6 +70,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 @@ -80,7 +81,6 @@ config X86 select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_ZONE_DEVICE if X86_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5e0509b41986..0bc530c4eb13 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -271,7 +271,7 @@ static inline int has_transparent_hugepage(void) return boot_cpu_has(X86_FEATURE_PSE); } -#ifdef __HAVE_ARCH_PTE_DEVMAP +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pmd_devmap(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_DEVMAP); @@ -732,7 +732,7 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } -#ifdef __HAVE_ARCH_PTE_DEVMAP +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t a) { return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d6ff0bbdb394..b5e49e6bac63 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -103,7 +103,6 @@ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) -#define __HAVE_ARCH_PTE_DEVMAP #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_DEVMAP (_AT(pteval_t, 0)) diff --git a/include/linux/mm.h b/include/linux/mm.h index baa8b8761d8c..f43f4de4de68 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -547,7 +547,7 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma) struct mmu_gather; struct inode; -#if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { return 0; @@ -1750,7 +1750,7 @@ static inline void sync_mm_rss(struct mm_struct *mm) } #endif -#ifndef __HAVE_ARCH_PTE_DEVMAP +#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { return 0; diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index 01e8037023f7..2d9148221e9a 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -97,7 +97,7 @@ static inline pud_t pfn_t_pud(pfn_t pfn, pgprot_t pgprot) #endif #endif -#ifdef __HAVE_ARCH_PTE_DEVMAP +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP static inline bool pfn_t_devmap(pfn_t pfn) { const u64 flags = PFN_DEV|PFN_MAP; @@ -115,7 +115,7 @@ pmd_t pmd_mkdevmap(pmd_t pmd); defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pud_mkdevmap(pud_t pud); #endif -#endif /* __HAVE_ARCH_PTE_DEVMAP */ +#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */ #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static inline bool pfn_t_special(pfn_t pfn) diff --git a/mm/Kconfig b/mm/Kconfig index 495d7368ced8..56cec636a1fc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. -# arch_add_memory() comprehends device memory -config ARCH_HAS_ZONE_DEVICE +config ARCH_HAS_PTE_DEVMAP bool config ZONE_DEVICE @@ -658,7 +657,7 @@ config ZONE_DEVICE depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on ARCH_HAS_ZONE_DEVICE + depends on ARCH_HAS_PTE_DEVMAP select XARRAY_MULTI help diff --git a/mm/gup.c b/mm/gup.c index 8bbaa5523116..98f13ab37bac 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1895,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ -#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { -- cgit v1.2.3-59-g8ed1b From 79eb597cba06c435b72f220e9d426ae413fc2579 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Tue, 16 Jul 2019 16:30:54 -0700 Subject: mm: add account_locked_vm utility function locked_vm accounting is done roughly the same way in five places, so unify them in a helper. Include the helper's caller in the debug print to distinguish between callsites. Error codes stay the same, so user-visible behavior does too. The one exception is that the -EPERM case in tce_account_locked_vm is removed because Alexey has never seen it triggered. [daniel.m.jordan@oracle.com: v3] Link: http://lkml.kernel.org/r/20190529205019.20927-1-daniel.m.jordan@oracle.com [sfr@canb.auug.org.au: fix mm/util.c] Link: http://lkml.kernel.org/r/20190524175045.26897-1-daniel.m.jordan@oracle.com Signed-off-by: Daniel Jordan Signed-off-by: Stephen Rothwell Tested-by: Alexey Kardashevskiy Acked-by: Alex Williamson Cc: Alan Tull Cc: Alex Williamson Cc: Benjamin Herrenschmidt Cc: Christoph Lameter Cc: Christophe Leroy Cc: Davidlohr Bueso Cc: Jason Gunthorpe Cc: Mark Rutland Cc: Michael Ellerman Cc: Moritz Fischer Cc: Paul Mackerras Cc: Steve Sistare Cc: Wu Hao Cc: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kvm/book3s_64_vio.c | 44 ++------------------- arch/powerpc/mm/book3s64/iommu_api.c | 41 ++------------------ drivers/fpga/dfl-afu-dma-region.c | 53 ++----------------------- drivers/vfio/vfio_iommu_spapr_tce.c | 54 +++----------------------- drivers/vfio/vfio_iommu_type1.c | 17 +------- include/linux/mm.h | 4 ++ mm/util.c | 75 ++++++++++++++++++++++++++++++++++++ 7 files changed, 98 insertions(+), 190 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 5bf05cc774e2..e99a14798ab0 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -45,43 +46,6 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages) return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE; } -static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) -{ - long ret = 0; - - if (!current || !current->mm) - return ret; /* process exited */ - - down_write(¤t->mm->mmap_sem); - - if (inc) { - unsigned long locked, lock_limit; - - locked = current->mm->locked_vm + stt_pages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - current->mm->locked_vm += stt_pages; - } else { - if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm)) - stt_pages = current->mm->locked_vm; - - current->mm->locked_vm -= stt_pages; - } - - pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, - inc ? '+' : '-', - stt_pages << PAGE_SHIFT, - current->mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK), - ret ? " - exceeded" : ""); - - up_write(¤t->mm->mmap_sem); - - return ret; -} - static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head) { struct kvmppc_spapr_tce_iommu_table *stit = container_of(head, @@ -291,7 +255,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) kvm_put_kvm(stt->kvm); - kvmppc_account_memlimit( + account_locked_vm(current->mm, kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false); call_rcu(&stt->rcu, release_spapr_tce_table); @@ -316,7 +280,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EINVAL; npages = kvmppc_tce_pages(size); - ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); + ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true); if (ret) return ret; @@ -362,7 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, kfree(stt); fail_acct: - kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); + account_locked_vm(current->mm, kvmppc_stt_pages(npages), false); return ret; } diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 90ee3a89722c..b056cae3388b 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -46,40 +47,6 @@ struct mm_iommu_table_group_mem_t { u64 dev_hpa; /* Device memory base address */ }; -static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, - unsigned long npages, bool incr) -{ - long ret = 0, locked, lock_limit; - - if (!npages) - return 0; - - down_write(&mm->mmap_sem); - - if (incr) { - locked = mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - mm->locked_vm += npages; - } else { - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; - } - - pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", - current ? current->pid : 0, - incr ? '+' : '-', - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); - - return ret; -} - bool mm_iommu_preregistered(struct mm_struct *mm) { return !list_empty(&mm->context.iommu_group_mem_list); @@ -96,7 +63,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, unsigned long entry, chunk; if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { - ret = mm_iommu_adjust_locked_vm(mm, entries, true); + ret = account_locked_vm(mm, entries, true); if (ret) return ret; @@ -211,7 +178,7 @@ free_exit: kfree(mem); unlock_exit: - mm_iommu_adjust_locked_vm(mm, locked_entries, false); + account_locked_vm(mm, locked_entries, false); return ret; } @@ -311,7 +278,7 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) unlock_exit: mutex_unlock(&mem_list_mutex); - mm_iommu_adjust_locked_vm(mm, unlock_entries, false); + account_locked_vm(mm, unlock_entries, false); return ret; } diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c index dcd80b088c7b..62f924489db5 100644 --- a/drivers/fpga/dfl-afu-dma-region.c +++ b/drivers/fpga/dfl-afu-dma-region.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "dfl-afu.h" @@ -31,52 +32,6 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata) afu->dma_regions = RB_ROOT; } -/** - * afu_dma_adjust_locked_vm - adjust locked memory - * @dev: port device - * @npages: number of pages - * @incr: increase or decrease locked memory - * - * Increase or decrease the locked memory size with npages input. - * - * Return 0 on success. - * Return -ENOMEM if locked memory size is over the limit and no CAP_IPC_LOCK. - */ -static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr) -{ - unsigned long locked, lock_limit; - int ret = 0; - - /* the task is exiting. */ - if (!current->mm) - return 0; - - down_write(¤t->mm->mmap_sem); - - if (incr) { - locked = current->mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - current->mm->locked_vm += npages; - } else { - if (WARN_ON_ONCE(npages > current->mm->locked_vm)) - npages = current->mm->locked_vm; - current->mm->locked_vm -= npages; - } - - dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid, - incr ? '+' : '-', npages << PAGE_SHIFT, - current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK), - ret ? "- exceeded" : ""); - - up_write(¤t->mm->mmap_sem); - - return ret; -} - /** * afu_dma_pin_pages - pin pages of given dma memory region * @pdata: feature device platform data @@ -92,7 +47,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata, struct device *dev = &pdata->dev->dev; int ret, pinned; - ret = afu_dma_adjust_locked_vm(dev, npages, true); + ret = account_locked_vm(current->mm, npages, true); if (ret) return ret; @@ -121,7 +76,7 @@ put_pages: free_pages: kfree(region->pages); unlock_vm: - afu_dma_adjust_locked_vm(dev, npages, false); + account_locked_vm(current->mm, npages, false); return ret; } @@ -141,7 +96,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata, put_all_pages(region->pages, npages); kfree(region->pages); - afu_dma_adjust_locked_vm(dev, npages, false); + account_locked_vm(current->mm, npages, false); dev_dbg(dev, "%ld pages unpinned\n", npages); } diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 7048c9198c21..8ce9ad21129f 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -31,51 +32,6 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); -static long try_increment_locked_vm(struct mm_struct *mm, long npages) -{ - long ret = 0, locked, lock_limit; - - if (WARN_ON_ONCE(!mm)) - return -EPERM; - - if (!npages) - return 0; - - down_write(&mm->mmap_sem); - locked = mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - ret = -ENOMEM; - else - mm->locked_vm += npages; - - pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid, - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK), - ret ? " - exceeded" : ""); - - up_write(&mm->mmap_sem); - - return ret; -} - -static void decrement_locked_vm(struct mm_struct *mm, long npages) -{ - if (!mm || !npages) - return; - - down_write(&mm->mmap_sem); - if (WARN_ON_ONCE(npages > mm->locked_vm)) - npages = mm->locked_vm; - mm->locked_vm -= npages; - pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid, - npages << PAGE_SHIFT, - mm->locked_vm << PAGE_SHIFT, - rlimit(RLIMIT_MEMLOCK)); - up_write(&mm->mmap_sem); -} - /* * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation * @@ -333,7 +289,7 @@ static int tce_iommu_enable(struct tce_container *container) return ret; locked = table_group->tce32_size >> PAGE_SHIFT; - ret = try_increment_locked_vm(container->mm, locked); + ret = account_locked_vm(container->mm, locked, true); if (ret) return ret; @@ -352,7 +308,7 @@ static void tce_iommu_disable(struct tce_container *container) container->enabled = false; BUG_ON(!container->mm); - decrement_locked_vm(container->mm, container->locked_pages); + account_locked_vm(container->mm, container->locked_pages, false); } static void *tce_iommu_open(unsigned long arg) @@ -656,7 +612,7 @@ static long tce_iommu_create_table(struct tce_container *container, if (!table_size) return -EINVAL; - ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT); + ret = account_locked_vm(container->mm, table_size >> PAGE_SHIFT, true); if (ret) return ret; @@ -675,7 +631,7 @@ static void tce_iommu_free_table(struct tce_container *container, unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; iommu_tce_table_put(tbl); - decrement_locked_vm(container->mm, pages); + account_locked_vm(container->mm, pages, false); } static long tce_iommu_create_window(struct tce_container *container, diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index add34adfadc7..054391f30fa8 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -272,21 +272,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) ret = down_write_killable(&mm->mmap_sem); if (!ret) { - if (npage > 0) { - if (!dma->lock_cap) { - unsigned long limit; - - limit = task_rlimit(dma->task, - RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (mm->locked_vm + npage > limit) - ret = -ENOMEM; - } - } - - if (!ret) - mm->locked_vm += npage; - + ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task, + dma->lock_cap); up_write(&mm->mmap_sem); } diff --git a/include/linux/mm.h b/include/linux/mm.h index f43f4de4de68..bd6512559bed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1543,6 +1543,10 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, + struct task_struct *task, bool bypass_rlim); + /* Container for pinned pfns / pages */ struct frame_vector { unsigned int nr_allocated; /* Number of frames we have space for */ diff --git a/mm/util.c b/mm/util.c index 68575a315dc5..e6351a80f248 100644 --- a/mm/util.c +++ b/mm/util.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -300,6 +301,80 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) } #endif +/** + * __account_locked_vm - account locked pages to an mm's locked_vm + * @mm: mm to account against + * @pages: number of pages to account + * @inc: %true if @pages should be considered positive, %false if not + * @task: task used to check RLIMIT_MEMLOCK + * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped + * + * Assumes @task and @mm are valid (i.e. at least one reference on each), and + * that mmap_sem is held as writer. + * + * Return: + * * 0 on success + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. + */ +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, + struct task_struct *task, bool bypass_rlim) +{ + unsigned long locked_vm, limit; + int ret = 0; + + lockdep_assert_held_write(&mm->mmap_sem); + + locked_vm = mm->locked_vm; + if (inc) { + if (!bypass_rlim) { + limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked_vm + pages > limit) + ret = -ENOMEM; + } + if (!ret) + mm->locked_vm = locked_vm + pages; + } else { + WARN_ON_ONCE(pages > locked_vm); + mm->locked_vm = locked_vm - pages; + } + + pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, + (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, + locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + return ret; +} +EXPORT_SYMBOL_GPL(__account_locked_vm); + +/** + * account_locked_vm - account locked pages to an mm's locked_vm + * @mm: mm to account against, may be NULL + * @pages: number of pages to account + * @inc: %true if @pages should be considered positive, %false if not + * + * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). + * + * Return: + * * 0 on success, or if mm is NULL + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. + */ +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) +{ + int ret; + + if (pages == 0 || !mm) + return 0; + + down_write(&mm->mmap_sem); + ret = __account_locked_vm(mm, pages, inc, current, + capable(CAP_IPC_LOCK)); + up_write(&mm->mmap_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(account_locked_vm); + unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) -- cgit v1.2.3-59-g8ed1b