about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2016-02-05 20:20:07 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-02-05 20:20:07 -0800
commit: 5af9c2e19da6514a1a50b07d97d93b74a7711873 (patch)
tree: f3558d3e3761199934fadcdf15e931235e47316a
parent: Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client (diff)
parent: epoll: restrict EPOLLEXCLUSIVE to POLLIN and POLLOUT (diff)
download: linux-dev-5af9c2e19da6514a1a50b07d97d93b74a7711873.tar.xz
          linux-dev-5af9c2e19da6514a1a50b07d97d93b74a7711873.zip
Merge branch 'akpm' (patches from Andrew)
Merge fixes from Andrew Morton:
 "22 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (22 commits)
  epoll: restrict EPOLLEXCLUSIVE to POLLIN and POLLOUT
  radix-tree: fix oops after radix_tree_iter_retry
  MAINTAINERS: trim the file triggers for ABI/API
  dax: dirty inode only if required
  thp: make deferred_split_scan() work again
  mm: replace vma_lock_anon_vma with anon_vma_lock_read/write
  ocfs2/dlm: clear refmap bit of recovery lock while doing local recovery cleanup
  um: asm/page.h: remove the pte_high member from struct pte_t
  mm, hugetlb: don't require CMA for runtime gigantic pages
  mm/hugetlb: fix gigantic page initialization/allocation
  mm: downgrade VM_BUG in isolate_lru_page() to warning
  mempolicy: do not try to queue pages from !vma_migratable()
  mm, vmstat: fix wrong WQ sleep when memory reclaim doesn't make any progress
  vmstat: make vmstat_update deferrable
  mm, vmstat: make quiet_vmstat lighter
  mm/Kconfig: correct description of DEFERRED_STRUCT_PAGE_INIT
  memblock: don't mark memblock_phys_mem_size() as __init
  dump_stack: avoid potential deadlocks
  mm: validate_mm browse_rb SMP race condition
  m32r: fix build failure due to SMP and MMU
  ...
-rw-r--r--  MAINTAINERS                   2
-rw-r--r--  arch/m32r/Kconfig             1
-rw-r--r--  arch/um/include/asm/page.h   23
-rw-r--r--  arch/x86/mm/hugetlbpage.c     4
-rw-r--r--  fs/block_dev.c                8
-rw-r--r--  fs/dax.c                      3
-rw-r--r--  fs/eventpoll.c               38
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c    2
-rw-r--r--  include/linux/gfp.h           6
-rw-r--r--  include/linux/radix-tree.h    6
-rw-r--r--  include/linux/rmap.h         14
-rw-r--r--  kernel/signal.c               6
-rw-r--r--  lib/dump_stack.c              7
-rw-r--r--  mm/Kconfig                    9
-rw-r--r--  mm/backing-dev.c              2
-rw-r--r--  mm/huge_memory.c              2
-rw-r--r--  mm/hugetlb.c                  7
-rw-r--r--  mm/memblock.c                 2
-rw-r--r--  mm/mempolicy.c               14
-rw-r--r--  mm/mmap.c                    62
-rw-r--r--  mm/page_alloc.c               2
-rw-r--r--  mm/vmscan.c                   2
-rw-r--r--  mm/vmstat.c                  70
23 files changed, 166 insertions, 126 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index a9010d992da3..02a94eb64b52 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -223,9 +223,7 @@ F: drivers/scsi/aacraid/
ABI/API
L: linux-api@vger.kernel.org
-F: Documentation/ABI/
F: include/linux/syscalls.h
-F: include/uapi/
F: kernel/sys_ni.c
ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 836ac5a963c8..2841c0a3fd3b 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -276,6 +276,7 @@ source "kernel/Kconfig.preempt"
config SMP
bool "Symmetric multi-processing support"
+ depends on MMU
---help---
This enables support for systems with more than one CPU. If you have
a system with only one CPU, say N. If you have a system with more
diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h
index e13d41c392ae..f878bec23576 100644
--- a/arch/um/include/asm/page.h
+++ b/arch/um/include/asm/page.h
@@ -34,21 +34,18 @@ struct page;
#if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT)
-typedef struct { unsigned long pte_low, pte_high; } pte_t;
+typedef struct { unsigned long pte; } pte_t;
typedef struct { unsigned long pmd; } pmd_t;
typedef struct { unsigned long pgd; } pgd_t;
-#define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32))
-
-#define pte_get_bits(pte, bits) ((pte).pte_low & (bits))
-#define pte_set_bits(pte, bits) ((pte).pte_low |= (bits))
-#define pte_clear_bits(pte, bits) ((pte).pte_low &= ~(bits))
-#define pte_copy(to, from) ({ (to).pte_high = (from).pte_high; \
- smp_wmb(); \
- (to).pte_low = (from).pte_low; })
-#define pte_is_zero(pte) (!((pte).pte_low & ~_PAGE_NEWPAGE) && !(pte).pte_high)
-#define pte_set_val(pte, phys, prot) \
- ({ (pte).pte_high = (phys) >> 32; \
- (pte).pte_low = (phys) | pgprot_val(prot); })
+#define pte_val(p) ((p).pte)
+
+#define pte_get_bits(p, bits) ((p).pte & (bits))
+#define pte_set_bits(p, bits) ((p).pte |= (bits))
+#define pte_clear_bits(p, bits) ((p).pte &= ~(bits))
+#define pte_copy(to, from) ({ (to).pte = (from).pte; })
+#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE))
+#define pte_set_val(p, phys, prot) \
+ ({ (p).pte = (phys) | pgprot_val(prot); })
#define pmd_val(x) ((x).pmd)
#define __pmd(x) ((pmd_t) { (x) } )
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 42982b26e32b..740d7ac03a55 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -173,10 +173,10 @@ static __init int setup_hugepagesz(char *opt)
}
__setup("hugepagesz=", setup_hugepagesz);
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static __init int gigantic_pages_init(void)
{
- /* With CMA we can allocate gigantic pages at runtime */
+ /* With compaction or CMA we can allocate gigantic pages at runtime */
if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index afb437484362..39b3a174a425 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1730,6 +1730,12 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return __dax_fault(vma, vmf, blkdev_get_block, NULL);
}
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ return dax_pfn_mkwrite(vma, vmf);
+}
+
static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
@@ -1739,7 +1745,7 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
static const struct vm_operations_struct blkdev_dax_vm_ops = {
.fault = blkdev_dax_fault,
.pmd_fault = blkdev_dax_pmd_fault,
- .pfn_mkwrite = blkdev_dax_fault,
+ .pfn_mkwrite = blkdev_dax_pfn_mkwrite,
};
static const struct vm_operations_struct blkdev_default_vm_ops = {
diff --git a/fs/dax.c b/fs/dax.c
index e0e9358baf35..fc2e3141138b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -358,7 +358,8 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
void *entry;
WARN_ON_ONCE(pmd_entry && !dirty);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ if (dirty)
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
spin_lock_irq(&mapping->tree_lock);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index ae1dbcf47e97..cde60741cad2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -94,6 +94,11 @@
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
+
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
@@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* wait list.
*/
if (waitqueue_active(&ep->wq)) {
- ewake = 1;
+ if ((epi->event.events & EPOLLEXCLUSIVE) &&
+ !((unsigned long)key & POLLFREE)) {
+ switch ((unsigned long)key & EPOLLINOUT_BITS) {
+ case POLLIN:
+ if (epi->event.events & POLLIN)
+ ewake = 1;
+ break;
+ case POLLOUT:
+ if (epi->event.events & POLLOUT)
+ ewake = 1;
+ break;
+ case 0:
+ ewake = 1;
+ break;
+ }
+ }
wake_up_locked(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
@@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
- if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
- (op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
- goto error_tgt_fput;
+ if (epds.events & EPOLLEXCLUSIVE) {
+ if (op == EPOLL_CTL_MOD)
+ goto error_tgt_fput;
+ if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+ (epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+ goto error_tgt_fput;
+ }
/*
* At this point it is safe to assume that the "private_data" contains
@@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
break;
case EPOLL_CTL_MOD:
if (epi) {
- epds.events |= POLLERR | POLLHUP;
- error = ep_modify(ep, epi, &epds);
+ if (!(epi->event.events & EPOLLEXCLUSIVE)) {
+ epds.events |= POLLERR | POLLHUP;
+ error = ep_modify(ep, epi, &epds);
+ }
} else
error = -ENOENT;
break;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index c5bdf02c213b..b94a425f0175 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2367,6 +2367,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
+ dlm_lockres_clear_refmap_bit(dlm, res,
+ dead_node);
spin_unlock(&res->spinlock);
continue;
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 28ad5f6494b0..af1f2b24bbe4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -547,16 +547,16 @@ static inline bool pm_suspended_storage(void)
}
#endif /* CONFIG_PM_SLEEP */
-#ifdef CONFIG_CMA
-
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype);
extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
+#endif
+#ifdef CONFIG_CMA
/* CMA stuff */
extern void init_cma_reserved_pageblock(struct page *page);
-
#endif
#endif /* __LINUX_GFP_H */
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 00b17c526c1f..f54be7082207 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -400,7 +400,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
* @iter: pointer to radix tree iterator
* Returns: current chunk size
*/
-static __always_inline unsigned
+static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
return iter->next_index - iter->index;
@@ -434,9 +434,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
return slot + offset + 1;
}
} else {
- unsigned size = radix_tree_chunk_size(iter) - 1;
+ long size = radix_tree_chunk_size(iter);
- while (size--) {
+ while (--size > 0) {
slot++;
iter->index++;
if (likely(*slot))
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bdf597c4f0be..a07f42bedda3 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -109,20 +109,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma)
__put_anon_vma(anon_vma);
}
-static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
-{
- struct anon_vma *anon_vma = vma->anon_vma;
- if (anon_vma)
- down_write(&anon_vma->root->rwsem);
-}
-
-static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
-{
- struct anon_vma *anon_vma = vma->anon_vma;
- if (anon_vma)
- up_write(&anon_vma->root->rwsem);
-}
-
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
down_write(&anon_vma->root->rwsem);
diff --git a/kernel/signal.c b/kernel/signal.c
index f3f1f7a972fd..0508544c8ced 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ while (!signal_pending(current)) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
set_restore_sigmask();
return -ERESTARTNOHAND;
}
diff --git a/lib/dump_stack.c b/lib/dump_stack.c
index 6745c6230db3..c30d07e99dba 100644
--- a/lib/dump_stack.c
+++ b/lib/dump_stack.c
@@ -25,6 +25,7 @@ static atomic_t dump_lock = ATOMIC_INIT(-1);
asmlinkage __visible void dump_stack(void)
{
+ unsigned long flags;
int was_locked;
int old;
int cpu;
@@ -33,9 +34,8 @@ asmlinkage __visible void dump_stack(void)
* Permit this cpu to perform nested stack dumps while serialising
* against other CPUs
*/
- preempt_disable();
-
retry:
+ local_irq_save(flags);
cpu = smp_processor_id();
old = atomic_cmpxchg(&dump_lock, -1, cpu);
if (old == -1) {
@@ -43,6 +43,7 @@ retry:
} else if (old == cpu) {
was_locked = 1;
} else {
+ local_irq_restore(flags);
cpu_relax();
goto retry;
}
@@ -52,7 +53,7 @@ retry:
if (!was_locked)
atomic_set(&dump_lock, -1);
- preempt_enable();
+ local_irq_restore(flags);
}
#else
asmlinkage __visible void dump_stack(void)
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06b15c0..03cbfa072f42 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
bool
config DEFERRED_STRUCT_PAGE_INIT
- bool "Defer initialisation of struct pages to kswapd"
+ bool "Defer initialisation of struct pages to kthreads"
default n
depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
depends on MEMORY_HOTPLUG
@@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
a subset of memmap at boot and then initialise the rest in parallel
- when kswapd starts. This has a potential performance impact on
- processes running early in the lifetime of the systemm until kswapd
- finishes the initialisation.
+ by starting one-off "pgdatinitX" kernel thread for each node X. This
+ has a potential performance impact on processes running early in the
+ lifetime of the system until these kthreads finish the
+ initialisation.
config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index cc5d29d2da9b..926c76d56388 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
* here rather than calling cond_resched().
*/
if (current->flags & PF_WQ_WORKER)
- schedule_timeout(1);
+ schedule_timeout_uninterruptible(1);
else
cond_resched();
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 36c070167b71..08fc0ba2207e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3482,7 +3482,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &list) {
+ list_for_each_safe(pos, next, &pgdata->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
if (get_page_unless_zero(page)) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 12908dcf5831..06ae13e869d0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
@@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page)
set_page_private(page, 0);
page->mapping = NULL;
- BUG_ON(page_count(page));
- BUG_ON(page_mapcount(page));
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(page_mapcount(page), page);
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
@@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
set_page_count(p, 0);
set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
/*
diff --git a/mm/memblock.c b/mm/memblock.c
index d2ed81e59a94..dd7989929f13 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
* Remaining API functions
*/
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
{
return memblock.memory.total_size;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 27d135408a22..4c4187c0e1de 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -548,8 +548,7 @@ retry:
goto retry;
}
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, qp->pagelist, flags);
+ migrate_page_add(page, qp->pagelist, flags);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
- if (vma->vm_flags & VM_PFNMAP)
+ if (!vma_migratable(vma))
return 1;
if (endvma > end)
@@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma_migratable(vma) &&
- vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
change_prot_numa(vma, start, endvma);
return 1;
}
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))
- /* queue pages from current vma */
+ /* queue pages from current vma */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return 0;
return 1;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index cfc0cdca421e..2f2415a7a688 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -390,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
}
#ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
@@ -414,12 +415,14 @@ static int browse_rb(struct rb_root *root)
vma->vm_start, vma->vm_end);
bug = 1;
}
+ spin_lock(&mm->page_table_lock);
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
}
+ spin_unlock(&mm->page_table_lock);
i++;
pn = nd;
prev = vma->vm_start;
@@ -456,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
struct vm_area_struct *vma = mm->mmap;
while (vma) {
+ struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
- vma_lock_anon_vma(vma);
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_verify(avc);
- vma_unlock_anon_vma(vma);
+ if (anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ anon_vma_unlock_read(anon_vma);
+ }
+
highest_address = vma->vm_end;
vma = vma->vm_next;
i++;
@@ -475,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
mm->highest_vm_end, highest_address);
bug = 1;
}
- i = browse_rb(&mm->mm_rb);
+ i = browse_rb(mm);
if (i != mm->map_count) {
if (i != -1)
pr_emerg("map_count %d rb %d\n", mm->map_count, i);
@@ -2142,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- int error;
+ int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
- */
+ /* Guard against wrapping around to address 0. */
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4);
+ else
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
- * Also guard against wrapping around to address 0.
*/
- if (address < PAGE_ALIGN(address+4))
- address = PAGE_ALIGN(address+4);
- else {
- vma_unlock_anon_vma(vma);
- return -ENOMEM;
- }
- error = 0;
+ anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) {
@@ -2185,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
- * vma_lock_anon_vma() doesn't help here, as
+ * anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
@@ -2208,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
}
- vma_unlock_anon_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
@@ -2224,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
int error;
- /*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
- */
- if (unlikely(anon_vma_prepare(vma)))
- return -ENOMEM;
-
address &= PAGE_MASK;
error = security_mmap_addr(address);
if (error)
return error;
- vma_lock_anon_vma(vma);
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma)))
+ return -ENOMEM;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
+ anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
@@ -2260,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
- * vma_lock_anon_vma() doesn't help here, as
+ * anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
@@ -2281,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
}
- vma_unlock_anon_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ea2c4d3e0c03..838ca8bb64f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6620,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page)
return !has_unmovable_pages(zone, page, 0, true);
}
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb3dd37ccd7c..71b1c29948db 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page)
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
- VM_BUG_ON_PAGE(PageTail(page), page);
+ WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 40b2c74ddf16..084c6725b373 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w)
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
* update worker thread.
+ * If we were marked on cpu_stat_off clear the flag
+ * so that vmstat_shepherd doesn't schedule us again.
*/
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
- this_cpu_ptr(&vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+ cpu_stat_off)) {
+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ }
} else {
/*
* We did not update any counters so the app may be in
@@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w)
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
-void quiet_vmstat(void)
-{
- if (system_state != SYSTEM_RUNNING)
- return;
-
- do {
- if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
- cancel_delayed_work(this_cpu_ptr(&vmstat_work));
-
- } while (refresh_cpu_vm_stats(false));
-}
-
/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
@@ -1452,6 +1445,30 @@ static bool need_update(int cpu)
return false;
}
+void quiet_vmstat(void)
+{
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ /*
+ * If we are already in hands of the shepherd then there
+ * is nothing for us to do here.
+ */
+ if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ return;
+
+ if (!need_update(smp_processor_id()))
+ return;
+
+ /*
+ * Just refresh counters and do not care about the pending delayed
+ * vmstat_update. It doesn't fire that often to matter and canceling
+ * it would be too expensive from this path.
+ * vmstat_shepherd will take care about that for us.
+ */
+ refresh_cpu_vm_stats(false);
+}
+
/*
* Shepherd worker thread that checks the
@@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w)
get_online_cpus();
/* Check processors whose vmstat worker threads have been disabled */
- for_each_cpu(cpu, cpu_stat_off)
- if (need_update(cpu) &&
- cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-
- queue_delayed_work_on(cpu, vmstat_wq,
- &per_cpu(vmstat_work, cpu), 0);
+ for_each_cpu(cpu, cpu_stat_off) {
+ struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ if (need_update(cpu)) {
+ if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+ queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+ } else {
+ /*
+ * Cancel the work if quiet_vmstat has put this
+ * cpu on cpu_stat_off because the work item might
+ * be still scheduled
+ */
+ cancel_delayed_work(dw);
+ }
+ }
put_online_cpus();
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
-
}
static void __init start_shepherd_timer(void)
@@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void)
int cpu;
for_each_possible_cpu(cpu)
- INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))