aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/huge_memory.c92
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/kmemleak.c7
-rw-r--r--mm/memory.c29
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mlock.c18
-rw-r--r--mm/mmap.c26
-rw-r--r--mm/oom_kill.c37
-rw-r--r--mm/page_alloc.c62
-rw-r--r--mm/page_cgroup.c2
-rw-r--r--mm/prio_tree.c1
-rw-r--r--mm/shmem.c154
-rw-r--r--mm/slab.c1
-rw-r--r--mm/slub.c169
-rw-r--r--mm/swap.c3
-rw-r--r--mm/vmscan.c27
-rw-r--r--mm/vmstat.c18
17 files changed, 342 insertions, 308 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0a619e0e2e0b..83326ad66d9b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -244,24 +244,28 @@ static ssize_t single_flag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf,
enum transparent_hugepage_flag flag)
{
- if (test_bit(flag, &transparent_hugepage_flags))
- return sprintf(buf, "[yes] no\n");
- else
- return sprintf(buf, "yes [no]\n");
+ return sprintf(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
}
+
static ssize_t single_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag flag)
{
- if (!memcmp("yes", buf,
- min(sizeof("yes")-1, count))) {
+ unsigned long value;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value)
set_bit(flag, &transparent_hugepage_flags);
- } else if (!memcmp("no", buf,
- min(sizeof("no")-1, count))) {
+ else
clear_bit(flag, &transparent_hugepage_flags);
- } else
- return -EINVAL;
return count;
}
@@ -680,8 +684,11 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
vma, haddr, numa_node_id(), 0);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
put_page(page);
goto out;
@@ -909,11 +916,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
new_page = NULL;
if (unlikely(!new_page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
put_page(page);
goto out;
}
+ count_vm_event(THP_FAULT_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
@@ -1390,6 +1399,7 @@ int split_huge_page(struct page *page)
BUG_ON(!PageSwapBacked(page));
__split_huge_page(page, anon_vma);
+ count_vm_event(THP_SPLIT);
BUG_ON(PageCompound(page));
out_unlock:
@@ -1398,6 +1408,9 @@ out:
return ret;
}
+#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
+ VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
@@ -1406,11 +1419,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
/*
* Be somewhat over-protective like KSM for now!
*/
- if (*vm_flags & (VM_HUGEPAGE |
- VM_SHARED | VM_MAYSHARE |
- VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_MIXEDMAP | VM_SAO))
+ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
@@ -1426,11 +1435,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
/*
* Be somewhat over-protective like KSM for now!
*/
- if (*vm_flags & (VM_NOHUGEPAGE |
- VM_SHARED | VM_MAYSHARE |
- VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_MIXEDMAP | VM_SAO))
+ if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
return -EINVAL;
*vm_flags &= ~VM_HUGEPAGE;
*vm_flags |= VM_NOHUGEPAGE;
@@ -1564,10 +1569,14 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
* page fault if needed.
*/
return 0;
- if (vma->vm_file || vma->vm_ops)
+ if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -1784,9 +1793,11 @@ static void collapse_huge_page(struct mm_struct *mm,
node, __GFP_OTHER_NODE);
if (unlikely(!new_page)) {
up_read(&mm->mmap_sem);
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
return;
}
+ count_vm_event(THP_COLLAPSE_ALLOC);
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
up_read(&mm->mmap_sem);
put_page(new_page);
@@ -1816,12 +1827,15 @@ static void collapse_huge_page(struct mm_struct *mm,
(vma->vm_flags & VM_NOHUGEPAGE))
goto out;
- /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
- if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ if (!vma->anon_vma || vma->vm_ops)
goto out;
if (is_vma_temporary_stack(vma))
goto out;
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -2054,13 +2068,16 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
continue;
}
- /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
- if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+ if (!vma->anon_vma || vma->vm_ops)
goto skip;
if (is_vma_temporary_stack(vma))
goto skip;
-
- VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+ /*
+ * If is_pfn_mapping() is true is_learn_pfn_mapping()
+ * must be true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) ||
+ vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2151,8 +2168,11 @@ static void khugepaged_do_scan(struct page **hpage)
#ifndef CONFIG_NUMA
if (!*hpage) {
*hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage))
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
break;
+ }
+ count_vm_event(THP_COLLAPSE_ALLOC);
}
#else
if (IS_ERR(*hpage))
@@ -2192,8 +2212,11 @@ static struct page *khugepaged_alloc_hugepage(void)
do {
hpage = alloc_hugepage(khugepaged_defrag());
- if (!hpage)
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
} while (unlikely(!hpage) &&
likely(khugepaged_enabled()));
return hpage;
@@ -2210,8 +2233,11 @@ static void khugepaged_loop(void)
while (likely(khugepaged_enabled())) {
#ifndef CONFIG_NUMA
hpage = khugepaged_alloc_hugepage();
- if (unlikely(!hpage))
+ if (unlikely(!hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
break;
+ }
+ count_vm_event(THP_COLLAPSE_ALLOC);
#else
if (IS_ERR(hpage)) {
khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8ee3bd8ec5b5..bbb4a5bbb958 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -475,7 +475,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
/* If reserves cannot be used, ensure enough pages are in the pool */
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
- goto err;;
+ goto err;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c1d5867543e4..aacee45616fc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1414,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++(*pos);
list_for_each_continue_rcu(n, &object_list) {
- next_obj = list_entry(n, struct kmemleak_object, object_list);
- if (get_object(next_obj))
+ struct kmemleak_object *obj =
+ list_entry(n, struct kmemleak_object, object_list);
+ if (get_object(obj)) {
+ next_obj = obj;
break;
+ }
}
put_object(prev_obj);
diff --git a/mm/memory.c b/mm/memory.c
index 9da8cab1b1b0..61e66f026563 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1359,7 +1359,7 @@ split_fallthrough:
*/
mark_page_accessed(page);
}
- if (flags & FOLL_MLOCK) {
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -1410,6 +1410,12 @@ no_page_table:
return page;
}
+static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ return stack_guard_page_start(vma, addr) ||
+ stack_guard_page_end(vma, addr+PAGE_SIZE);
+}
+
/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
@@ -1488,7 +1494,6 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(mm, start)) {
unsigned long pg = start & PAGE_MASK;
- struct vm_area_struct *gate_vma = get_gate_vma(mm);
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
@@ -1513,10 +1518,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pte_unmap(pte);
return i ? : -EFAULT;
}
+ vma = get_gate_vma(mm);
if (pages) {
struct page *page;
- page = vm_normal_page(gate_vma, start, *pte);
+ page = vm_normal_page(vma, start, *pte);
if (!page) {
if (!(gup_flags & FOLL_DUMP) &&
is_zero_pfn(pte_pfn(*pte)))
@@ -1530,12 +1536,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
get_page(page);
}
pte_unmap(pte);
- if (vmas)
- vmas[i] = gate_vma;
- i++;
- start += PAGE_SIZE;
- nr_pages--;
- continue;
+ goto next_page;
}
if (!vma ||
@@ -1565,6 +1566,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
int ret;
unsigned int fault_flags = 0;
+ /* For mlock, just skip the stack guard page. */
+ if (foll_flags & FOLL_MLOCK) {
+ if (stack_guard_page(vma, start))
+ goto next_page;
+ }
if (foll_flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (nonblocking)
@@ -1631,6 +1637,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
flush_anon_page(vma, page, start);
flush_dcache_page(page);
}
+next_page:
if (vmas)
vmas[i] = vma;
i++;
@@ -3386,7 +3393,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -3678,7 +3685,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
*/
#ifdef CONFIG_HAVE_IOREMAP_PROT
vma = find_vma(mm, addr);
- if (!vma)
+ if (!vma || vma->vm_start > addr)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a2acaf820fe5..9ca1d604f7cd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -375,7 +375,7 @@ void online_page(struct page *page)
#endif
#ifdef CONFIG_FLATMEM
- max_mapnr = max(page_to_pfn(page), max_mapnr);
+ max_mapnr = max(pfn, max_mapnr);
#endif
ClearPageReserved(page);
diff --git a/mm/mlock.c b/mm/mlock.c
index 2689a08c79af..516b2c2ddd5a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page)
}
}
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
- return (vma->vm_flags & VM_GROWSDOWN) &&
- (vma->vm_start == addr) &&
- !vma_stack_continue(vma->vm_prev, addr);
-}
-
/**
* __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
@@ -169,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(end > vma->vm_end);
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- gup_flags = FOLL_TOUCH;
+ gup_flags = FOLL_TOUCH | FOLL_MLOCK;
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -185,15 +178,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
gup_flags |= FOLL_FORCE;
- if (vma->vm_flags & VM_LOCKED)
- gup_flags |= FOLL_MLOCK;
-
- /* We don't try to access the guard page of a stack vma */
- if (stack_guard_page(vma, start)) {
- addr += PAGE_SIZE;
- nr_pages--;
- }
-
return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
NULL, NULL, nonblocking);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 2ec8eb5a9cdd..772140c53ab1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -259,7 +259,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
* randomize_va_space to 2, which will still cause mm->start_brk
* to be arbitrarily shifted
*/
- if (mm->start_brk > PAGE_ALIGN(mm->end_data))
+ if (current->brk_randomized)
min_brk = mm->start_brk;
else
min_brk = mm->end_data;
@@ -1767,10 +1767,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT;
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- vma->vm_end = address;
- perf_event_mmap(vma);
+ error = -ENOMEM;
+ if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ vma->vm_end = address;
+ perf_event_mmap(vma);
+ }
}
}
vma_unlock_anon_vma(vma);
@@ -1814,11 +1817,14 @@ static int expand_downwards(struct vm_area_struct *vma,
size = vma->vm_end - address;
grow = (vma->vm_start - address) >> PAGE_SHIFT;
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
- perf_event_mmap(vma);
+ error = -ENOMEM;
+ if (grow <= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow;
+ perf_event_mmap(vma);
+ }
}
}
vma_unlock_anon_vma(vma);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6a819d1b2c7d..f52e85c80e8d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -84,24 +84,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
#endif /* CONFIG_NUMA */
/*
- * If this is a system OOM (not a memcg OOM) and the task selected to be
- * killed is not already running at high (RT) priorities, speed up the
- * recovery by boosting the dying task to the lowest FIFO priority.
- * That helps with the recovery and avoids interfering with RT tasks.
- */
-static void boost_dying_task_prio(struct task_struct *p,
- struct mem_cgroup *mem)
-{
- struct sched_param param = { .sched_priority = 1 };
-
- if (mem)
- return;
-
- if (!rt_task(p))
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
-}
-
-/*
* The process p may have detached its own ->mm while exiting or through
* use_mm(), but one or more of its subthreads may still have a valid
* pointer. Return p, or any of its subthreads with a valid ->mm, with
@@ -190,10 +172,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
/*
* The baseline for the badness score is the proportion of RAM that each
- * task's rss and swap space use.
+ * task's rss, pagetable and swap space use.
*/
- points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
- totalpages;
+ points = get_mm_rss(p->mm) + p->mm->nr_ptes;
+ points += get_mm_counter(p->mm, MM_SWAPENTS);
+
+ points *= 1000;
+ points /= totalpages;
task_unlock(p);
/*
@@ -452,13 +437,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
- /*
- * We give our sacrificial lamb high priority and access to
- * all the memory it needs. That way it should be able to
- * exit() and clear out its resources quickly...
- */
- boost_dying_task_prio(p, mem);
-
return 0;
}
#undef K
@@ -482,7 +460,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
*/
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
- boost_dying_task_prio(p, mem);
return 0;
}
@@ -556,7 +533,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
*/
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
- boost_dying_task_prio(current, NULL);
return;
}
@@ -712,7 +688,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
- boost_dying_task_prio(current, NULL);
return;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2747f5e5abc1..9d5498e2d0f5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,7 @@
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
#include <linux/memcontrol.h>
+#include <linux/prefetch.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -2317,6 +2318,21 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
+static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+{
+ if (addr) {
+ unsigned long alloc_end = addr + (PAGE_SIZE << order);
+ unsigned long used = addr + PAGE_ALIGN(size);
+
+ split_page(virt_to_page((void *)addr), order);
+ while (used < alloc_end) {
+ free_page(used);
+ used += PAGE_SIZE;
+ }
+ }
+ return (void *)addr;
+}
+
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
@@ -2336,22 +2352,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned long addr;
addr = __get_free_pages(gfp_mask, order);
- if (addr) {
- unsigned long alloc_end = addr + (PAGE_SIZE << order);
- unsigned long used = addr + PAGE_ALIGN(size);
-
- split_page(virt_to_page((void *)addr), order);
- while (used < alloc_end) {
- free_page(used);
- used += PAGE_SIZE;
- }
- }
-
- return (void *)addr;
+ return make_alloc_exact(addr, order, size);
}
EXPORT_SYMBOL(alloc_pages_exact);
/**
+ * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
+ * pages on a node.
+ * @nid: the preferred node ID where memory should be allocated
+ * @size: the number of bytes to allocate
+ * @gfp_mask: GFP flags for the allocation
+ *
+ * Like alloc_pages_exact(), but try to allocate on node nid first before falling
+ * back.
+ * Note this is not alloc_pages_exact_node() which allocates on a specific node,
+ * but is not exact.
+ */
+void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+{
+ unsigned order = get_order(size);
+ struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ if (!p)
+ return NULL;
+ return make_alloc_exact((unsigned long)page_address(p), order, size);
+}
+EXPORT_SYMBOL(alloc_pages_exact_nid);
+
+/**
* free_pages_exact - release memory allocated via alloc_pages_exact()
* @virt: the value returned by alloc_pages_exact.
* @size: size of allocation, same value as passed to alloc_pages_exact().
@@ -3176,7 +3203,7 @@ static __init_refok int __build_all_zonelists(void *data)
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
-void build_all_zonelists(void *data)
+void __ref build_all_zonelists(void *data)
{
set_zonelist_order();
@@ -3514,7 +3541,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
pcp->batch = PAGE_SHIFT * 8;
}
-static __meminit void setup_zone_pageset(struct zone *zone)
+static void setup_zone_pageset(struct zone *zone)
{
int cpu;
@@ -3564,7 +3591,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
if (!slab_is_available()) {
zone->wait_table = (wait_queue_head_t *)
- alloc_bootmem_node(pgdat, alloc_size);
+ alloc_bootmem_node_nopanic(pgdat, alloc_size);
} else {
/*
* This case means that a zone whose size was 0 gets new memory
@@ -4141,7 +4168,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
unsigned long usemapsize = usemap_size(zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
- zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+ zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
+ usemapsize);
}
#else
static inline void setup_usemap(struct pglist_data *pgdat,
@@ -4307,7 +4335,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
- map = alloc_bootmem_node(pgdat, size);
+ map = alloc_bootmem_node_nopanic(pgdat, size);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 99055010cece..2daadc322ba6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -134,7 +134,7 @@ static void *__init_refok alloc_page_cgroup(size_t size, int nid)
{
void *addr = NULL;
- addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN);
+ addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
if (addr)
return addr;
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
index 603ae98d9694..799dcfd7cd8c 100644
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -13,6 +13,7 @@
#include <linux/mm.h>
#include <linux/prio_tree.h>
+#include <linux/prefetch.h>
/*
* See lib/prio_tree.c for details on the general radix priority search tree
diff --git a/mm/shmem.c b/mm/shmem.c
index 58da7c150ba6..ba4ad28b7db6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -421,7 +421,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
* a waste to allocate index if we cannot allocate data.
*/
if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks - 1) >= 0)
return ERR_PTR(-ENOSPC);
percpu_counter_inc(&sbinfo->used_blocks);
spin_lock(&inode->i_lock);
@@ -851,7 +852,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_
static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
{
- struct inode *inode;
+ struct address_space *mapping;
unsigned long idx;
unsigned long size;
unsigned long limit;
@@ -874,8 +875,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
if (size > SHMEM_NR_DIRECT)
size = SHMEM_NR_DIRECT;
offset = shmem_find_swp(entry, ptr, ptr+size);
- if (offset >= 0)
+ if (offset >= 0) {
+ shmem_swp_balance_unmap();
goto found;
+ }
if (!info->i_indirect)
goto lost2;
@@ -916,6 +919,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
shmem_swp_unmap(ptr);
if (offset >= 0) {
shmem_dir_unmap(dir);
+ ptr = shmem_swp_map(subdir);
goto found;
}
}
@@ -927,8 +931,7 @@ lost2:
return 0;
found:
idx += offset;
- inode = igrab(&info->vfs_inode);
- spin_unlock(&info->lock);
+ ptr += offset;
/*
* Move _head_ to start search for next from here.
@@ -939,37 +942,18 @@ found:
*/
if (shmem_swaplist.next != &info->swaplist)
list_move_tail(&shmem_swaplist, &info->swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
- error = 1;
- if (!inode)
- goto out;
/*
- * Charge page using GFP_KERNEL while we can wait.
- * Charged back to the user(not to caller) when swap account is used.
- * add_to_page_cache() will be called with GFP_NOWAIT.
+ * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
+ * but also to hold up shmem_evict_inode(): so inode cannot be freed
+ * beneath us (pagelock doesn't help until the page is in pagecache).
*/
- error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
- if (error)
- goto out;
- error = radix_tree_preload(GFP_KERNEL);
- if (error) {
- mem_cgroup_uncharge_cache_page(page);
- goto out;
- }
- error = 1;
-
- spin_lock(&info->lock);
- ptr = shmem_swp_entry(info, idx, NULL);
- if (ptr && ptr->val == entry.val) {
- error = add_to_page_cache_locked(page, inode->i_mapping,
- idx, GFP_NOWAIT);
- /* does mem_cgroup_uncharge_cache_page on error */
- } else /* we must compensate for our precharge above */
- mem_cgroup_uncharge_cache_page(page);
+ mapping = info->vfs_inode.i_mapping;
+ error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
+ /* which does mem_cgroup_uncharge_cache_page on error */
if (error == -EEXIST) {
- struct page *filepage = find_get_page(inode->i_mapping, idx);
+ struct page *filepage = find_get_page(mapping, idx);
error = 1;
if (filepage) {
/*
@@ -989,14 +973,8 @@ found:
swap_free(entry);
error = 1; /* not an error, but entry was found */
}
- if (ptr)
- shmem_swp_unmap(ptr);
+ shmem_swp_unmap(ptr);
spin_unlock(&info->lock);
- radix_tree_preload_end();
-out:
- unlock_page(page);
- page_cache_release(page);
- iput(inode); /* allows for NULL */
return error;
}
@@ -1008,6 +986,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
struct list_head *p, *next;
struct shmem_inode_info *info;
int found = 0;
+ int error;
+
+ /*
+ * Charge page using GFP_KERNEL while we can wait, before taking
+ * the shmem_swaplist_mutex which might hold up shmem_writepage().
+ * Charged back to the user (not to caller) when swap account is used.
+ * add_to_page_cache() will be called with GFP_NOWAIT.
+ */
+ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
+ if (error)
+ goto out;
+ /*
+ * Try to preload while we can wait, to not make a habit of
+ * draining atomic reserves; but don't latch on to this cpu,
+ * it's okay if sometimes we get rescheduled after this.
+ */
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error)
+ goto uncharge;
+ radix_tree_preload_end();
mutex_lock(&shmem_swaplist_mutex);
list_for_each_safe(p, next, &shmem_swaplist) {
@@ -1015,17 +1013,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
found = shmem_unuse_inode(info, entry, page);
cond_resched();
if (found)
- goto out;
+ break;
}
mutex_unlock(&shmem_swaplist_mutex);
- /*
- * Can some race bring us here? We've been holding page lock,
- * so I think not; but would rather try again later than BUG()
- */
+
+uncharge:
+ if (!found)
+ mem_cgroup_uncharge_cache_page(page);
+ if (found < 0)
+ error = found;
+out:
unlock_page(page);
page_cache_release(page);
-out:
- return (found < 0) ? found : 0;
+ return error;
}
/*
@@ -1063,7 +1063,25 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
else
swap.val = 0;
+ /*
+ * Add inode to shmem_unuse()'s list of swapped-out inodes,
+ * if it's not already there. Do it now because we cannot take
+ * mutex while holding spinlock, and must do so before the page
+ * is moved to swap cache, when its pagelock no longer protects
+ * the inode from eviction. But don't unlock the mutex until
+ * we've taken the spinlock, because shmem_unuse_inode() will
+ * prune a !swapped inode from the swaplist under both locks.
+ */
+ if (swap.val) {
+ mutex_lock(&shmem_swaplist_mutex);
+ if (list_empty(&info->swaplist))
+ list_add_tail(&info->swaplist, &shmem_swaplist);
+ }
+
spin_lock(&info->lock);
+ if (swap.val)
+ mutex_unlock(&shmem_swaplist_mutex);
+
if (index >= info->next_index) {
BUG_ON(!(info->flags & SHMEM_TRUNCATE));
goto unlock;
@@ -1083,21 +1101,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
delete_from_page_cache(page);
shmem_swp_set(info, entry, swap.val);
shmem_swp_unmap(entry);
- if (list_empty(&info->swaplist))
- inode = igrab(inode);
- else
- inode = NULL;
spin_unlock(&info->lock);
swap_shmem_alloc(swap);
BUG_ON(page_mapped(page));
swap_writepage(page, wbc);
- if (inode) {
- mutex_lock(&shmem_swaplist_mutex);
- /* move instead of add in case we're racing */
- list_move_tail(&info->swaplist, &shmem_swaplist);
- mutex_unlock(&shmem_swaplist_mutex);
- iput(inode);
- }
return 0;
}
@@ -1397,21 +1404,16 @@ repeat:
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
- if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
- shmem_acct_block(info->flags)) {
- spin_unlock(&info->lock);
- error = -ENOSPC;
- goto failed;
- }
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks) >= 0 ||
+ shmem_acct_block(info->flags))
+ goto nospace;
percpu_counter_inc(&sbinfo->used_blocks);
spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE;
spin_unlock(&inode->i_lock);
- } else if (shmem_acct_block(info->flags)) {
- spin_unlock(&info->lock);
- error = -ENOSPC;
- goto failed;
- }
+ } else if (shmem_acct_block(info->flags))
+ goto nospace;
if (!filepage) {
int ret;
@@ -1491,6 +1493,24 @@ done:
error = 0;
goto out;
+nospace:
+ /*
+ * Perhaps the page was brought in from swap between find_lock_page
+ * and taking info->lock? We allow for that at add_to_page_cache_lru,
+ * but must also avoid reporting a spurious ENOSPC while working on a
+ * full tmpfs. (When filepage has been passed in to shmem_getpage, it
+ * is already in page cache, which prevents this race from occurring.)
+ */
+ if (!filepage) {
+ struct page *page = find_get_page(mapping, idx);
+ if (page) {
+ spin_unlock(&info->lock);
+ page_cache_release(page);
+ goto repeat;
+ }
+ }
+ spin_unlock(&info->lock);
+ error = -ENOSPC;
failed:
if (*pagep != filepage) {
unlock_page(filepage);
diff --git a/mm/slab.c b/mm/slab.c
index 46a9c163a92f..bcfa4987c8ae 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
#include <linux/debugobjects.h>
#include <linux/kmemcheck.h>
#include <linux/memory.h>
+#include <linux/prefetch.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
diff --git a/mm/slub.c b/mm/slub.c
index 94d2a33a866e..4ea7f1a22a94 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -261,6 +261,18 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
return *(void **)(object + s->offset);
}
+static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
+{
+ void *p;
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
+#else
+ p = get_freepointer(s, object);
+#endif
+ return p;
+}
+
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
*(void **)(object + s->offset) = fp;
@@ -271,10 +283,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
__p += (__s)->size)
-/* Scan freelist */
-#define for_each_free_object(__p, __s, __free) \
- for (__p = (__free); __p; __p = get_freepointer((__s), __p))
-
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
{
@@ -332,6 +340,21 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
#ifdef CONFIG_SLUB_DEBUG
/*
+ * Determine a map of object in use on a page.
+ *
+ * Slab lock or node listlock must be held to guarantee that the page does
+ * not vanish from under us.
+ */
+static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
+{
+ void *p;
+ void *addr = page_address(page);
+
+ for (p = page->freelist; p; p = get_freepointer(s, p))
+ set_bit(slab_index(p, s, addr), map);
+}
+
+/*
* Debug settings:
*/
#ifdef CONFIG_SLUB_DEBUG_ON
@@ -1487,7 +1510,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
page = get_partial_node(get_node(s, searchnode));
- if (page || node != -1)
+ if (page || node != NUMA_NO_NODE)
return page;
return get_any_partial(s, flags);
@@ -1540,7 +1563,6 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
}
}
-#ifdef CONFIG_CMPXCHG_LOCAL
#ifdef CONFIG_PREEMPT
/*
* Calculate the next globally unique transaction for disambiguiation
@@ -1600,17 +1622,12 @@ static inline void note_cmpxchg_failure(const char *n,
stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}
-#endif
-
void init_kmem_cache_cpus(struct kmem_cache *s)
{
-#ifdef CONFIG_CMPXCHG_LOCAL
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
-#endif
-
}
/*
* Remove the cpu slab
@@ -1643,9 +1660,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
page->inuse--;
}
c->page = NULL;
-#ifdef CONFIG_CMPXCHG_LOCAL
c->tid = next_tid(c->tid);
-#endif
unfreeze_slab(s, page, tail);
}
@@ -1779,8 +1794,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void **object;
- struct page *new;
-#ifdef CONFIG_CMPXCHG_LOCAL
+ struct page *page;
unsigned long flags;
local_irq_save(flags);
@@ -1792,37 +1806,35 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
*/
c = this_cpu_ptr(s->cpu_slab);
#endif
-#endif
/* We handle __GFP_ZERO in the caller */
gfpflags &= ~__GFP_ZERO;
- if (!c->page)
+ page = c->page;
+ if (!page)
goto new_slab;
- slab_lock(c->page);
+ slab_lock(page);
if (unlikely(!node_match(c, node)))
goto another_slab;
stat(s, ALLOC_REFILL);
load_freelist:
- object = c->page->freelist;
+ object = page->freelist;
if (unlikely(!object))
goto another_slab;
if (kmem_cache_debug(s))
goto debug;
c->freelist = get_freepointer(s, object);
- c->page->inuse = c->page->objects;
- c->page->freelist = NULL;
- c->node = page_to_nid(c->page);
+ page->inuse = page->objects;
+ page->freelist = NULL;
+
unlock_out:
- slab_unlock(c->page);
-#ifdef CONFIG_CMPXCHG_LOCAL
+ slab_unlock(page);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
-#endif
stat(s, ALLOC_SLOWPATH);
return object;
@@ -1830,10 +1842,11 @@ another_slab:
deactivate_slab(s, c);
new_slab:
- new = get_partial(s, gfpflags, node);
- if (new) {
- c->page = new;
+ page = get_partial(s, gfpflags, node);
+ if (page) {
stat(s, ALLOC_FROM_PARTIAL);
+ c->node = page_to_nid(page);
+ c->page = page;
goto load_freelist;
}
@@ -1841,33 +1854,35 @@ new_slab:
if (gfpflags & __GFP_WAIT)
local_irq_enable();
- new = new_slab(s, gfpflags, node);
+ page = new_slab(s, gfpflags, node);
if (gfpflags & __GFP_WAIT)
local_irq_disable();
- if (new) {
+ if (page) {
c = __this_cpu_ptr(s->cpu_slab);
stat(s, ALLOC_SLAB);
if (c->page)
flush_slab(s, c);
- slab_lock(new);
- __SetPageSlubFrozen(new);
- c->page = new;
+
+ slab_lock(page);
+ __SetPageSlubFrozen(page);
+ c->node = page_to_nid(page);
+ c->page = page;
goto load_freelist;
}
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(s, gfpflags, node);
-#ifdef CONFIG_CMPXCHG_LOCAL
local_irq_restore(flags);
-#endif
return NULL;
debug:
- if (!alloc_debug_processing(s, c->page, object, addr))
+ if (!alloc_debug_processing(s, page, object, addr))
goto another_slab;
- c->page->inuse++;
- c->page->freelist = get_freepointer(s, object);
+ page->inuse++;
+ page->freelist = get_freepointer(s, object);
+ deactivate_slab(s, c);
+ c->page = NULL;
c->node = NUMA_NO_NODE;
goto unlock_out;
}
@@ -1887,20 +1902,12 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
{
void **object;
struct kmem_cache_cpu *c;
-#ifdef CONFIG_CMPXCHG_LOCAL
unsigned long tid;
-#else
- unsigned long flags;
-#endif
if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
-#ifndef CONFIG_CMPXCHG_LOCAL
- local_irq_save(flags);
-#else
redo:
-#endif
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -1910,7 +1917,6 @@ redo:
*/
c = __this_cpu_ptr(s->cpu_slab);
-#ifdef CONFIG_CMPXCHG_LOCAL
/*
* The transaction ids are globally unique per cpu and per operation on
* a per cpu queue. Thus they can be guarantee that the cmpxchg_double
@@ -1919,7 +1925,6 @@ redo:
*/
tid = c->tid;
barrier();
-#endif
object = c->freelist;
if (unlikely(!object || !node_match(c, node)))
@@ -1927,7 +1932,6 @@ redo:
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
-#ifdef CONFIG_CMPXCHG_LOCAL
/*
* The cmpxchg will only match if there was no additional
* operation and if we are on the right processor.
@@ -1940,24 +1944,17 @@ redo:
* Since this is without lock semantics the protection is only against
* code executing on this cpu *not* from access by other cpus.
*/
- if (unlikely(!this_cpu_cmpxchg_double(
+ if (unlikely(!irqsafe_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
object, tid,
- get_freepointer(s, object), next_tid(tid)))) {
+ get_freepointer_safe(s, object), next_tid(tid)))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
-#else
- c->freelist = get_freepointer(s, object);
-#endif
stat(s, ALLOC_FASTPATH);
}
-#ifndef CONFIG_CMPXCHG_LOCAL
- local_irq_restore(flags);
-#endif
-
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->objsize);
@@ -2034,18 +2031,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
{
void *prior;
void **object = (void *)x;
-#ifdef CONFIG_CMPXCHG_LOCAL
unsigned long flags;
local_irq_save(flags);
-#endif
slab_lock(page);
stat(s, FREE_SLOWPATH);
- if (kmem_cache_debug(s))
- goto debug;
+ if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
+ goto out_unlock;
-checks_ok:
prior = page->freelist;
set_freepointer(s, object, prior);
page->freelist = object;
@@ -2070,9 +2064,7 @@ checks_ok:
out_unlock:
slab_unlock(page);
-#ifdef CONFIG_CMPXCHG_LOCAL
local_irq_restore(flags);
-#endif
return;
slab_empty:
@@ -2084,17 +2076,9 @@ slab_empty:
stat(s, FREE_REMOVE_PARTIAL);
}
slab_unlock(page);
-#ifdef CONFIG_CMPXCHG_LOCAL
local_irq_restore(flags);
-#endif
stat(s, FREE_SLAB);
discard_slab(s, page);
- return;
-
-debug:
- if (!free_debug_processing(s, page, x, addr))
- goto out_unlock;
- goto checks_ok;
}
/*
@@ -2113,20 +2097,11 @@ static __always_inline void slab_free(struct kmem_cache *s,
{
void **object = (void *)x;
struct kmem_cache_cpu *c;
-#ifdef CONFIG_CMPXCHG_LOCAL
unsigned long tid;
-#else
- unsigned long flags;
-#endif
slab_free_hook(s, x);
-#ifndef CONFIG_CMPXCHG_LOCAL
- local_irq_save(flags);
-
-#else
redo:
-#endif
/*
* Determine the currently cpus per cpu slab.
@@ -2136,16 +2111,13 @@ redo:
*/
c = __this_cpu_ptr(s->cpu_slab);
-#ifdef CONFIG_CMPXCHG_LOCAL
tid = c->tid;
barrier();
-#endif
- if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
+ if (likely(page == c->page)) {
set_freepointer(s, object, c->freelist);
-#ifdef CONFIG_CMPXCHG_LOCAL
- if (unlikely(!this_cpu_cmpxchg_double(
+ if (unlikely(!irqsafe_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
c->freelist, tid,
object, next_tid(tid)))) {
@@ -2153,16 +2125,10 @@ redo:
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
-#else
- c->freelist = object;
-#endif
stat(s, FREE_FASTPATH);
} else
__slab_free(s, page, x, addr);
-#ifndef CONFIG_CMPXCHG_LOCAL
- local_irq_restore(flags);
-#endif
}
void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2673,9 +2639,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
return;
slab_err(s, page, "%s", text);
slab_lock(page);
- for_each_free_object(p, s, page->freelist)
- set_bit(slab_index(p, s, addr), map);
+ get_map(s, page, map);
for_each_object(p, s, addr, page->objects) {
if (!test_bit(slab_index(p, s, addr), map)) {
@@ -3203,7 +3168,7 @@ static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
list_for_each_entry(p, &n->partial, lru)
p->slab = s;
-#ifdef CONFIG_SLAB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG
list_for_each_entry(p, &n->full, lru)
p->slab = s;
#endif
@@ -3610,10 +3575,11 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
/* Now we know that a valid freelist exists */
bitmap_zero(map, page->objects);
- for_each_free_object(p, s, page->freelist) {
- set_bit(slab_index(p, s, addr), map);
- if (!check_object(s, page, p, SLUB_RED_INACTIVE))
- return 0;
+ get_map(s, page, map);
+ for_each_object(p, s, addr, page->objects) {
+ if (test_bit(slab_index(p, s, addr), map))
+ if (!check_object(s, page, p, SLUB_RED_INACTIVE))
+ return 0;
}
for_each_object(p, s, addr, page->objects)
@@ -3821,8 +3787,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
void *p;
bitmap_zero(map, page->objects);
- for_each_free_object(p, s, page->freelist)
- set_bit(slab_index(p, s, addr), map);
+ get_map(s, page, map);
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
diff --git a/mm/swap.c b/mm/swap.c
index a448db377cb0..5602f1a1b1e7 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -396,6 +396,9 @@ static void lru_deactivate_fn(struct page *page, void *arg)
if (!PageLRU(page))
return;
+ if (PageUnevictable(page))
+ return;
+
/* Some processes are using the page */
if (page_mapped(page))
return;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c7f5a6d4b75b..c9177202c8ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,8 @@
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/oom.h>
+#include <linux/prefetch.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -936,7 +938,7 @@ keep_lumpy:
* back off and wait for congestion to clear because further reclaim
* will encounter the same problem
*/
- if (nr_dirty == nr_congested && nr_dirty != 0)
+ if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
zone_set_flag(zone, ZONE_CONGESTED);
free_page_list(&free_pages);
@@ -1988,17 +1990,12 @@ static bool zone_reclaimable(struct zone *zone)
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
-/*
- * As hibernation is going on, kswapd is freezed so that it can't mark
- * the zone into all_unreclaimable. It can't handle OOM during hibernation.
- * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
- */
+/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2006,13 +2003,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone_reclaimable(zone)) {
- all_unreclaimable = false;
- break;
- }
+ if (!zone->all_unreclaimable)
+ return false;
}
- return all_unreclaimable;
+ return true;
}
/*
@@ -2108,6 +2103,14 @@ out:
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
+ /*
+ * As hibernation is going on, kswapd is freezed so that it can't mark
+ * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
+ * check.
+ */
+ if (oom_killer_disabled)
+ return 0;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 772b39b87d95..897ea9e88238 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -321,9 +321,12 @@ static inline void mod_state(struct zone *zone,
/*
* The fetching of the stat_threshold is racy. We may apply
* a counter threshold to the wrong the cpu if we get
- * rescheduled while executing here. However, the following
- * will apply the threshold again and therefore bring the
- * counter under the threshold.
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyways
+ * for all cpus in a zone.
*/
t = this_cpu_read(pcp->stat_threshold);
@@ -945,7 +948,16 @@ static const char * const vmstat_text[] = {
"unevictable_pgs_cleared",
"unevictable_pgs_stranded",
"unevictable_pgs_mlockfreed",
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "thp_fault_alloc",
+ "thp_fault_fallback",
+ "thp_collapse_alloc",
+ "thp_collapse_alloc_failed",
+ "thp_split",
#endif
+
+#endif /* CONFIG_VM_EVENTS_COUNTERS */
};
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,