aboutsummaryrefslogtreecommitdiffstats
path: root/mm/memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory.c')
-rw-r--r--mm/memory.c125
1 files changed, 100 insertions, 25 deletions
diff --git a/mm/memory.c b/mm/memory.c
index dc7f3543b1fd..228efaca75d3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -71,6 +71,8 @@
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
#include <trace/events/kmem.h>
@@ -437,7 +439,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
* of a chain of data-dependent loads, meaning most CPUs (alpha
* being the notable exception) will already guarantee loads are
* seen in-order. See the alpha page table accessors for the
- * smp_read_barrier_depends() barriers in page table walking code.
+ * smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
@@ -1098,7 +1100,7 @@ again:
}
entry = pte_to_swp_entry(ptent);
- if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ if (is_device_private_entry(entry)) {
struct page *page = device_private_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
@@ -1498,7 +1500,7 @@ out:
}
#ifdef pte_index
-static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
+static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
@@ -1506,8 +1508,9 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
if (!page_count(page))
return -EINVAL;
err = validate_page_before_insert(page);
- return err ? err : insert_page_into_pte_locked(
- mm, pte_offset_map(pmd, addr), addr, page, prot);
+ if (err)
+ return err;
+ return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -1517,7 +1520,8 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num, pgprot_t prot)
{
pmd_t *pmd = NULL;
- spinlock_t *pte_lock = NULL;
+ pte_t *start_pte, *pte;
+ spinlock_t *pte_lock;
struct mm_struct *const mm = vma->vm_mm;
unsigned long curr_page_idx = 0;
unsigned long remaining_pages_total = *num;
@@ -1536,18 +1540,17 @@ more:
ret = -ENOMEM;
if (pte_alloc(mm, pmd))
goto out;
- pte_lock = pte_lockptr(mm, pmd);
while (pages_to_write_in_pmd) {
int pte_idx = 0;
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
- spin_lock(pte_lock);
- for (; pte_idx < batch_size; ++pte_idx) {
- int err = insert_page_in_batch_locked(mm, pmd,
+ start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+ for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
+ int err = insert_page_in_batch_locked(mm, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
- spin_unlock(pte_lock);
+ pte_unmap_unlock(start_pte, pte_lock);
ret = err;
remaining_pages_total -= pte_idx;
goto out;
@@ -1555,7 +1558,7 @@ more:
addr += PAGE_SIZE;
++curr_page_idx;
}
- spin_unlock(pte_lock);
+ pte_unmap_unlock(start_pte, pte_lock);
pages_to_write_in_pmd -= batch_size;
remaining_pages_total -= batch_size;
}
@@ -1600,7 +1603,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
unsigned long idx = 0, pgcount = *num;
- int err;
+ int err = -EINVAL;
for (; idx < pgcount; ++idx) {
err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
@@ -1799,7 +1802,7 @@ out_unlock:
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vmf_insert_pfn(), except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
@@ -1935,7 +1938,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vmf_insert_mixed(), except that it allows drivers to
+ * This is exactly like vmf_insert_mixed(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* Typically this function should be used by drivers to set caching- and
@@ -2081,7 +2084,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
- * @addr: target user address to start at
+ * @addr: target page aligned user address to start at
* @pfn: page frame number of kernel physical memory address
* @size: size of mapping area
* @prot: page protection flags for this mapping
@@ -2100,6 +2103,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long remap_pfn = pfn;
int err;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
+ return -EINVAL;
+
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
@@ -2204,7 +2210,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
int err = 0;
- spinlock_t *uninitialized_var(ptl);
+ spinlock_t *ptl;
if (create) {
pte = (mm == &init_mm) ?
@@ -2711,7 +2717,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -3094,6 +3100,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
int locked;
int exclusive = 0;
vm_fault_t ret = 0;
+ void *shadow = NULL;
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
@@ -3140,8 +3147,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
err = mem_cgroup_charge(page, vma->vm_mm,
GFP_KERNEL);
ClearPageSwapCache(page);
- if (err)
+ if (err) {
+ ret = VM_FAULT_OOM;
goto out_page;
+ }
+
+ shadow = get_shadow_from_swap_cache(entry);
+ if (shadow)
+ workingset_refault(page, shadow);
lru_cache_add(page);
swap_readpage(page, true);
@@ -3252,10 +3265,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
- activate_page(page);
}
swap_free(entry);
@@ -3400,7 +3412,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3658,7 +3670,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -4346,6 +4358,67 @@ retry_pud:
return handle_pte_fault(&vmf);
}
+/**
+ * mm_account_fault - Do page fault accountings
+ *
+ * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
+ * of perf event counters, but we'll still do the per-task accounting to
+ * the task who triggered this page fault.
+ * @address: the faulted address.
+ * @flags: the fault flags.
+ * @ret: the fault retcode.
+ *
+ * This will take care of most of the page fault accountings. Meanwhile, it
+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
+ * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * still be in per-arch page fault handlers at the entry of page fault.
+ */
+static inline void mm_account_fault(struct pt_regs *regs,
+ unsigned long address, unsigned int flags,
+ vm_fault_t ret)
+{
+ bool major;
+
+ /*
+ * We don't do accounting for some specific faults:
+ *
+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
+ * includes arch_vma_access_permitted() failing before reaching here.
+ * So this is not a "this many hardware page faults" counter. We
+ * should use the hw profiling for that.
+ *
+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
+ * once they're completed.
+ */
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+ return;
+
+ /*
+ * We define the fault as a major fault when the final successful fault
+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
+ * handle it immediately previously).
+ */
+ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
+
+ if (major)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+
+ /*
+ * If the fault is done for GUP, regs will be NULL. We only do the
+ * accounting for the per thread fault counters who triggered the
+ * fault, and we skip the perf event updates.
+ */
+ if (!regs)
+ return;
+
+ if (major)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ else
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -4353,7 +4426,7 @@ retry_pud:
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+ unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
@@ -4394,6 +4467,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
+ mm_account_fault(regs, address, flags, ret);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -4667,7 +4742,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
- ret = get_user_pages_remote(tsk, mm, addr, 1,
+ ret = get_user_pages_remote(mm, addr, 1,
gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT