aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/vm/soft-dirty.txt7
-rw-r--r--fs/exec.c2
-rw-r--r--fs/proc/task_mmu.c46
-rw-r--r--include/linux/mm.h6
-rw-r--r--mm/mmap.c12
5 files changed, 61 insertions, 12 deletions
diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt
index 9a12a5956bc0..55684d11a1e8 100644
--- a/Documentation/vm/soft-dirty.txt
+++ b/Documentation/vm/soft-dirty.txt
@@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all
the kernel does is finds this fact out and puts both writable and soft-dirty
bits on the PTE.
+ While in most cases tracking memory changes by #PF-s is more than enough
+there is still a scenario when we can lose soft dirty bits -- a task
+unmaps a previously mapped memory region and then maps a new one at exactly
+the same place. When unmap is called, the kernel internally clears PTE values
+including soft dirty bits. To notify user space application about such
+memory region renewal the kernel always marks new memory regions (and
+expanded regions) as soft dirty.
This feature is actively used by the checkpoint-restore project. You
can find more details about it on http://criu.org
diff --git a/fs/exec.c b/fs/exec.c
index fd774c7cb483..2d1e52a58fe9 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -266,7 +266,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
vma->vm_end = STACK_TOP_MAX;
vma->vm_start = vma->vm_end - PAGE_SIZE;
- vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
+ vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
INIT_LIST_HEAD(&vma->anon_vma_chain);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 107d026f5d6e..09228639b83d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
ptent = pte_file_clear_soft_dirty(ptent);
}
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+
set_pte_at(vma->vm_mm, addr, pte, ptent);
#endif
}
@@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
} else {
- *pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ flags2 |= __PM_SOFT_DIRTY;
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
return;
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if (pte_soft_dirty(pte))
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
flags2 |= __PM_SOFT_DIRTY;
*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
@@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p
*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
}
#else
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
@@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
int pmd_flags2;
- pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
+ if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
+ pmd_flags2 = __PM_SOFT_DIRTY;
+ else
+ pmd_flags2 = 0;
+
for (; addr != end; addr += PAGE_SIZE) {
unsigned long offset;
@@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (pmd_trans_unstable(pmd))
return 0;
for (; addr != end; addr += PAGE_SIZE) {
+ int flags2;
/* check to see if we've left 'vma' behind
* and need a new, higher one */
if (vma && (addr >= vma->vm_end)) {
vma = find_vma(walk->mm, addr);
- pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+ flags2 = __PM_SOFT_DIRTY;
+ else
+ flags2 = 0;
+ pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
}
/* check that 'vma' actually covers this address,
@@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
#ifdef CONFIG_HUGETLB_PAGE
static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
- pte_t pte, int offset)
+ pte_t pte, int offset, int flags2)
{
if (pte_present(pte))
- *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
- | PM_STATUS2(pm->v2, 0) | PM_PRESENT);
+ *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
+ PM_STATUS2(pm->v2, flags2) |
+ PM_PRESENT);
else
- *pme = make_pme(PM_NOT_PRESENT(pm->v2));
+ *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
+ PM_STATUS2(pm->v2, flags2));
}
/* This function walks within one hugetlb entry in the single call */
@@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
+ struct vm_area_struct *vma;
int err = 0;
+ int flags2;
pagemap_entry_t pme;
+ vma = find_vma(walk->mm, addr);
+ WARN_ON_ONCE(!vma);
+
+ if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+ flags2 = __PM_SOFT_DIRTY;
+ else
+ flags2 = 0;
+
for (; addr != end; addr += PAGE_SIZE) {
int offset = (addr & ~hmask) >> PAGE_SHIFT;
- huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
+ huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
err = add_to_pagemap(addr, &pme, pm);
if (err)
return err;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index d2d59b4149d0..dce24569f8fc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp);
#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
+#ifdef CONFIG_MEM_SOFT_DIRTY
+# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */
+#else
+# define VM_SOFTDIRTY 0
+#endif
+
#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */
diff --git a/mm/mmap.c b/mm/mmap.c
index 13926a5a6901..51958d192a48 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1609,6 +1609,15 @@ out:
if (file)
uprobe_mmap(vma);
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
return addr;
unmap_and_free_vma:
@@ -2652,6 +2661,7 @@ out:
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
+ vma->vm_flags |= VM_SOFTDIRTY;
return addr;
}
@@ -2916,7 +2926,7 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+ vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
vma->vm_ops = &special_mapping_vmops;