From 1de14c3c5cbc9bb17e9dcc648cda51c0c85d54b9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 12 Apr 2013 16:23:54 -0700 Subject: x86-32: Fix possible incomplete TLB invalidate with PAE pagetables This patch attempts to fix: https://bugzilla.kernel.org/show_bug.cgi?id=56461 The symptom is a crash and messages like this: chrome: Corrupted page table at address 34a03000 *pdpt = 0000000000000000 *pde = 0000000000000000 Bad pagetable: 000f [#1] PREEMPT SMP Ingo guesses this got introduced by commit 611ae8e3f520 ("x86/tlb: enable tlb flush range support for x86") since that code started to free unused pagetables. On x86-32 PAE kernels, that new code has the potential to free an entire PMD page and will clear one of the four page-directory-pointer-table (aka pgd_t entries). The hardware aggressively "caches" these top-level entries and invlpg does not actually affect the CPU's copy. If we clear one we *HAVE* to do a full TLB flush, otherwise we might continue using a freed pmd page. (note, we do this properly on the population side in pud_populate()). This patch tracks whenever we clear one of these entries in the 'struct mmu_gather', and ensures that we follow up with a full tlb flush. BTW, I disassembled and checked that: if (tlb->fullmm == 0) and if (!tlb->fullmm && !tlb->need_flush_all) generate essentially the same code, so there should be zero impact there to the !PAE case. Signed-off-by: Dave Hansen Cc: Peter Anvin Cc: Ingo Molnar Cc: Artem S Tashkinov Signed-off-by: Linus Torvalds --- mm/memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 494526ae024a..13cbc420fead 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -216,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->mm = mm; tlb->fullmm = fullmm; + tlb->need_flush_all = 0; tlb->start = -1UL; tlb->end = 0; tlb->need_flush = 0; -- cgit v1.2.3-59-g8ed1b From b4cbb197c7e7a68dbad0d491242e3ca67420c13e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 16 Apr 2013 13:45:37 -0700 Subject: vm: add vm_iomap_memory() helper function Various drivers end up replicating the code to mmap() their memory buffers into user space, and our core memory remapping function may be very flexible but it is unnecessarily complicated for the common cases to use. Our internal VM uses pfn's ("page frame numbers") which simplifies things for the VM, and allows us to pass physical addresses around in a denser and more efficient format than passing a "phys_addr_t" around, and having to shift it up and down by the page size. But it just means that drivers end up doing that shifting instead at the interface level. It also means that drivers end up mucking around with internal VM things like the vma details (vm_pgoff, vm_start/end) way more than they really need to. So this just exports a function to map a certain physical memory range into user space (using a phys_addr_t based interface that is much more natural for a driver) and hides all the complexity from the driver. Some drivers will still end up tweaking the vm_page_prot details for things like prefetching or cacheability etc, but that's actually relevant to the driver, rather than caring about what the page offset of the mapping is into the particular IO memory region. Acked-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) (limited to 'mm/memory.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index e19ff30ad0a2..e2091b88d24c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1611,6 +1611,8 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); + struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags, diff --git a/mm/memory.c b/mm/memory.c index 13cbc420fead..ba94dec5b259 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2393,6 +2393,53 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +/** + * vm_iomap_memory - remap memory to userspace + * @vma: user vma to map to + * @start: start of area + * @len: size of area + * + * This is a simplified io_remap_pfn_range() for common driver use. The + * driver just needs to give us the physical memory range to be mapped, + * we'll figure out the rest from the vma information. + * + * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get + * whatever write-combining details or similar. + */ +int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) +{ + unsigned long vm_len, pfn, pages; + + /* Check that the physical memory area passed in looks valid */ + if (start + len < start) + return -EINVAL; + /* + * You *really* shouldn't map things that aren't page-aligned, + * but we've historically allowed it because IO memory might + * just have smaller alignment. + */ + len += start & ~PAGE_MASK; + pfn = start >> PAGE_SHIFT; + pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; + if (pfn + pages < pfn) + return -EINVAL; + + /* We start the mapping 'vm_pgoff' pages into the area */ + if (vma->vm_pgoff > pages) + return -EINVAL; + pfn += vma->vm_pgoff; + pages -= vma->vm_pgoff; + + /* Can we fit all of the mapping? */ + vm_len = vma->vm_end - vma->vm_start; + if (vm_len >> PAGE_SHIFT > pages) + return -EINVAL; + + /* Ok, let it rip */ + return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_iomap_memory); + static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data) -- cgit v1.2.3-59-g8ed1b From 071361d3473ebb8142907470ff12d59c59f6be72 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Dec 2012 10:19:12 -0800 Subject: mm: Convert print_symbol to %pSR Use the new vsprintf extension to avoid any possible message interleaving. Signed-off-by: Joe Perches Acked-by: Christoph Lameter Signed-off-by: Jiri Kosina --- mm/memory.c | 8 ++++---- mm/slab.c | 8 +++----- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/memory.c b/mm/memory.c index 494526ae024a..a64c2114616c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -714,11 +714,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ if (vma->vm_ops) - print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", - (unsigned long)vma->vm_ops->fault); + printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", + vma->vm_ops->fault); if (vma->vm_file && vma->vm_file->f_op) - print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", - (unsigned long)vma->vm_file->f_op->mmap); + printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", + vma->vm_file->f_op->mmap); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } diff --git a/mm/slab.c b/mm/slab.c index 856e4a192d25..96079244c860 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2040,11 +2040,9 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) } if (cachep->flags & SLAB_STORE_USER) { - printk(KERN_ERR "Last user: [<%p>]", - *dbg_userword(cachep, objp)); - print_symbol("(%s)", - (unsigned long)*dbg_userword(cachep, objp)); - printk("\n"); + printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + *dbg_userword(cachep, objp), + *dbg_userword(cachep, objp)); } realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; -- cgit v1.2.3-59-g8ed1b From 52f37629fd3c7b24e1e6c125e665454cd7ac1acb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 29 Apr 2013 15:08:15 -0700 Subject: THP: fix comment about memory barrier Currently the memory barrier in __do_huge_pmd_anonymous_page doesn't work. Because lru_cache_add_lru uses pagevec so it could miss spinlock easily so above rule was broken so user might see inconsistent data. I was not first person who pointed out the problem. Mel and Peter pointed out a few months ago and Peter pointed out further that even spin_lock/unlock can't make sure of it: http://marc.info/?t=134333512700004 In particular: *A = a; LOCK UNLOCK *B = b; may occur as: LOCK, STORE *B, STORE *A, UNLOCK At last, Hugh pointed out that even we don't need memory barrier in there because __SetPageUpdate already have done it from Nick's commit 0ed361dec369 ("mm: fix PageUptodate data race") explicitly. So this patch fixes comment on THP and adds same comment for do_anonymous_page, too because everybody except Hugh was missing that. It means we need a comment about that. Signed-off-by: Minchan Kim Acked-by: Andrea Arcangeli Acked-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Hugh Dickins Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 +++++------ mm/memory.c | 5 +++++ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'mm/memory.c') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e2f7f5aaaafb..45eaae030628 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -713,6 +713,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return VM_FAULT_OOM; clear_huge_page(page, haddr, HPAGE_PMD_NR); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * clear_huge_page writes become visible before the set_pmd_at() + * write. + */ __SetPageUptodate(page); spin_lock(&mm->page_table_lock); @@ -724,12 +729,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, } else { pmd_t entry; entry = mk_huge_pmd(page, vma); - /* - * The spinlocking to take the lru_lock inside - * page_add_new_anon_rmap() acts as a full memory - * barrier to be sure clear_huge_page writes become - * visible after the set_pmd_at() write. - */ page_add_new_anon_rmap(page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); pgtable_trans_huge_deposit(mm, pgtable); diff --git a/mm/memory.c b/mm/memory.c index ba94dec5b259..f7a1fba85d14 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3244,6 +3244,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceeding stores to the page contents become visible before + * the set_pte_at() write. + */ __SetPageUptodate(page); if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) -- cgit v1.2.3-59-g8ed1b