From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Sat, 29 Oct 2005 18:16:40 -0700
Subject: [PATCH] mm: split page table lock

Christoph Lameter demonstrated very poor scalability on the SGI 512-way,
with a many-threaded application which concurrently initializes different
parts of a large anonymous area.

This patch corrects that, by using a separate spinlock per page table page,
to guard the page table entries in that page, instead of using the mm's
single page_table_lock.  (But even then, page_table_lock is still used to
guard page table allocation, and anon_vma allocation.)

In this implementation, the spinlock is tucked inside the struct page of
the page table page: with a BUILD_BUG_ON in case it overflows - which it
would in the case of 32-bit PA-RISC with spinlock debugging enabled.

Splitting the lock is not quite for free: another cacheline access.
Ideally, I suppose we would use split ptlock only for multi-threaded
processes on multi-cpu machines; but deciding that dynamically would have
its own costs.  So for now enable it by config, at some number of cpus -
since the Kconfig language doesn't support inequalities, let preprocessor
compare that with NR_CPUS.  But I don't think it's worth being
user-configurable: for good testing of both split and unsplit configs,
split now at 4 cpus, and perhaps change that to 8 later.

There is a benefit even for singly threaded processes: kswapd can be
attacking one part of the mm while another part is busy faulting.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 8461e2dd91d7..e9ef599498b5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
 	struct page *page = pmd_page(*pmd);
 	pmd_clear(pmd);
+	pte_lock_deinit(page);
 	pte_free_tlb(tlb, page);
 	dec_page_state(nr_page_table_pages);
 	tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	pte_lock_init(new);
 	spin_lock(&mm->page_table_lock);
-	if (pmd_present(*pmd))		/* Another has populated it */
+	if (pmd_present(*pmd)) {	/* Another has populated it */
+		pte_lock_deinit(new);
 		pte_free(new);
-	else {
+	} else {
 		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ again:
 	if (!dst_pte)
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
-	src_ptl = &src_mm->page_table_lock;
+	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock(src_ptl);
 
 	do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
  * (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page and do_no_page can safely check later on).
  */
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 				pte_t *page_table, pte_t orig_pte)
 {
 	int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spin_lock(&mm->page_table_lock);
+		spinlock_t *ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
 		same = pte_same(*page_table, orig_pte);
-		spin_unlock(&mm->page_table_lock);
+		spin_unlock(ptl);
 	}
 #endif
 	pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
-	if (!pte_unmap_same(mm, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page_cache_get(page);
 		entry = mk_pte(page, vma->vm_page_prot);
 
-		ptl = &mm->page_table_lock;
+		ptl = pte_lockptr(mm, pmd);
 		spin_lock(ptl);
 		if (!pte_none(*page_table))
 			goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgoff_t pgoff;
 	int err;
 
-	if (!pte_unmap_same(mm, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return VM_FAULT_MINOR;
 
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, write_access, entry);
 	}
 
-	ptl = &mm->page_table_lock;
+	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
--
cgit v1.2.3-59-g8ed1b
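
The pte_lock_init(), pte_lock_deinit() and pte_lockptr() helpers called
above are defined outside mm/memory.c, so they do not appear in this view
of the commit (which is limited to mm/memory.c).  A minimal sketch of the
mechanism the commit message describes - one spinlock per page table page
when built for enough cpus, falling back to the mm-wide page_table_lock
otherwise.  The names CONFIG_SPLIT_PTLOCK_CPUS and page->ptl, and the no-op
deinit, are illustrative assumptions here, not the patch's exact
definitions:

#include <linux/spinlock.h>

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
/*
 * Enough cpus for splitting to pay off: one spinlock per page table
 * page, tucked inside its struct page.  (The real patch also adds a
 * BUILD_BUG_ON in case the lock overflows the space available in
 * struct page, as it would on 32-bit PA-RISC with spinlock debugging
 * enabled.)
 */
#define pte_lock_init(page)	spin_lock_init(&(page)->ptl)
/* deinit is a hook for implementations that overlay the lock on
 * another struct page field; a no-op in this sketch. */
#define pte_lock_deinit(page)	do {} while (0)
#define pte_lockptr(mm, pmd)	(&pmd_page(*(pmd))->ptl)
#else
/*
 * Too few cpus to be worth the extra cacheline access: every
 * pte_lockptr() resolves to the mm-wide page_table_lock, so the
 * converted callers behave exactly as before.
 */
#define pte_lock_init(page)	do {} while (0)
#define pte_lock_deinit(page)	do {} while (0)
#define pte_lockptr(mm, pmd)	(&(mm)->page_table_lock)
#endif

Either way, callers simply lock whatever pte_lockptr() hands back, which
is why the conversions in the diff above never need to know which config
is in effect.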