From db114b83ab6064d9b1d6ec5650e096c89bd95e25 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Dec 2009 17:59:25 -0800 Subject: ksm: hold anon_vma in rmap_item For full functionality, page_referenced_one() and try_to_unmap_one() need to know the vma: to pass vma down to arch-dependent flushes, or to observe VM_LOCKED or VM_EXEC. But KSM keeps no record of vma: nor can it, since vmas get split and merged without its knowledge. Instead, note page's anon_vma in its rmap_item when adding to stable tree: all the vmas which might map that page are listed by its anon_vma. page_referenced_ksm() and try_to_unmap_ksm() then traverse the anon_vma, first to find the probable vma, that which matches rmap_item's mm; but if that is not enough to locate all instances, traverse again to try the others. This catches those occasions when fork has duplicated a pte of a ksm page, but ksmd has not yet come around to assign it an rmap_item. But each rmap_item in the stable tree which refers to an anon_vma needs to take a reference to it. Andrea's anon_vma design cleverly avoided a reference count (an anon_vma was free when its list of vmas was empty), but KSM now needs to add that. Is a 32-bit count sufficient? I believe so - the anon_vma is only free when both count is 0 and list is empty. Signed-off-by: Hugh Dickins Cc: Izik Eidus Cc: Andrea Arcangeli Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 157 +++++++++++++++++++++++++++++++++++++------------------------- mm/rmap.c | 5 +- 2 files changed, 98 insertions(+), 64 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 2f58ceebfe8f..f7d121c42d01 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -121,7 +121,7 @@ struct stable_node { /** * struct rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list - * @filler: unused space we're making available in this patch + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @mm: the memory structure this rmap_item is pointing into * @address: the virtual address this rmap_item tracks (+ flags in low bits) * @oldchecksum: previous checksum of the page at that virtual address @@ -131,7 +131,7 @@ struct stable_node { */ struct rmap_item { struct rmap_item *rmap_list; - unsigned long filler; + struct anon_vma *anon_vma; /* when stable */ struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ @@ -196,13 +196,6 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); -/* - * Temporary hack for page_referenced_ksm() and try_to_unmap_ksm(), - * later we rework things a little to get the right vma to them. - */ -static DEFINE_SPINLOCK(ksm_fallback_vma_lock); -static struct vm_area_struct ksm_fallback_vma; - #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) @@ -323,6 +316,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) return rmap_item->address & STABLE_FLAG; } +static void hold_anon_vma(struct rmap_item *rmap_item, + struct anon_vma *anon_vma) +{ + rmap_item->anon_vma = anon_vma; + atomic_inc(&anon_vma->ksm_refcount); +} + +static void drop_anon_vma(struct rmap_item *rmap_item) +{ + struct anon_vma *anon_vma = rmap_item->anon_vma; + + if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } +} + /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -472,6 +484,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_shared--; } + drop_anon_vma(rmap_item); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -752,6 +765,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, pte_t orig_pte = __pte(0); int err = -EFAULT; + if (page == kpage) /* ksm page forked */ + return 0; + if (!(vma->vm_flags & VM_MERGEABLE)) goto out; if (!PageAnon(page)) @@ -805,9 +821,6 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, struct vm_area_struct *vma; int err = -EFAULT; - if (page == kpage) /* ksm page forked */ - return 0; - down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) goto out; @@ -816,6 +829,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, goto out; err = try_to_merge_one_page(vma, page, kpage); + if (err) + goto out; + + /* Must get reference to anon_vma while still holding mmap_sem */ + hold_anon_vma(rmap_item, vma->anon_vma); out: up_read(&mm->mmap_sem); return err; @@ -869,6 +887,11 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, lru_cache_add_lru(kpage, LRU_ACTIVE_ANON); err = try_to_merge_one_page(vma, page, kpage); + if (err) + goto up; + + /* Must get reference to anon_vma while still holding mmap_sem */ + hold_anon_vma(rmap_item, vma->anon_vma); up: up_read(&mm->mmap_sem); @@ -879,8 +902,10 @@ up: * If that fails, we have a ksm page with only one pte * pointing to it: so break it. */ - if (err) + if (err) { + drop_anon_vma(rmap_item); break_cow(rmap_item); + } } if (err) { put_page(kpage); @@ -1155,7 +1180,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * in which case we need to break_cow on both. */ if (!stable_node) { + drop_anon_vma(tree_rmap_item); break_cow(tree_rmap_item); + drop_anon_vma(rmap_item); break_cow(rmap_item); } } @@ -1490,7 +1517,7 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, struct hlist_node *hlist; unsigned int mapcount = page_mapcount(page); int referenced = 0; - struct vm_area_struct *vma; + int search_new_forks = 0; VM_BUG_ON(!PageKsm(page)); VM_BUG_ON(!PageLocked(page)); @@ -1498,36 +1525,40 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, stable_node = page_stable_node(page); if (!stable_node) return 0; - - /* - * Temporary hack: really we need anon_vma in rmap_item, to - * provide the correct vma, and to find recently forked instances. - * Use zalloc to avoid weirdness if any other fields are involved. - */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC); - if (!vma) { - spin_lock(&ksm_fallback_vma_lock); - vma = &ksm_fallback_vma; - } - +again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { - if (memcg && !mm_match_cgroup(rmap_item->mm, memcg)) - continue; + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct vm_area_struct *vma; + + spin_lock(&anon_vma->lock); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; - vma->vm_mm = rmap_item->mm; - vma->vm_start = rmap_item->address; - vma->vm_end = vma->vm_start + PAGE_SIZE; + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) + continue; - referenced += page_referenced_one(page, vma, + referenced += page_referenced_one(page, vma, rmap_item->address, &mapcount, vm_flags); + if (!search_new_forks || !mapcount) + break; + } + spin_unlock(&anon_vma->lock); if (!mapcount) goto out; } + if (!search_new_forks++) + goto again; out: - if (vma == &ksm_fallback_vma) - spin_unlock(&ksm_fallback_vma_lock); - else - kmem_cache_free(vm_area_cachep, vma); return referenced; } @@ -1537,7 +1568,7 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) struct hlist_node *hlist; struct rmap_item *rmap_item; int ret = SWAP_AGAIN; - struct vm_area_struct *vma; + int search_new_forks = 0; VM_BUG_ON(!PageKsm(page)); VM_BUG_ON(!PageLocked(page)); @@ -1545,35 +1576,37 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) stable_node = page_stable_node(page); if (!stable_node) return SWAP_FAIL; - - /* - * Temporary hack: really we need anon_vma in rmap_item, to - * provide the correct vma, and to find recently forked instances. - * Use zalloc to avoid weirdness if any other fields are involved. - */ - if (TTU_ACTION(flags) != TTU_UNMAP) - return SWAP_FAIL; - - vma = kmem_cache_zalloc(vm_area_cachep, GFP_ATOMIC); - if (!vma) { - spin_lock(&ksm_fallback_vma_lock); - vma = &ksm_fallback_vma; - } - +again: hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { - vma->vm_mm = rmap_item->mm; - vma->vm_start = rmap_item->address; - vma->vm_end = vma->vm_start + PAGE_SIZE; + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct vm_area_struct *vma; - ret = try_to_unmap_one(page, vma, rmap_item->address, flags); - if (ret != SWAP_AGAIN || !page_mapped(page)) - goto out; + spin_lock(&anon_vma->lock); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = try_to_unmap_one(page, vma, + rmap_item->address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) { + spin_unlock(&anon_vma->lock); + goto out; + } + } + spin_unlock(&anon_vma->lock); } + if (!search_new_forks++) + goto again; out: - if (vma == &ksm_fallback_vma) - spin_unlock(&ksm_fallback_vma_lock); - else - kmem_cache_free(vm_area_cachep, vma); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 869aaa3206a2..ebdf582ef185 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -68,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void) return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); } -static inline void anon_vma_free(struct anon_vma *anon_vma) +void anon_vma_free(struct anon_vma *anon_vma) { kmem_cache_free(anon_vma_cachep, anon_vma); } @@ -172,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) list_del(&vma->anon_vma_node); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head); + empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); spin_unlock(&anon_vma->lock); if (empty) @@ -184,6 +184,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); + ksm_refcount_init(anon_vma); INIT_LIST_HEAD(&anon_vma->head); } -- cgit v1.2.3-59-g8ed1b