From 7b2a2d4a18fffac3c4872021529b0657896db788 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 19 Oct 2012 14:07:31 +0100 Subject: mm: migrate: Add a tracepoint for migrate_pages The pgmigrate_success and pgmigrate_fail vmstat counters tell the user about migration activity but not the type or the reason. This patch adds a tracepoint to identify the type of page migration and why the page is being migrated. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/migrate.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include/linux/migrate.h') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ce7e6671968b..9d1c159e2427 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,6 +7,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); +enum migrate_reason { + MR_COMPACTION, + MR_MEMORY_FAILURE, + MR_MEMORY_HOTPLUG, + MR_SYSCALL, /* also applies to cpusets */ + MR_MEMPOLICY_MBIND, + MR_CMA +}; + #ifdef CONFIG_MIGRATION extern void putback_lru_pages(struct list_head *l); @@ -14,7 +23,7 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode); + enum migrate_mode mode, int reason); extern int migrate_huge_page(struct page *, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); @@ -35,7 +44,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode) { return -ENOSYS; } + enum migrate_mode mode, int reason) { return -ENOSYS; } static inline int migrate_huge_page(struct page *page, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } -- cgit v1.3-8-gc7d7 From 7039e1dbec6eeaa8ecab43a82d6589eeced995c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:34 +0200 Subject: mm: migrate: Introduce migrate_misplaced_page() Note: This was originally based on Peter's patch "mm/migrate: Introduce migrate_misplaced_page()" but borrows extremely heavily from Andrea's "autonuma: memory follows CPU algorithm and task/mm_autonuma stats collection". The end result is barely recognisable so signed-offs had to be dropped. If the original authors are ok with it, I'll re-add the signed-off-bys. Add migrate_misplaced_page() which deals with migrating pages from faults.
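[Editor's note: for orientation, a caller in the NUMA hinting fault path is expected to use this roughly as follows. This is a simplified sketch, not a hunk from the series; the helper name do_numa_page_sketch is hypothetical, while mpol_misplaced() and migrate_misplaced_page() are the real entry points.]

/* Sketch: using migrate_misplaced_page() from a NUMA hinting fault.
 * The caller hands over an elevated page reference;
 * migrate_misplaced_page() drops it on every path, success or failure.
 */
static int do_numa_page_sketch(struct vm_area_struct *vma,
			       unsigned long addr, struct page *page)
{
	int current_nid = page_to_nid(page);
	int target_nid;

	get_page(page);		/* reference handed to the callee */
	target_nid = mpol_misplaced(page, vma, addr);
	if (target_nid == -1) {	/* policy says the page is fine here */
		put_page(page);
		return current_nid;
	}

	/* migrate_misplaced_page() drops the reference on all paths */
	if (migrate_misplaced_page(page, target_nid))
		return target_nid;	/* page now lives on target_nid */
	return current_nid;		/* migration refused or failed */
}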
Based-on-work-by: Lee Schermerhorn Based-on-work-by: Peter Zijlstra Based-on-work-by: Andrea Arcangeli Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- include/linux/migrate.h | 11 +++++ mm/migrate.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 2 deletions(-) (limited to 'include/linux/migrate.h') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9d1c159e2427..f0d0313eea6f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,6 +13,7 @@ enum migrate_reason { MR_MEMORY_HOTPLUG, MR_SYSCALL, /* also applies to cpusets */ MR_MEMPOLICY_MBIND, + MR_NUMA_MISPLACED, MR_CMA }; @@ -73,4 +74,14 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #define fail_migrate_page NULL #endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_NUMA_BALANCING +extern int migrate_misplaced_page(struct page *page, int node); +#else +static inline int migrate_misplaced_page(struct page *page, int node) +{ + return -EAGAIN; /* can't migrate now */ +} +#endif /* CONFIG_NUMA_BALANCING */ + #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate.c b/mm/migrate.c index 27be9c923dc1..d168aec98427 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -282,7 +282,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, struct buffer_head *head, enum migrate_mode mode) { - int expected_count; + int expected_count = 0; void **pslot; if (!mapping) { @@ -1415,4 +1415,108 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, } return err; } -#endif + +#ifdef CONFIG_NUMA_BALANCING +/* + * Returns true if this is a safe migration target node for misplaced NUMA + * pages. Currently it only checks the watermarks, which is crude + */ +static bool migrate_balanced_pgdat(struct pglist_data *pgdat, + int nr_migrate_pages) +{ + int z; + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable) + continue; + + /* Avoid waking kswapd by allocating pages_to_migrate pages. */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone) + + nr_migrate_pages, + 0, 0)) + continue; + return true; + } + return false; +} + +static struct page *alloc_misplaced_dst_page(struct page *page, + unsigned long data, + int **result) +{ + int nid = (int) data; + struct page *newpage; + + newpage = alloc_pages_exact_node(nid, + (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN) & + ~GFP_IOFS, 0); + return newpage; +} + +/* + * Attempt to migrate a misplaced page to the specified destination + * node. Caller is expected to have an elevated reference count on + * the page that will be dropped by this function before returning. + */ +int migrate_misplaced_page(struct page *page, int node) +{ + int isolated = 0; + LIST_HEAD(migratepages); + + /* + * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) { + put_page(page); + goto out; + } + + /* Avoid migrating to a node that is nearly full */ + if (migrate_balanced_pgdat(NODE_DATA(node), 1)) { + int page_lru; + + if (isolate_lru_page(page)) { + put_page(page); + goto out; + } + isolated = 1; + + /* + * Page is isolated which takes a reference count so now the + * callers reference can be safely dropped without the page + * disappearing underneath us during migration + */ + put_page(page); + + page_lru = page_is_file_cache(page); + inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); + list_add(&page->lru, &migratepages); + } + + if (isolated) { + int nr_remaining; + + nr_remaining = migrate_pages(&migratepages, + alloc_misplaced_dst_page, + node, false, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); + if (nr_remaining) { + putback_lru_pages(&migratepages); + isolated = 0; + } + } + BUG_ON(!list_empty(&migratepages)); +out: + return isolated; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#endif /* CONFIG_NUMA */ -- cgit v1.3-8-gc7d7 From e14808b49f55e0e1135da5e4a154a540dd9f3662 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 19 Nov 2012 10:59:15 +0000 Subject: mm: numa: Rate limit setting of pte_numa if node is saturated If there are a large number of NUMA hinting faults and all of them are resulting in migrations it may indicate that memory is just bouncing uselessly around. NUMA balancing cost is likely exceeding any benefit from locality. Rate limit the PTE updates if the node is migration rate-limited. As noted in the comments, this distorts the NUMA faulting statistics. Signed-off-by: Mel Gorman --- include/linux/migrate.h | 6 ++++++ kernel/sched/fair.c | 9 +++++++++ mm/migrate.c | 20 ++++++++++++++++++++ 3 files changed, 35 insertions(+) (limited to 'include/linux/migrate.h') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index f0d0313eea6f..91556889adac 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -77,11 +77,17 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #ifdef CONFIG_NUMA_BALANCING extern int migrate_misplaced_page(struct page *page, int node); +extern int migrate_misplaced_page(struct page *page, int node); +extern bool migrate_ratelimited(int node); #else static inline int migrate_misplaced_page(struct page *page, int node) { return -EAGAIN; /* can't migrate now */ } +static inline bool migrate_ratelimited(int node) +{ + return false; +} #endif /* CONFIG_NUMA_BALANCING */ #endif /* _LINUX_MIGRATE_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7727b0161579..37e895a941ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -861,6 +862,14 @@ void task_numa_work(struct callback_head *work) if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) return; + /* + * Do not set pte_numa if the current running node is rate-limited. 
+ * This loses statistics on the fault but if we are unwilling to + * migrate to this node, it is less likely we can do useful work + */ + if (migrate_ratelimited(numa_node_id())) + return; + start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ diff --git a/mm/migrate.c b/mm/migrate.c index 4b8267f1842f..32a1afca6009 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1464,10 +1464,30 @@ static struct page *alloc_misplaced_dst_page(struct page *page, * page migration rate limiting control. * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs * window of time. Default here says do not migrate more than 1280M per second. + * If a node is rate-limited then PTE NUMA updates are also rate-limited. However + * as it is faults that reset the window, pte updates will happen unconditionally + * if there has not been a fault since @pteupdate_interval_millisecs after the + * throttle window closed. */ static unsigned int migrate_interval_millisecs __read_mostly = 100; +static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); +/* Returns true if NUMA migration is currently rate limited */ +bool migrate_ratelimited(int node) +{ + pg_data_t *pgdat = NODE_DATA(node); + + if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + + msecs_to_jiffies(pteupdate_interval_millisecs))) + return false; + + if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) + return false; + + return true; +} + /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on -- cgit v1.3-8-gc7d7 From b32967ff101a7508f70be8de59b278d4df92fa00 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 19 Nov 2012 12:35:47 +0000 Subject: mm: numa: Add THP migration for the NUMA working set scanning fault case. Note: This is very heavily based on a patch from Peter Zijlstra with fixes from Ingo Molnar, Hugh Dickins and Johannes Weiner. That patch put a lot of migration logic into mm/huge_memory.c where it does not belong. This version instead tries to share some of the migration logic with migrate_misplaced_page. However, it should be noted that now migrate.c is doing more with the pagetable manipulation than is preferred. The end result is barely recognisable so as before, the signed-offs had to be removed but will be re-added if the original authors are ok with it. Add THP migration for the NUMA working set scanning fault case. It uses the page lock to serialize. No migration pte dance is necessary because the pte is already unmapped when we decide to migrate.
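[Editor's note: the page-lock serialisation mentioned above follows this pattern. A simplified sketch of the do_huge_pmd_numa_page() changes in the hunk below; the helper name thp_migrate_trylock is hypothetical and the error paths are trimmed.]

/* Sketch: serialise THP NUMA migration on the page lock. */
static bool thp_migrate_trylock(struct mm_struct *mm, pmd_t *pmdp,
				pmd_t entry, struct page *page)
{
	lock_page(page);	/* only one thread may migrate this THP */

	/*
	 * Re-check the pmd under the page table lock: a parallel fault
	 * or a completed migration may have changed it while we slept
	 * on the page lock.
	 */
	spin_lock(&mm->page_table_lock);
	if (unlikely(!pmd_same(entry, *pmdp))) {
		spin_unlock(&mm->page_table_lock);
		unlock_page(page);
		return false;	/* lost the race; caller backs off */
	}
	spin_unlock(&mm->page_table_lock);
	return true;		/* page locked, pmd still as expected */
}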
[dhillf@gmail.com: Fix memory leak on isolation failure] [dhillf@gmail.com: Fix transfer of last_nid information] Signed-off-by: Mel Gorman --- include/linux/migrate.h | 15 ++++ mm/huge_memory.c | 59 +++++++++---- mm/internal.h | 7 +- mm/memcontrol.c | 7 +- mm/migrate.c | 231 +++++++++++++++++++++++++++++++++++++++--------- 5 files changed, 255 insertions(+), 64 deletions(-) (limited to 'include/linux/migrate.h') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 91556889adac..51eac4bdc606 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -79,6 +79,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, extern int migrate_misplaced_page(struct page *page, int node); extern int migrate_misplaced_page(struct page *page, int node); extern bool migrate_ratelimited(int node); +extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node); + #else static inline int migrate_misplaced_page(struct page *page, int node) { @@ -88,6 +94,15 @@ static inline bool migrate_ratelimited(int node) { return false; } + +static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + return -EAGAIN; +} #endif /* CONFIG_NUMA_BALANCING */ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 199b261a257e..711baf84b153 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -600,7 +600,7 @@ out: } __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pmd = pmd_mkwrite(pmd); @@ -1023,10 +1023,12 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - struct page *page = NULL; + struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int target_nid; int current_nid = -1; + bool migrated; + bool page_locked = false; spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) @@ -1034,42 +1036,61 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(pmd); get_page(page); - spin_unlock(&mm->page_table_lock); current_nid = page_to_nid(page); count_vm_numa_event(NUMA_HINT_FAULTS); if (current_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); target_nid = mpol_misplaced(page, vma, haddr); - if (target_nid == -1) + if (target_nid == -1) { + put_page(page); goto clear_pmdnuma; + } - /* - * Due to lacking code to migrate thp pages, we'll split - * (which preserves the special PROT_NONE) and re-take the - * fault on the normal pages. 
- */ - split_huge_page(page); - put_page(page); - - return 0; + /* Acquire the page lock to serialise THP migrations */ + spin_unlock(&mm->page_table_lock); + lock_page(page); + page_locked = true; -clear_pmdnuma: + /* Confirm the PTE did not change while locked */ spin_lock(&mm->page_table_lock); - if (unlikely(!pmd_same(pmd, *pmdp))) + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); goto out_unlock; + } + spin_unlock(&mm->page_table_lock); + + /* Migrate the THP to the requested node */ + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, + page, target_nid); + if (migrated) + current_nid = target_nid; + else { + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + goto out_unlock; + } + goto clear_pmdnuma; + } + + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); + return 0; +clear_pmdnuma: pmd = pmd_mknonnuma(pmd); set_pmd_at(mm, haddr, pmdp, pmd); VM_BUG_ON(pmd_numa(*pmdp)); update_mmu_cache_pmd(vma, addr, pmdp); + if (page_locked) + unlock_page(page); out_unlock: spin_unlock(&mm->page_table_lock); - if (page) { - put_page(page); - task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false); - } + if (current_nid != -1) + task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); return 0; } diff --git a/mm/internal.h b/mm/internal.h index a4fa284f6bc2..7e60ac826f2b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -212,15 +212,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { unsigned long flags; + int nr_pages = hpage_nr_pages(page); local_irq_save(flags); - __dec_zone_page_state(page, NR_MLOCK); + __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); - __inc_zone_page_state(newpage, NR_MLOCK); + __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); local_irq_restore(flags); } } +extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba000b31..d97af9636ab2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3288,15 +3288,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; struct page_cgroup *pc; enum charge_type ctype; *memcgp = NULL; - VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return; + if (PageTransHuge(page)) + nr_pages <<= compound_order(page); + pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { @@ -3358,7 +3361,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, * charged to the res_counter since we plan on replacing the * old one and only one page is going to be left afterwards.
*/ - __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); + __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); } /* remove redundant charge if migration failed*/ diff --git a/mm/migrate.c b/mm/migrate.c index 2a5ce135eef0..c9400960fd52 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -410,7 +410,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, */ void migrate_page_copy(struct page *newpage, struct page *page) { - if (PageHuge(page)) + if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); else copy_highpage(newpage, page); @@ -1491,25 +1491,10 @@ bool migrate_ratelimited(int node) return true; } -/* - * Attempt to migrate a misplaced page to the specified destination - * node. Caller is expected to have an elevated reference count on - * the page that will be dropped by this function before returning. - */ -int migrate_misplaced_page(struct page *page, int node) +/* Returns true if the node is migrate rate-limited after the update */ +bool numamigrate_update_ratelimit(pg_data_t *pgdat) { - pg_data_t *pgdat = NODE_DATA(node); - int isolated = 0; - LIST_HEAD(migratepages); - - /* - * Don't migrate pages that are mapped in multiple processes. - * TODO: Handle false sharing detection instead of this hammer - */ - if (page_mapcount(page) != 1) { - put_page(page); - goto out; - } + bool rate_limited = false; /* * Rate-limit the amount of data that is being migrated to a node. @@ -1522,13 +1507,18 @@ int migrate_misplaced_page(struct page *page, int node) pgdat->numabalancing_migrate_next_window = jiffies + msecs_to_jiffies(migrate_interval_millisecs); } - if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) { - spin_unlock(&pgdat->numabalancing_migrate_lock); - put_page(page); - goto out; - } - pgdat->numabalancing_migrate_nr_pages++; + if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) + rate_limited = true; + else + pgdat->numabalancing_migrate_nr_pages++; spin_unlock(&pgdat->numabalancing_migrate_lock); + + return rate_limited; +} + +int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +{ + int ret = 0; /* Avoid migrating to a node that is nearly full */ if (migrate_balanced_pgdat(pgdat, 1)) { @@ -1536,13 +1526,18 @@ int migrate_misplaced_page(struct page *page, int node) if (isolate_lru_page(page)) { put_page(page); - goto out; + return 0; } - isolated = 1; + /* Page is isolated */ + ret = 1; page_lru = page_is_file_cache(page); - inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); - list_add(&page->lru, &migratepages); + if (!PageTransHuge(page)) + inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); + else + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + HPAGE_PMD_NR); } /* @@ -1555,23 +1550,177 @@ int migrate_misplaced_page(struct page *page, int node) */ put_page(page); - if (isolated) { - int nr_remaining; - - nr_remaining = migrate_pages(&migratepages, - alloc_misplaced_dst_page, - node, false, MIGRATE_ASYNC, - MR_NUMA_MISPLACED); - if (nr_remaining) { - putback_lru_pages(&migratepages); - isolated = 0; - } else - count_vm_numa_event(NUMA_PAGE_MIGRATE); + return ret; +} + +/* + * Attempt to migrate a misplaced page to the specified destination + * node. Caller is expected to have an elevated reference count on + * the page that will be dropped by this function before returning. 
+ */ +int migrate_misplaced_page(struct page *page, int node) +{ + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + int nr_remaining; + LIST_HEAD(migratepages); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) { + put_page(page); + goto out; } + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat)) { + put_page(page); + goto out; + } + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) + goto out; + + list_add(&page->lru, &migratepages); + nr_remaining = migrate_pages(&migratepages, + alloc_misplaced_dst_page, + node, false, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); + if (nr_remaining) { + putback_lru_pages(&migratepages); + isolated = 0; + } else + count_vm_numa_event(NUMA_PAGE_MIGRATE); BUG_ON(!list_empty(&migratepages)); out: return isolated; } + +int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + unsigned long haddr = address & HPAGE_PMD_MASK; + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); + + /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) + goto out_dropref; + + /* + * Rate-limit the amount of data that is being migrated to a node. + * Optimal placement is no good if the memory bus is saturated and + * all the time is being spent migrating! + */ + if (numamigrate_update_ratelimit(pgdat)) + goto out_dropref; + + new_page = alloc_pages_node(node, + (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + if (!new_page) + goto out_dropref; + page_xchg_last_nid(new_page, page_last_nid(page)); + + isolated = numamigrate_isolate_page(pgdat, page); + if (!isolated) { + put_page(new_page); + goto out_keep_locked; + } + + /* Prepare a page as a migration target */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + + /* anon mapping, we can simply copy page->mapping to the new page: */ + new_page->mapping = page->mapping; + new_page->index = page->index; + migrate_page_copy(new_page, page); + WARN_ON(PageLRU(new_page)); + + /* Recheck the target PMD */ + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, entry))) { + spin_unlock(&mm->page_table_lock); + + /* Reverse changes made by migrate_page_copy() */ + if (TestClearPageActive(new_page)) + SetPageActive(page); + if (TestClearPageUnevictable(new_page)) + SetPageUnevictable(page); + mlock_migrate_page(page, new_page); + + unlock_page(new_page); + put_page(new_page); /* Free it */ + + unlock_page(page); + putback_lru_page(page); + + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + goto out; + } + + /* + * Traditional migration needs to prepare the memcg charge + * transaction early to prevent the old page from being + * uncharged when installing migration entries. Here we can + * save the potential rollback and start the charge transfer + * only when migration is already known to end successfully. 
*/ + mem_cgroup_prepare_migration(page, new_page, &memcg); + + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = pmd_mknonnuma(entry); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + + page_add_new_anon_rmap(new_page, vma, haddr); + + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, address, entry); + page_remove_rmap(page); + /* + * Finish the charge transaction under the page table lock to + * prevent split_huge_page() from dividing up the charge + * before it's fully transferred to the new page. + */ + mem_cgroup_end_migration(memcg, page, new_page, true); + spin_unlock(&mm->page_table_lock); + + unlock_page(new_page); + unlock_page(page); + put_page(page); /* Drop the rmap reference */ + put_page(page); /* Drop the LRU isolation reference */ + + count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); + count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); + +out: + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + -HPAGE_PMD_NR); + return isolated; + +out_dropref: + put_page(page); +out_keep_locked: + return 0; +} #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ -- cgit v1.3-8-gc7d7 From 220018d388b8ab1fca1c5f0c6474bab47ad2c9c0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 5 Dec 2012 09:32:56 +0000 Subject: mm: numa: Add THP migration for the NUMA working set scanning fault case build fix Commit "Add THP migration for the NUMA working set scanning fault case" breaks the build because HPAGE_PMD_SHIFT and HPAGE_PMD_MASK are defined to explode without CONFIG_TRANSPARENT_HUGEPAGE: mm/migrate.c: In function 'migrate_misplaced_transhuge_page_put': mm/migrate.c:1549: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1564: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1566: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1573: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1606: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed mm/migrate.c:1648: error: call to '__build_bug_failed' declared with attribute error: BUILD_BUG failed CONFIG_NUMA_BALANCING allows compilation without enabling transparent hugepages, so define the dummy function for such a configuration and only define migrate_misplaced_transhuge_page_put() when transparent hugepages are enabled.
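[Editor's note: for context, without CONFIG_TRANSPARENT_HUGEPAGE the THP constants are defined in <linux/huge_mm.h> roughly as below, which is why any compiled-in reference to them triggers the BUILD_BUG errors quoted above. Paraphrased from memory of the 3.7-era header, not part of this patch.]

/* Sketch of <linux/huge_mm.h> when CONFIG_TRANSPARENT_HUGEPAGE is not
 * set (paraphrased): merely referencing these macros emits a build
 * error, so migrate.c must not compile the THP migration path in.
 */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })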
Signed-off-by: David Rientjes Signed-off-by: Mel Gorman --- include/linux/migrate.h | 16 +++++++++------- mm/migrate.c | 2 ++ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'include/linux/migrate.h') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 51eac4bdc606..d52afb9a790c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -79,12 +79,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, extern int migrate_misplaced_page(struct page *page, int node); extern int migrate_misplaced_page(struct page *page, int node); extern bool migrate_ratelimited(int node); -extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, - struct vm_area_struct *vma, - pmd_t *pmd, pmd_t entry, - unsigned long address, - struct page *page, int node); - #else static inline int migrate_misplaced_page(struct page *page, int node) { @@ -94,7 +88,15 @@ static inline bool migrate_ratelimited(int node) { return false; } +#endif /* CONFIG_NUMA_BALANCING */ +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node); +#else static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry, @@ -103,6 +105,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, { return -EAGAIN; } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate.c b/mm/migrate.c index c9400960fd52..9341a501d168 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1602,7 +1602,9 @@ int migrate_misplaced_page(struct page *page, int node) out: return isolated; } +#endif /* CONFIG_NUMA_BALANCING */ +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) int migrate_misplaced_transhuge_page(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, pmd_t entry, -- cgit v1.3-8-gc7d7 From 78bd52097d04205a33a8014a1b8ac01cf1ae9d06 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:31 -0800 Subject: mm: adjust address_space_operations.migratepage() return code Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch-set follows the main idea discussed at the 2012 LSFMMS session: "Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/ to introduce the required changes to the virtio_balloon driver, as well as the changes to the core compaction & migration bits, in order to make those subsystems aware of ballooned pages and allow memory balloon pages to become movable within a guest, thus avoiding the aforementioned fragmentation issue. Following are numbers that show the benefit of this patch in allowing compaction to be more effective at memory-ballooned guests.
Results for the STRESS-HIGHALLOC benchmark, from Mel Gorman's mmtests suite, running on a 4GB RAM KVM guest which was ballooning 512MB RAM in 64MB chunks, at every minute (inflating/deflating), while the test was running:

===BEGIN stress-highalloc
STRESS-HIGHALLOC
                          highalloc-3.7   highalloc-3.7
                              rc4-clean       rc4-patch
Pass 1                   55.00 ( 0.00%)  62.00 ( 7.00%)
Pass 2                   54.00 ( 0.00%)  62.00 ( 8.00%)
while Rested             75.00 ( 0.00%)  80.00 ( 5.00%)

MMTests Statistics: duration
                                    3.7             3.7
                              rc4-clean       rc4-patch
User                            1207.59         1207.46
System                          1300.55         1299.61
Elapsed                         2273.72         2157.06

MMTests Statistics: vmstat
                                    3.7             3.7
                              rc4-clean       rc4-patch
Page Ins                        3581516         2374368
Page Outs                      11148692        10410332
Swap Ins                             80              47
Swap Outs                          3641             476
Direct pages scanned              37978           33826
Kswapd pages scanned            1828245         1342869
Kswapd pages reclaimed          1710236         1304099
Direct pages reclaimed            32207           31005
Kswapd efficiency                   93%             97%
Kswapd velocity                 804.077         622.546
Direct efficiency                   84%             91%
Direct velocity                  16.703          15.682
Percentage direct scans              2%              2%
Page writes by reclaim            79252            9704
Page writes file                  75611            9228
Page writes anon                   3641             476
Page reclaim immediate            16764           11014
Page rescued immediate                0               0
Slabs scanned                   2171904         2152448
Direct inode steals                 385            2261
Kswapd inode steals              659137          609670
Kswapd skipped wait                   1              69
THP fault alloc                     546             631
THP collapse alloc                  361             339
THP splits                          259             263
THP fault fallback                   98              50
THP collapse fail                    20              17
Compaction stalls                   747             499
Compaction success                  244             145
Compaction failures                 503             354
Compaction pages moved           370888          474837
Compaction move failure           77378           65259
===END stress-highalloc

This patch: Introduce MIGRATEPAGE_SUCCESS as the default return code for the address_space_operations.migratepage() method and document the expected return code for the same method in failure cases. Signed-off-by: Rafael Aquini Cc: Rusty Russell Cc: "Michael S.
Tsirkin" Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- include/linux/migrate.h | 7 +++++++ mm/migrate.c | 33 +++++++++++++++------------------ 3 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include/linux/migrate.h') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 47e6e2f21e21..4a55f35a6ced 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -582,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, int rc; rc = migrate_huge_page_move_mapping(mapping, newpage, page); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; migrate_page_copy(newpage, page); - return 0; + return MIGRATEPAGE_SUCCESS; } static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ce7e6671968b..a4e886d17f87 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -7,6 +7,13 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); +/* + * Return values from addresss_space_operations.migratepage(): + * - negative errno on page migration failure; + * - zero on page migration success; + */ +#define MIGRATEPAGE_SUCCESS 0 + #ifdef CONFIG_MIGRATION extern void putback_lru_pages(struct list_head *l); diff --git a/mm/migrate.c b/mm/migrate.c index 1dc4598d2513..33f5f82a6006 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -276,7 +276,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, /* Anonymous page without mapping */ if (page_count(page) != 1) return -EAGAIN; - return 0; + return MIGRATEPAGE_SUCCESS; } spin_lock_irq(&mapping->tree_lock); @@ -346,7 +346,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, } spin_unlock_irq(&mapping->tree_lock); - return 0; + return MIGRATEPAGE_SUCCESS; } /* @@ -362,7 +362,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, if (!mapping) { if (page_count(page) != 1) return -EAGAIN; - return 0; + return MIGRATEPAGE_SUCCESS; } spin_lock_irq(&mapping->tree_lock); @@ -389,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); - return 0; + return MIGRATEPAGE_SUCCESS; } /* @@ -476,11 +476,11 @@ int migrate_page(struct address_space *mapping, rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; migrate_page_copy(newpage, page); - return 0; + return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -503,7 +503,7 @@ int buffer_migrate_page(struct address_space *mapping, rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; /* @@ -539,7 +539,7 @@ int buffer_migrate_page(struct address_space *mapping, } while (bh != head); - return 0; + return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(buffer_migrate_page); #endif @@ -618,7 +618,7 @@ static int fallback_migrate_page(struct address_space *mapping, * * Return value: * < 0 - error code - * == 0 - success + * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, int remap_swapcache, enum migrate_mode mode) @@ -655,7 +655,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, else rc = fallback_migrate_page(mapping, newpage, page, mode); - if (rc) { + if (rc != 
MIGRATEPAGE_SUCCESS) { newpage->mapping = NULL; } else { if (remap_swapcache) @@ -804,7 +804,7 @@ skip_unmap: put_anon_vma(anon_vma); uncharge: - mem_cgroup_end_migration(mem, page, newpage, rc == 0); + mem_cgroup_end_migration(mem, page, newpage, rc == MIGRATEPAGE_SUCCESS); unlock: unlock_page(page); out: @@ -977,7 +977,7 @@ int migrate_pages(struct list_head *from, case -EAGAIN: retry++; break; - case 0: + case MIGRATEPAGE_SUCCESS: break; default: /* Permanent failure */ @@ -986,15 +986,12 @@ int migrate_pages(struct list_head *from, } } } - rc = 0; + rc = nr_failed + retry; out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - if (rc) - return rc; - - return nr_failed + retry; + return rc; } int migrate_huge_page(struct page *hpage, new_page_t get_new_page, @@ -1014,7 +1011,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page, /* try again */ cond_resched(); break; - case 0: + case MIGRATEPAGE_SUCCESS: goto out; default: rc = -EIO; -- cgit v1.3-8-gc7d7 From 18468d93e53b037e1a04ec58398eab763d054064 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:38 -0800 Subject: mm: introduce a common interface for balloon pages mobility Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch introduces a common interface to help a balloon driver on making its page set movable to compaction, and thus allowing the system to better leverage the compation efforts on memory defragmentation. [akpm@linux-foundation.org: use PAGE_FLAGS_CHECK_AT_PREP, s/__balloon_page_flags/page_flags_cleared/, small cleanups] [rientjes@google.com: allow balloon compaction for any system with memory compaction enabled, which is the defconfig] Signed-off-by: Rafael Aquini Acked-by: Mel Gorman Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Rik van Riel Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 272 +++++++++++++++++++++++++++++++++ include/linux/migrate.h | 10 ++ include/linux/pagemap.h | 16 ++ mm/Kconfig | 15 ++ mm/Makefile | 3 +- mm/balloon_compaction.c | 302 +++++++++++++++++++++++++++++++++++++ 6 files changed, 617 insertions(+), 1 deletion(-) create mode 100644 include/linux/balloon_compaction.h create mode 100644 mm/balloon_compaction.c (limited to 'include/linux/migrate.h') diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h new file mode 100644 index 000000000000..f7f1d7169b11 --- /dev/null +++ b/include/linux/balloon_compaction.h @@ -0,0 +1,272 @@ +/* + * include/linux/balloon_compaction.h + * + * Common interface definitions for making balloon pages movable by compaction. + * + * Despite being perfectly possible to perform ballooned pages migration, they + * make a special corner case to compaction scans because balloon pages are not + * enlisted at any LRU list like the other pages we do compact / migrate. + * + * As the page isolation scanning step a compaction thread does is a lockless + * procedure (from a page standpoint), it might bring some racy situations while + * performing balloon page compaction. 
In order to sort out these racy scenarios + * and safely perform balloon's page compaction and migration we must, always, + * ensure following these three simple rules: + * + * i. when updating a balloon's page ->mapping element, strictly do it under + * the following lock order, independently of the far superior + * locking scheme (lru_lock, balloon_lock): + * +-page_lock(page); + * +--spin_lock_irq(&b_dev_info->pages_lock); + * ... page->mapping updates here ... + * + * ii. before isolating or dequeueing a balloon page from the balloon device + * pages list, the page reference counter must be raised by one and the + * extra refcount must be dropped when the page is enqueued back into + * the balloon device page list, thus a balloon page keeps its reference + * counter raised only while it is under our special handling; + * + * iii. after the lockless scan step has selected a potential balloon page for + * isolation, re-test the page->mapping flags and the page ref counter + * under the proper page lock, to ensure isolating a valid balloon page + * (not yet isolated, nor under release procedure) + * + * The functions provided by this interface are there to help cope with + * the aforementioned balloon page corner case, as well as to ensure the simple + * set of exposed rules is satisfied while we are dealing with balloon pages + * compaction / migration. + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini + */ +#ifndef _LINUX_BALLOON_COMPACTION_H +#define _LINUX_BALLOON_COMPACTION_H +#include +#include +#include +#include +#include + +/* + * Balloon device information descriptor. + * This struct is used to allow the common balloon compaction interface + * procedures to find the proper balloon device holding memory pages they'll + * have to deal with for page compaction / migration, as well as it serves the + * balloon driver as a page book-keeper for its registered balloon devices. + */ +struct balloon_dev_info { + void *balloon_device; /* balloon device descriptor */ + struct address_space *mapping; /* balloon special page->mapping */ + unsigned long isolated_pages; /* # of isolated pages for migration */ + spinlock_t pages_lock; /* Protection to pages list */ + struct list_head pages; /* Pages enqueued & handled to Host */ +}; + +extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); +extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); +extern struct balloon_dev_info *balloon_devinfo_alloc( + void *balloon_dev_descriptor); + +static inline void balloon_devinfo_free(struct balloon_dev_info *b_dev_info) +{ + kfree(b_dev_info); +} + +/* + * balloon_page_free - release a balloon page back to the page free lists + * @page: ballooned page to be set free + * + * This function must be used to properly set free an isolated/dequeued balloon + * page at the end of a successful page migration, or at the balloon driver's + * page release procedure. + */ +static inline void balloon_page_free(struct page *page) +{ + /* + * Balloon pages always get an extra refcount before being isolated + * and before being dequeued to help on sorting out fortuitous collisions + * between a thread attempting to isolate and another thread attempting + * to release the very same balloon page. + * + * Before we hand the page back to Buddy, let's drop its extra refcnt.
+ */ + put_page(page); + __free_page(page); +} + +#ifdef CONFIG_BALLOON_COMPACTION +extern bool balloon_page_isolate(struct page *page); +extern void balloon_page_putback(struct page *page); +extern int balloon_page_migrate(struct page *newpage, + struct page *page, enum migrate_mode mode); +extern struct address_space +*balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, + const struct address_space_operations *a_ops); + +static inline void balloon_mapping_free(struct address_space *balloon_mapping) +{ + kfree(balloon_mapping); +} + +/* + * page_flags_cleared - helper to perform balloon @page ->flags tests. + * + * As balloon pages are obtained from buddy and we do not play with page->flags + * at driver level (exception made when we get the page lock for compaction), + * we can safely identify a ballooned page by checking if the + * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared. This approach also + * helps us skip ballooned pages that are locked for compaction or release, thus + * mitigating their racy check at balloon_page_movable() + */ +static inline bool page_flags_cleared(struct page *page) +{ + return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP); +} + +/* + * __is_movable_balloon_page - helper to perform @page mapping->flags tests + */ +static inline bool __is_movable_balloon_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + return mapping_balloon(mapping); +} + +/* + * balloon_page_movable - test page->mapping->flags to identify balloon pages + * that can be moved by compaction/migration. + * + * This function is used at core compaction's page isolation scheme, therefore + * most pages exposed to it are not enlisted as balloon pages and so, to avoid + * undesired side effects like racing against __free_pages(), we cannot afford + * holding the page locked while testing page->mapping->flags here. + * + * As we might return false positives in the case of a balloon page being just + * released under us, the page->mapping->flags need to be re-tested later, + * under the proper page lock, at the functions that will be coping with the + * balloon page case. + */ +static inline bool balloon_page_movable(struct page *page) +{ + /* + * Before dereferencing and testing mapping->flags, let's make sure + * this is not a page that uses ->mapping in a different way + */ + if (page_flags_cleared(page) && !page_mapped(page) && + page_count(page) == 1) + return __is_movable_balloon_page(page); + + return false; +} + +/* + * balloon_page_insert - insert a page into the balloon's page list and make + * the page->mapping assignment accordingly. + * @page : page to be assigned as a 'balloon page' + * @mapping : allocated special 'balloon_mapping' + * @head : balloon's device page list head + * + * Caller must ensure the page is locked and the spin_lock protecting balloon + * pages list is held before inserting a page into the balloon device. + */ +static inline void balloon_page_insert(struct page *page, + struct address_space *mapping, + struct list_head *head) +{ + page->mapping = mapping; + list_add(&page->lru, head); +} + +/* + * balloon_page_delete - delete a page from balloon's page list and clear + * the page->mapping assignement accordingly. + * @page : page to be released from balloon's page list + * + * Caller must ensure the page is locked and the spin_lock protecting balloon + * pages list is held before deleting a page from the balloon device. 
+ */ +static inline void balloon_page_delete(struct page *page) +{ + page->mapping = NULL; + list_del(&page->lru); +} + +/* + * balloon_page_device - get the b_dev_info descriptor for the balloon device + * that enqueues the given page. + */ +static inline struct balloon_dev_info *balloon_page_device(struct page *page) +{ + struct address_space *mapping = page->mapping; + if (likely(mapping)) + return mapping->private_data; + + return NULL; +} + +static inline gfp_t balloon_mapping_gfp_mask(void) +{ + return GFP_HIGHUSER_MOVABLE; +} + +static inline bool balloon_compaction_check(void) +{ + return true; +} + +#else /* !CONFIG_BALLOON_COMPACTION */ + +static inline void *balloon_mapping_alloc(void *balloon_device, + const struct address_space_operations *a_ops) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void balloon_mapping_free(struct address_space *balloon_mapping) +{ + return; +} + +static inline void balloon_page_insert(struct page *page, + struct address_space *mapping, + struct list_head *head) +{ + list_add(&page->lru, head); +} + +static inline void balloon_page_delete(struct page *page) +{ + list_del(&page->lru); +} + +static inline bool balloon_page_movable(struct page *page) +{ + return false; +} + +static inline bool balloon_page_isolate(struct page *page) +{ + return false; +} + +static inline void balloon_page_putback(struct page *page) +{ + return; +} + +static inline int balloon_page_migrate(struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + return 0; +} + +static inline gfp_t balloon_mapping_gfp_mask(void) +{ + return GFP_HIGHUSER; +} + +static inline bool balloon_compaction_check(void) +{ + return false; +} +#endif /* CONFIG_BALLOON_COMPACTION */ +#endif /* _LINUX_BALLOON_COMPACTION_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a4e886d17f87..ce42847ed35f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -11,8 +11,18 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); * Return values from addresss_space_operations.migratepage(): * - negative errno on page migration failure; * - zero on page migration success; + * + * The balloon page migration introduces this special case where a 'distinct' + * return code is used to flag a successful page migration to unmap_and_move(). + * This approach is necessary because page migration can race against balloon + * deflation procedure, and for such case we could introduce a nasty page leak + * if a successfully migrated balloon page gets released concurrently with + * migration's unmap_and_move() wrap-up steps. */ #define MIGRATEPAGE_SUCCESS 0 +#define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page + * sucessful migration case. 
+ */ #ifdef CONFIG_MIGRATION diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e42c762f0dc7..6da609d14c15 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -24,6 +24,7 @@ enum mapping_flags { AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ + AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */ }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -53,6 +54,21 @@ static inline int mapping_unevictable(struct address_space *mapping) return !!mapping; } +static inline void mapping_set_balloon(struct address_space *mapping) +{ + set_bit(AS_BALLOON_MAP, &mapping->flags); +} + +static inline void mapping_clear_balloon(struct address_space *mapping) +{ + clear_bit(AS_BALLOON_MAP, &mapping->flags); +} + +static inline int mapping_balloon(struct address_space *mapping) +{ + return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return (__force gfp_t)mapping->flags & __GFP_BITS_MASK; diff --git a/mm/Kconfig b/mm/Kconfig index a3f8dddaaab3..e6651c5de14f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,6 +187,21 @@ config SPLIT_PTLOCK_CPUS default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" +# +# support for memory balloon compaction +config BALLOON_COMPACTION + bool "Allow for balloon memory compaction/migration" + def_bool y + depends on COMPACTION && VIRTIO_BALLOON + help + Memory fragmentation introduced by ballooning might reduce + significantly the number of 2MB contiguous memory blocks that can be + used within a guest, thus imposing performance penalties associated + with the reduced number of transparent huge pages that could be used + by the guest workload. Allowing the compaction & migration for memory + pages enlisted as being part of memory balloon devices avoids the + scenario aforementioned and helps improving memory defragmentation. + # # support for memory compaction config COMPACTION diff --git a/mm/Makefile b/mm/Makefile index 6b025f80af34..3a4628751f89 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o interval_tree.o $(mmu-y) + compaction.o balloon_compaction.o \ + interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c new file mode 100644 index 000000000000..07dbc8ec46cf --- /dev/null +++ b/mm/balloon_compaction.c @@ -0,0 +1,302 @@ +/* + * mm/balloon_compaction.c + * + * Common interface for making balloon pages movable by compaction. + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini + */ +#include +#include +#include +#include + +/* + * balloon_devinfo_alloc - allocates a balloon device information descriptor. + * @balloon_dev_descriptor: pointer to reference the balloon device which + * this struct balloon_dev_info will be servicing. + * + * Driver must call it to properly allocate and initialize an instance of + * struct balloon_dev_info which will be used to reference a balloon device + * as well as to keep track of the balloon device page list. 
+ */ +struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) +{ + struct balloon_dev_info *b_dev_info; + b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); + if (!b_dev_info) + return ERR_PTR(-ENOMEM); + + b_dev_info->balloon_device = balloon_dev_descriptor; + b_dev_info->mapping = NULL; + b_dev_info->isolated_pages = 0; + spin_lock_init(&b_dev_info->pages_lock); + INIT_LIST_HEAD(&b_dev_info->pages); + + return b_dev_info; +} +EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); + +/* + * balloon_page_enqueue - allocates a new page and inserts it into the balloon + * page list. + * @b_dev_info: balloon device decriptor where we will insert a new page to + * + * Driver must call it to properly allocate a new enlisted balloon page + * before definetively removing it from the guest system. + * This function returns the page address for the recently enqueued page or + * NULL in the case we fail to allocate a new page this turn. + */ +struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) +{ + unsigned long flags; + struct page *page = alloc_page(balloon_mapping_gfp_mask() | + __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!page) + return NULL; + + /* + * Block others from accessing the 'page' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'page' at this point. + */ + BUG_ON(!trylock_page(page)); + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_enqueue); + +/* + * balloon_page_dequeue - removes a page from balloon's page list and returns + * the its address to allow the driver release the page. + * @b_dev_info: balloon device decriptor where we will grab a page from. + * + * Driver must call it to properly de-allocate a previous enlisted balloon page + * before definetively releasing it back to the guest system. + * This function returns the page address for the recently dequeued page or + * NULL in the case we find balloon's page list temporarily empty due to + * compaction isolated pages. + */ +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) +{ + struct page *page, *tmp; + unsigned long flags; + bool dequeued_page; + + dequeued_page = false; + list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { + /* + * Block others from accessing the 'page' while we get around + * establishing additional references and preparing the 'page' + * to be released by the balloon driver. + */ + if (trylock_page(page)) { + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + /* + * Raise the page refcount here to prevent any wrong + * attempt to isolate this page, in case of coliding + * with balloon_page_isolate() just after we release + * the page lock. + * + * balloon_page_free() will take care of dropping + * this extra refcount later. + */ + get_page(page); + balloon_page_delete(page); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + dequeued_page = true; + break; + } + } + + if (!dequeued_page) { + /* + * If we are unable to dequeue a balloon page because the page + * list is empty and there is no isolated pages, then something + * went out of track and some balloon pages are lost. + * BUG() here, otherwise the balloon driver may get stuck into + * an infinite loop while attempting to release all its pages. 
+ */ + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + if (unlikely(list_empty(&b_dev_info->pages) && + !b_dev_info->isolated_pages)) + BUG(); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + page = NULL; + } + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_dequeue); + +#ifdef CONFIG_BALLOON_COMPACTION +/* + * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. + * @b_dev_info: holds the balloon device information descriptor. + * @a_ops: balloon_mapping address_space_operations descriptor. + * + * Driver must call it to properly allocate and initialize an instance of + * struct address_space which will be used as the special page->mapping for + * balloon device enlisted page instances. + */ +struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, + const struct address_space_operations *a_ops) +{ + struct address_space *mapping; + + mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return ERR_PTR(-ENOMEM); + + /* + * Give a clean 'zeroed' status to all elements of this special + * balloon page->mapping struct address_space instance. + */ + address_space_init_once(mapping); + + /* + * Set mapping->flags appropriately, to allow balloon pages + * ->mapping identification. + */ + mapping_set_balloon(mapping); + mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); + + /* balloon's page->mapping->a_ops callback descriptor */ + mapping->a_ops = a_ops; + + /* + * Establish a pointer reference back to the balloon device descriptor + * this particular page->mapping will be servicing. + * This is used by compaction / migration procedures to identify and + * access the balloon device pageset while isolating / migrating pages. + * + * As some balloon drivers can register multiple balloon devices + * for a single guest, this also helps compaction / migration to + * properly deal with multiple balloon pagesets, when required. + */ + mapping->private_data = b_dev_info; + b_dev_info->mapping = mapping; + + return mapping; +} +EXPORT_SYMBOL_GPL(balloon_mapping_alloc); + +static inline void __isolate_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = page->mapping->private_data; + unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_del(&page->lru); + b_dev_info->isolated_pages++; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline void __putback_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = page->mapping->private_data; + unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_add(&page->lru, &b_dev_info->pages); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline int __migrate_balloon_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); +} + +/* __isolate_lru_page() counterpart for a ballooned page */ +bool balloon_page_isolate(struct page *page) +{ + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a balloon page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * release this page, thus avoiding a nasty leakage. 
+
+static inline void __isolate_balloon_page(struct page *page)
+{
+	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+	unsigned long flags;
+	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	list_del(&page->lru);
+	b_dev_info->isolated_pages++;
+	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline void __putback_balloon_page(struct page *page)
+{
+	struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+	unsigned long flags;
+	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+	list_add(&page->lru, &b_dev_info->pages);
+	b_dev_info->isolated_pages--;
+	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+}
+
+static inline int __migrate_balloon_page(struct address_space *mapping,
+		struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+	return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
+}
+
+/* __isolate_lru_page() counterpart for a ballooned page */
+bool balloon_page_isolate(struct page *page)
+{
+	/*
+	 * Avoid burning cycles with pages that are still under __free_pages(),
+	 * or that just got freed under us.
+	 *
+	 * In case we 'win' a race for a balloon page being freed under us and
+	 * raise its refcount, preventing __free_pages() from doing its job,
+	 * the put_page() at the end of this block will take care of releasing
+	 * this page, thus avoiding a nasty leak.
+	 */
+	if (likely(get_page_unless_zero(page))) {
+		/*
+		 * As balloon pages are not isolated from LRU lists, concurrent
+		 * compaction threads can race against page migration functions
+		 * as well as race against the balloon driver releasing a page.
+		 *
+		 * In order to avoid having an already isolated balloon page
+		 * being (wrongly) re-isolated while it is under migration,
+		 * or to avoid attempting to isolate pages being released by
+		 * the balloon driver, let's be sure we have the page lock
+		 * before proceeding with the balloon page isolation steps.
+		 */
+		if (likely(trylock_page(page))) {
+			/*
+			 * A ballooned page, by default, has just one refcount.
+			 * Prevent concurrent compaction threads from isolating
+			 * an already isolated balloon page by refcount check.
+			 */
+			if (__is_movable_balloon_page(page) &&
+			    page_count(page) == 2) {
+				__isolate_balloon_page(page);
+				unlock_page(page);
+				return true;
+			}
+			unlock_page(page);
+		}
+		put_page(page);
+	}
+	return false;
+}
+
+/* putback_lru_page() counterpart for a ballooned page */
+void balloon_page_putback(struct page *page)
+{
+	/*
+	 * 'lock_page()' stabilizes the page and prevents races against
+	 * concurrent isolation threads attempting to re-isolate it.
+	 */
+	lock_page(page);
+
+	if (__is_movable_balloon_page(page)) {
+		__putback_balloon_page(page);
+		/* drop the extra ref count taken for page isolation */
+		put_page(page);
+	} else {
+		WARN_ON(1);
+		dump_page(page);
+	}
+	unlock_page(page);
+}
+
+/* move_to_new_page() counterpart for a ballooned page */
+int balloon_page_migrate(struct page *newpage,
+			 struct page *page, enum migrate_mode mode)
+{
+	struct address_space *mapping;
+	int rc = -EAGAIN;
+
+	/*
+	 * Block others from accessing the 'newpage' when we get around to
+	 * establishing additional references. We should be the only one
+	 * holding a reference to the 'newpage' at this point.
+	 */
+	BUG_ON(!trylock_page(newpage));
+
+	if (WARN_ON(!__is_movable_balloon_page(page))) {
+		dump_page(page);
+		unlock_page(newpage);
+		return rc;
+	}
+
+	mapping = page->mapping;
+	if (mapping)
+		rc = __migrate_balloon_page(mapping, newpage, page, mode);
+
+	unlock_page(newpage);
+	return rc;
+}
+#endif /* CONFIG_BALLOON_COMPACTION */
-- cgit v1.3-8-gc7d7
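
As context for how the pieces above fit together: balloon_page_migrate() reaches the driver through __migrate_balloon_page(), which invokes the a_ops->migratepage callback registered via balloon_mapping_alloc(). Below is a minimal, hedged sketch of such a callback; the name my_balloon_migratepage() and the host-notification steps are illustrative assumptions, not part of this patch, and a real driver (e.g. virtio_balloon) also has to coordinate with its own device state.

static int my_balloon_migratepage(struct address_space *mapping,
		struct page *newpage, struct page *page,
		enum migrate_mode mode)
{
	struct balloon_dev_info *b_dev_info = mapping->private_data;
	unsigned long flags;

	/* Step 1: enlist 'newpage' as the ballooned page ("inflate"). */
	spin_lock_irqsave(&b_dev_info->pages_lock, flags);
	balloon_page_insert(newpage, mapping, &b_dev_info->pages);
	b_dev_info->isolated_pages--;
	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
	/* ... tell the host that 'newpage' now backs the balloon ... */

	/* Step 2: unlink the old 'page' ("deflate") so migration can free it. */
	balloon_page_delete(page);
	/* ... tell the host that 'page' is returning to the guest ... */

	return 0;	/* success code assumed for this sketch */
}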
From 5733c7d11dff44e98d2ca16617886a78086b354f Mon Sep 17 00:00:00 2001
From: Rafael Aquini
Date: Tue, 11 Dec 2012 16:02:47 -0800
Subject: mm: introduce putback_movable_pages()

The patch "mm: introduce compaction and migration for virtio ballooned
pages" hacks around putback_lru_pages() in order to allow ballooned pages
to be re-inserted on the balloon page list as if a ballooned page were an
LRU page. As ballooned pages are not legitimate LRU pages, this patch
introduces putback_movable_pages() to properly cope with cases where the
isolated pageset contains both ballooned pages and LRU pages, thus fixing
the mentioned inelegant hack around putback_lru_pages().

Signed-off-by: Rafael Aquini
Cc: Rusty Russell
Cc: "Michael S. Tsirkin"
Cc: Rik van Riel
Cc: Mel Gorman
Cc: Andi Kleen
Cc: Konrad Rzeszutek Wilk
Cc: Minchan Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/migrate.h |  2 ++
 mm/compaction.c         |  6 +++---
 mm/migrate.c            | 20 ++++++++++++++++++++
 mm/page_alloc.c         |  2 +-
 4 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include/linux/migrate.h')

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ce42847ed35f..0b5865c61efd 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -27,6 +27,7 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 #ifdef CONFIG_MIGRATION

 extern void putback_lru_pages(struct list_head *l);
+extern void putback_movable_pages(struct list_head *l);
 extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
@@ -50,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 #else

 static inline void putback_lru_pages(struct list_head *l) {}
+static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, bool offlining,
 		enum migrate_mode mode) { return -ENOSYS; }
diff --git a/mm/compaction.c b/mm/compaction.c
index 470474c03b61..d24dd2d7bad4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1003,7 +1003,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		switch (isolate_migratepages(zone, cc)) {
 		case ISOLATE_ABORT:
 			ret = COMPACT_PARTIAL;
-			putback_lru_pages(&cc->migratepages);
+			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
 			goto out;
 		case ISOLATE_NONE:
@@ -1026,9 +1026,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
 						nr_remaining);

-		/* Release LRU pages not migrated */
+		/* Release isolated pages not migrated */
 		if (err) {
-			putback_lru_pages(&cc->migratepages);
+			putback_movable_pages(&cc->migratepages);
 			cc->nr_migratepages = 0;
 			if (err == -ENOMEM) {
 				ret = COMPACT_PARTIAL;
diff --git a/mm/migrate.c b/mm/migrate.c
index 427343c0c296..3f675ca08279 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -76,6 +76,26 @@ void putback_lru_pages(struct list_head *l)
 	struct page *page;
 	struct page *page2;

+	list_for_each_entry_safe(page, page2, l, lru) {
+		list_del(&page->lru);
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				page_is_file_cache(page));
+		putback_lru_page(page);
+	}
+}
+
+/*
+ * Put previously isolated pages back onto the appropriate lists
+ * from which they were once taken for compaction/migration.
+ *
+ * This function shall be used instead of putback_lru_pages() whenever the
+ * isolated pageset has been built by isolate_migratepages_range().
+ */
+void putback_movable_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+
 	list_for_each_entry_safe(page, page2, l, lru) {
 		list_del(&page->lru);
 		dec_zone_page_state(page, NR_ISOLATED_ANON +
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5a7b7611d4e4..d9fbac1b5030 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5761,7 +5761,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 				    0, false, MIGRATE_SYNC);
 	}

-	putback_lru_pages(&cc->migratepages);
+	putback_movable_pages(&cc->migratepages);

 	return ret > 0 ? 0 : ret;
 }
-- cgit v1.3-8-gc7d7
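
To make the new calling convention concrete, here is a minimal sketch of the pattern the compaction and CMA callers above follow: any pageset built by isolate_migratepages_range() must be drained through putback_movable_pages() when migration fails, since the leftovers may mix LRU and ballooned pages. The new_page_t callback new_page() and the surrounding variables are assumptions for illustration.

	LIST_HEAD(pages);
	int err;

	/* ... populate 'pages' via isolate_migratepages_range() ... */

	err = migrate_pages(&pages, new_page, 0, false, MIGRATE_SYNC);
	if (err) {
		/*
		 * Failed pages remain on the list; put them back via
		 * putback_movable_pages(), not putback_lru_pages(), so
		 * ballooned pages return to the balloon list rather than
		 * being treated as LRU pages.
		 */
		putback_movable_pages(&pages);
	}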