From 97a225e69a1f880886f33d2e65a7ace13f152caa Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 3 Jun 2020 15:59:01 -0700 Subject: mm/page_alloc: integrate classzone_idx and high_zoneidx classzone_idx is just different name for high_zoneidx now. So, integrate them and add some comment to struct alloc_context in order to reduce future confusion about the meaning of this variable. The accessor, ac_classzone_idx() is also removed since it isn't needed after integration. In addition to integration, this patch also renames high_zoneidx to highest_zoneidx since it represents more precise meaning. Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Ye Xiaolong Link: http://lkml.kernel.org/r/1587095923-7515-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 22 +++++++++++++--------- include/trace/events/vmscan.h | 14 +++++++++----- 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e5bf6ee4e814..54e5bf081171 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -314,40 +314,44 @@ TRACE_EVENT(mm_compaction_kcompactd_sleep, DECLARE_EVENT_CLASS(kcompactd_wake_template, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx), + TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) - __field(enum zone_type, classzone_idx) + __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; ), + /* + * classzone_idx is previous name of the highest_zoneidx. + * Reason not to change it is the ABI requirement of the tracepoint. + */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, - __print_symbolic(__entry->classzone_idx, ZONE_TYPE)) + __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); #endif diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 74bb594ccb25..2070df64958e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end, ); TRACE_EVENT(mm_vmscan_lru_isolate, - TP_PROTO(int classzone_idx, + TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, @@ -274,10 +274,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate, isolate_mode_t isolate_mode, int lru), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), + TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), TP_STRUCT__entry( - __field(int, classzone_idx) + __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) @@ -288,7 +288,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, ), TP_fast_assign( - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; @@ -298,9 +298,13 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->lru = lru; ), + /* + * classzone is previous name of the highest_zoneidx. + * Reason not to change it is the ABI requirement of the tracepoint. + */ TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->isolate_mode, - __entry->classzone_idx, + __entry->highest_zoneidx, __entry->order, __entry->nr_requested, __entry->nr_scanned, -- cgit v1.2.3-59-g8ed1b From 71a2c112a0f6da497e1b44e18e97b1716c240518 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 3 Jun 2020 16:00:30 -0700 Subject: khugepaged: introduce 'max_ptes_shared' tunable 'max_ptes_shared' specifies how many pages can be shared across multiple processes. Exceeding the number would block the collapse:: /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared A higher value may increase memory footprint for some workloads. By default, at least half of pages has to be not shared. [colin.king@canonical.com: fix several spelling mistakes] Link: http://lkml.kernel.org/r/20200420084241.65433-1-colin.king@canonical.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Tested-by: Zi Yan Reviewed-by: William Kucharski Reviewed-by: Zi Yan Acked-by: Yang Shi Cc: Andrea Arcangeli Cc: John Hubbard Cc: Mike Kravetz Cc: Ralph Campbell Link: http://lkml.kernel.org/r/20200416160026.16538-9-kirill.shutemov@linux.intel.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/transhuge.rst | 7 +++ include/trace/events/huge_memory.h | 3 +- mm/khugepaged.c | 52 +++++++++++++++++-- tools/testing/selftests/vm/khugepaged.c | 83 ++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 5 deletions(-) (limited to 'include/trace') diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 2f31de8f7c74..6a233e42be08 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being collapsed, resulting fewer pages being collapsed into THPs, and lower memory access performance. +``max_ptes_shared`` specifies how many pages can be shared across multiple +processes. Exceeding the number would block the collapse:: + + /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared + +A higher value may increase memory footprint for some workloads. + Boot parameter ============== diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 70e32ff096ec..4fdb14a81108 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -12,6 +12,8 @@ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ + EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ + EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PAGE_RO, "no_writable_page") \ @@ -31,7 +33,6 @@ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ - EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ EM( SCAN_TRUNCATED, "truncated") \ EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1bae66327f82..e7897a7dfbec 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,8 @@ enum scan_result { SCAN_SUCCEED, SCAN_PMD_NULL, SCAN_EXCEED_NONE_PTE, + SCAN_EXCEED_SWAP_PTE, + SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PAGE_RO, @@ -47,7 +49,6 @@ enum scan_result { SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, - SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, SCAN_PAGE_HAS_PRIVATE, }; @@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; +static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr = __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, khugepaged_max_ptes_swap_store); +static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_shared); +} + +static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_shared; + + err = kstrtoul(buf, 10, &max_ptes_shared); + if (err || max_ptes_shared > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_shared = max_ptes_shared; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_shared_attr = + __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show, + khugepaged_max_ptes_shared_store); + static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, + &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, - &khugepaged_max_ptes_swap_attr.attr, NULL, }; @@ -359,6 +389,7 @@ int __init khugepaged_init(void) khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } @@ -557,7 +588,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; @@ -585,6 +616,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + goto out; + } + if (PageCompound(page)) { struct page *p; page = compound_head(page); @@ -1168,7 +1205,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0, result = 0, referenced = 0; + int ret = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; spinlock_t *ptl; @@ -1240,6 +1278,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } + if (page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + goto out_unmap; + } + page = compound_head(page); /* diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index ef67a8a3d784..51b89cedd09d 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -78,6 +78,7 @@ struct khugepaged_settings { unsigned int scan_sleep_millisecs; unsigned int max_ptes_none; unsigned int max_ptes_swap; + unsigned int max_ptes_shared; unsigned long pages_to_scan; }; @@ -277,6 +278,7 @@ static void write_settings(struct settings *settings) khugepaged->scan_sleep_millisecs); write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); } @@ -313,6 +315,7 @@ static void save_settings(void) read_num("khugepaged/scan_sleep_millisecs"), .max_ptes_none = read_num("khugepaged/max_ptes_none"), .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), .pages_to_scan = read_num("khugepaged/pages_to_scan"), }; success("OK"); @@ -896,12 +899,90 @@ static void collapse_fork_compound(void) fail("Fail"); fill_memory(p, 0, page_size); + write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) fail("Timeout"); else if (check_huge(p)) success("OK"); else fail("Fail"); + write_num("khugepaged/max_ptes_shared", + default_settings.khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared() +{ + int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_mapping(); + + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) + fail("Timeout"); + else if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + + if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) + fail("Timeout"); + else if (check_huge(p)) + success("OK"); + else + fail("Fail"); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); @@ -930,6 +1011,7 @@ int main(void) default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; save_settings(); @@ -947,6 +1029,7 @@ int main(void) collapse_compound_extreme(); collapse_fork(); collapse_fork_compound(); + collapse_max_ptes_shared(); restore_settings(0); } -- cgit v1.2.3-59-g8ed1b