aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/cma.c40
-rw-r--r--mm/cma.h4
-rw-r--r--mm/compaction.c187
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/gup.c181
-rw-r--r--mm/hmm.c5
-rw-r--r--mm/huge_memory.c23
-rw-r--r--mm/hugetlb.c91
-rw-r--r--mm/internal.h9
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/ksm.c3
-rw-r--r--mm/maccess.c22
-rw-r--r--mm/memcontrol.c42
-rw-r--r--mm/memory-failure.c7
-rw-r--r--mm/memory.c93
-rw-r--r--mm/memory_hotplug.c30
-rw-r--r--mm/mempolicy.c37
-rw-r--r--mm/migrate.c123
-rw-r--r--mm/mmu_notifier.c9
-rw-r--r--mm/nommu.c4
-rw-r--r--mm/oom_kill.c24
-rw-r--r--mm/page_alloc.c14
-rw-r--r--mm/page_isolation.c5
-rw-r--r--mm/percpu-internal.h55
-rw-r--r--mm/percpu-km.c5
-rw-r--r--mm/percpu-stats.c36
-rw-r--r--mm/percpu-vm.c5
-rw-r--r--mm/percpu.c208
-rw-r--r--mm/process_vm_access.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/slab_common.c2
-rw-r--r--mm/swap.c13
-rw-r--r--mm/swap_state.c78
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/usercopy.c2
-rw-r--r--mm/userfaultfd.c2
-rw-r--r--mm/vmscan.c30
-rw-r--r--mm/vmstat.c30
-rw-r--r--mm/workingset.c23
-rw-r--r--mm/zpool.c8
-rw-r--r--mm/zsmalloc.c2
42 files changed, 1017 insertions, 452 deletions
diff --git a/mm/cma.c b/mm/cma.c
index 26ecff818881..7f415d7cda9f 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -52,7 +52,7 @@ unsigned long cma_get_size(const struct cma *cma)
const char *cma_get_name(const struct cma *cma)
{
- return cma->name ? cma->name : "(undefined)";
+ return cma->name;
}
static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
@@ -93,17 +93,15 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
mutex_unlock(&cma->lock);
}
-static int __init cma_activate_area(struct cma *cma)
+static void __init cma_activate_area(struct cma *cma)
{
unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
unsigned i = cma->count >> pageblock_order;
struct zone *zone;
cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
- if (!cma->bitmap) {
- cma->count = 0;
- return -ENOMEM;
- }
+ if (!cma->bitmap)
+ goto out_error;
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -133,25 +131,22 @@ static int __init cma_activate_area(struct cma *cma)
spin_lock_init(&cma->mem_head_lock);
#endif
- return 0;
+ return;
not_in_zone:
- pr_err("CMA area %s could not be activated\n", cma->name);
bitmap_free(cma->bitmap);
+out_error:
cma->count = 0;
- return -EINVAL;
+ pr_err("CMA area %s could not be activated\n", cma->name);
+ return;
}
static int __init cma_init_reserved_areas(void)
{
int i;
- for (i = 0; i < cma_area_count; i++) {
- int ret = cma_activate_area(&cma_areas[i]);
-
- if (ret)
- return ret;
- }
+ for (i = 0; i < cma_area_count; i++)
+ cma_activate_area(&cma_areas[i]);
return 0;
}
@@ -202,13 +197,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
* subsystems (like slab allocator) are available.
*/
cma = &cma_areas[cma_area_count];
- if (name) {
- cma->name = name;
- } else {
- cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count);
- if (!cma->name)
- return -ENOMEM;
- }
+
+ if (name)
+ snprintf(cma->name, CMA_MAX_NAME, name);
+ else
+ snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+
cma->base_pfn = PFN_DOWN(base);
cma->count = size >> PAGE_SHIFT;
cma->order_per_bit = order_per_bit;
@@ -425,7 +419,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
struct page *page = NULL;
int ret = -ENOMEM;
- if (!cma || !cma->count)
+ if (!cma || !cma->count || !cma->bitmap)
return NULL;
pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
diff --git a/mm/cma.h b/mm/cma.h
index 6698fa63279b..20f6e24bc477 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -4,6 +4,8 @@
#include <linux/debugfs.h>
+#define CMA_MAX_NAME 64
+
struct cma {
unsigned long base_pfn;
unsigned long count;
@@ -15,7 +17,7 @@ struct cma {
spinlock_t mem_head_lock;
struct debugfs_u32_array dfs_bitmap;
#endif
- const char *name;
+ char name[CMA_MAX_NAME];
};
extern struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/mm/compaction.c b/mm/compaction.c
index 86375605faa9..b89581bf859c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
+
+/*
+ * Page order with-respect-to which proactive compaction
+ * calculates external fragmentation, which is used as
+ * the "fragmentation score" of a node/zone.
+ */
+#if defined CONFIG_TRANSPARENT_HUGEPAGE
+#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER
+#elif defined CONFIG_HUGETLBFS
+#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER
+#else
+#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#endif
+
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
@@ -136,7 +154,7 @@ EXPORT_SYMBOL(__ClearPageMovable);
/*
* Compaction is deferred when compaction fails to result in a page
- * allocation success. 1 << compact_defer_limit compactions are skipped up
+ * allocation success. 1 << compact_defer_shift, compactions are skipped up
* to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
*/
void defer_compaction(struct zone *zone, int order)
@@ -1459,7 +1477,7 @@ static void isolate_freepages(struct compact_control *cc)
* this pfn aligned down to the pageblock boundary, because we do
* block_start_pfn -= pageblock_nr_pages in the for loop.
* For ending point, take care when isolating in last pageblock of a
- * a zone which ends in the middle of a pageblock.
+ * zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
@@ -1857,6 +1875,76 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
+static bool kswapd_is_running(pg_data_t *pgdat)
+{
+ return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+}
+
+/*
+ * A zone's fragmentation score is the external fragmentation wrt to the
+ * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value
+ * in the range [0, 100].
+ *
+ * The scaling factor ensures that proactive compaction focuses on larger
+ * zones like ZONE_NORMAL, rather than smaller, specialized zones like
+ * ZONE_DMA32. For smaller zones, the score value remains close to zero,
+ * and thus never exceeds the high threshold for proactive compaction.
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+ unsigned long score;
+
+ score = zone->present_pages *
+ extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+ return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
+}
+
+/*
+ * The per-node proactive (background) compaction process is started by its
+ * corresponding kcompactd thread when the node's fragmentation score
+ * exceeds the high threshold. The compaction process remains active till
+ * the node's score falls below the low threshold, or one of the back-off
+ * conditions is met.
+ */
+static unsigned int fragmentation_score_node(pg_data_t *pgdat)
+{
+ unsigned int score = 0;
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone;
+
+ zone = &pgdat->node_zones[zoneid];
+ score += fragmentation_score_zone(zone);
+ }
+
+ return score;
+}
+
+static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+{
+ unsigned int wmark_low;
+
+ /*
+ * Cap the low watermak to avoid excessive compaction
+ * activity in case a user sets the proactivess tunable
+ * close to 100 (maximum).
+ */
+ wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
+ return low ? wmark_low : min(wmark_low + 10, 100U);
+}
+
+static bool should_proactive_compact_node(pg_data_t *pgdat)
+{
+ int wmark_high;
+
+ if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
+ return false;
+
+ wmark_high = fragmentation_score_wmark(pgdat, false);
+ return fragmentation_score_node(pgdat) > wmark_high;
+}
+
static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
@@ -1883,6 +1971,25 @@ static enum compact_result __compact_finished(struct compact_control *cc)
return COMPACT_PARTIAL_SKIPPED;
}
+ if (cc->proactive_compaction) {
+ int score, wmark_low;
+ pg_data_t *pgdat;
+
+ pgdat = cc->zone->zone_pgdat;
+ if (kswapd_is_running(pgdat))
+ return COMPACT_PARTIAL_SKIPPED;
+
+ score = fragmentation_score_zone(cc->zone);
+ wmark_low = fragmentation_score_wmark(pgdat, true);
+
+ if (score > wmark_low)
+ ret = COMPACT_CONTINUE;
+ else
+ ret = COMPACT_SUCCESS;
+
+ goto out;
+ }
+
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
@@ -1941,6 +2048,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
}
}
+out:
if (cc->contended || fatal_signal_pending(current))
ret = COMPACT_CONTENDED;
@@ -2421,6 +2529,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
return rc;
}
+/*
+ * Compact all zones within a node till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable).
+ *
+ * It is possible that the function returns before reaching score targets
+ * due to various back-off conditions, such as, contention on per-node or
+ * per-zone locks.
+ */
+static void proactive_compact_node(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = -1,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+ .whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
+ .proactive_compaction = true,
+ };
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ cc.zone = zone;
+
+ compact_zone(&cc, NULL);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+}
/* Compact all zones within a node */
static void compact_node(int nid)
@@ -2468,6 +2611,13 @@ static void compact_nodes(void)
int sysctl_compact_memory;
/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+
+/*
* This is the entry point for compacting all nodes via
* /proc/sys/vm/compact_memory
*/
@@ -2646,6 +2796,7 @@ static int kcompactd(void *p)
{
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
+ unsigned int proactive_defer = 0;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -2661,12 +2812,34 @@ static int kcompactd(void *p)
unsigned long pflags;
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
- wait_event_freezable(pgdat->kcompactd_wait,
- kcompactd_work_requested(pgdat));
+ if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat),
+ msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+
+ psi_memstall_enter(&pflags);
+ kcompactd_do_work(pgdat);
+ psi_memstall_leave(&pflags);
+ continue;
+ }
- psi_memstall_enter(&pflags);
- kcompactd_do_work(pgdat);
- psi_memstall_leave(&pflags);
+ /* kcompactd wait timeout */
+ if (should_proactive_compact_node(pgdat)) {
+ unsigned int prev_score, score;
+
+ if (proactive_defer) {
+ proactive_defer--;
+ continue;
+ }
+ prev_score = fragmentation_score_node(pgdat);
+ proactive_compact_node(pgdat);
+ score = fragmentation_score_node(pgdat);
+ /*
+ * Defer proactive compaction if the fragmentation
+ * score did not go down i.e. no progress made.
+ */
+ proactive_defer = score < prev_score ?
+ 0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+ }
}
return 0;
diff --git a/mm/filemap.c b/mm/filemap.c
index f2bb5ff0293d..8e75bce0346d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2885,7 +2885,7 @@ filler:
* Case a, the page will be up to date when the page is unlocked.
* There is no need to serialise on the page lock here as the page
* is pinned so the lock gives no additional protection. Even if the
- * the page is truncated, the data is still valid if PageUptodate as
+ * page is truncated, the data is still valid if PageUptodate as
* it's a race vs truncate race.
* Case b, the page will not be up to date
* Case c, the page may be truncated but in itself, the data may still
diff --git a/mm/gup.c b/mm/gup.c
index d8a33dd1430d..39e58df6925d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -859,7 +859,7 @@ unmap:
* does not include FOLL_NOWAIT, the mmap_lock may be released. If it
* is, *@locked will be set to 0 and -EBUSY returned.
*/
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+static int faultin_page(struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *locked)
{
unsigned int fault_flags = 0;
@@ -884,7 +884,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_TRIED;
}
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -893,13 +893,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
BUG();
}
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
-
if (ret & VM_FAULT_RETRY) {
if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*locked = 0;
@@ -969,7 +962,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
/**
* __get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -1028,7 +1020,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1110,8 +1102,7 @@ retry:
page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
- ret = faultin_page(tsk, vma, start, &foll_flags,
- locked);
+ ret = faultin_page(vma, start, &foll_flags, locked);
switch (ret) {
case 0:
goto retry;
@@ -1185,8 +1176,6 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
/**
* fixup_user_fault() - manually resolve a user page fault
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
@@ -1214,7 +1203,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
* This function will not return with an unlocked mmap_lock. So it has not the
* same semantics wrt the @mm->mmap_lock as does filemap_fault().
*/
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked)
{
@@ -1238,7 +1227,7 @@ retry:
fatal_signal_pending(current))
return -EINTR;
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1255,12 +1244,6 @@ retry:
goto retry;
}
- if (tsk) {
- if (major)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
@@ -1269,8 +1252,7 @@ EXPORT_SYMBOL_GPL(fixup_user_fault);
* Please note that this function, unlike __get_user_pages will not
* return 0 for nr_pages > 0 without FOLL_NOWAIT
*/
-static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1303,7 +1285,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
pages_done = 0;
lock_dropped = false;
for (;;) {
- ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+ ret = __get_user_pages(mm, start, nr_pages, flags, pages,
vmas, locked);
if (!locked)
/* VM_FAULT_RETRY couldn't trigger, bypass */
@@ -1363,7 +1345,7 @@ retry:
}
*locked = 1;
- ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+ ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
pages, NULL, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
@@ -1450,7 +1432,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
}
@@ -1534,7 +1516,7 @@ struct page *get_dump_page(unsigned long addr)
struct vm_area_struct *vma;
struct page *page;
- if (__get_user_pages(current, current->mm, addr, 1,
+ if (__get_user_pages(current->mm, addr, 1,
FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
NULL) < 1)
return NULL;
@@ -1543,8 +1525,7 @@ struct page *get_dump_page(unsigned long addr)
}
#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
-static long __get_user_pages_locked(struct task_struct *tsk,
- struct mm_struct *mm, unsigned long start,
+static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
struct vm_area_struct **vmas, int *locked,
unsigned int foll_flags)
@@ -1609,59 +1590,7 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
}
#ifdef CONFIG_CMA
-static struct page *new_non_cma_page(struct page *page, unsigned long private)
-{
- /*
- * We want to make sure we allocate the new page from the same node
- * as the source page.
- */
- int nid = page_to_nid(page);
- /*
- * Trying to allocate a page for migration. Ignore allocation
- * failure warnings. We don't force __GFP_THISNODE here because
- * this node here is the node where we have CMA reservation and
- * in some case these nodes will have really less non movable
- * allocation memory.
- */
- gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
-#ifdef CONFIG_HUGETLB_PAGE
- if (PageHuge(page)) {
- struct hstate *h = page_hstate(page);
- /*
- * We don't want to dequeue from the pool because pool pages will
- * mostly be from the CMA region.
- */
- return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
- }
-#endif
- if (PageTransHuge(page)) {
- struct page *thp;
- /*
- * ignore allocation failure warnings
- */
- gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
-
- /*
- * Remove the movable mask so that we don't allocate from
- * CMA area again.
- */
- thp_gfpmask &= ~__GFP_MOVABLE;
- thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- }
-
- return __alloc_pages_node(nid, gfp_mask, 0);
-}
-
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
- struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1674,6 +1603,10 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
bool migrate_allow = true;
LIST_HEAD(cma_page_list);
long ret = nr_pages;
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
+ };
check_again:
for (i = 0; i < nr_pages;) {
@@ -1719,8 +1652,8 @@ check_again:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
- if (migrate_pages(&cma_page_list, new_non_cma_page,
- NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+ if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
/*
* some of the pages failed migration. Do get_user_pages
* without migration.
@@ -1735,7 +1668,7 @@ check_again:
* again migrating any new CMA pages which we failed to isolate
* earlier.
*/
- ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ ret = __get_user_pages_locked(mm, start, nr_pages,
pages, vmas, NULL,
gup_flags);
@@ -1749,8 +1682,7 @@ check_again:
return ret;
}
#else
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
- struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1765,8 +1697,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
* allows us to process the FOLL_LONGTERM flag.
*/
-static long __gup_longterm_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1791,11 +1722,10 @@ static long __gup_longterm_locked(struct task_struct *tsk,
flags = memalloc_nocma_save();
}
- rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages,
vmas_tmp, NULL, gup_flags);
if (gup_flags & FOLL_LONGTERM) {
- memalloc_nocma_restore(flags);
if (rc < 0)
goto out;
@@ -1806,32 +1736,31 @@ static long __gup_longterm_locked(struct task_struct *tsk,
goto out;
}
- rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+ rc = check_and_migrate_cma_pages(mm, start, rc, pages,
vmas_tmp, gup_flags);
+out:
+ memalloc_nocma_restore(flags);
}
-out:
if (vmas_tmp != vmas)
kfree(vmas_tmp);
return rc;
}
#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
-static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
unsigned int flags)
{
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
NULL, flags);
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
#ifdef CONFIG_MMU
-static long __get_user_pages_remote(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1850,20 +1779,18 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* This will check the vmas (even if our vmas arg is NULL)
* and return -ENOTSUPP if DAX isn't allowed in this case:
*/
- return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
vmas, gup_flags | FOLL_TOUCH |
FOLL_REMOTE);
}
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
/**
* get_user_pages_remote() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -1922,7 +1849,7 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* should use get_user_pages_remote because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1934,13 +1861,13 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(get_user_pages_remote);
#else /* CONFIG_MMU */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1948,8 +1875,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
return 0;
}
-static long __get_user_pages_remote(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1969,11 +1895,10 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
+ * This is the same as get_user_pages_remote(), just with a less-flexible
+ * calling convention where we assume that the mm being operated on belongs to
+ * the current task, and doesn't allow passing of a locked parameter. We also
+ * obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -1986,7 +1911,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1996,7 +1921,7 @@ EXPORT_SYMBOL(get_user_pages);
*
* mmap_read_lock(mm);
* do_something()
- * get_user_pages(tsk, mm, ..., pages, NULL);
+ * get_user_pages(mm, ..., pages, NULL);
* mmap_read_unlock(mm);
*
* to:
@@ -2004,7 +1929,7 @@ EXPORT_SYMBOL(get_user_pages);
* int locked = 1;
* mmap_read_lock(mm);
* do_something()
- * get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ * get_user_pages_locked(mm, ..., pages, &locked);
* if (locked)
* mmap_read_unlock(mm);
*
@@ -2042,7 +1967,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
@@ -2052,12 +1977,12 @@ EXPORT_SYMBOL(get_user_pages_locked);
* get_user_pages_unlocked() is suitable to replace the form:
*
* mmap_read_lock(mm);
- * get_user_pages(tsk, mm, ..., pages, NULL);
+ * get_user_pages(mm, ..., pages, NULL);
* mmap_read_unlock(mm);
*
* with:
*
- * get_user_pages_unlocked(tsk, mm, ..., pages);
+ * get_user_pages_unlocked(mm, ..., pages);
*
* It is functionally equivalent to get_user_pages_fast so
* get_user_pages_fast should be used instead if specific gup_flags
@@ -2080,7 +2005,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
return -EINVAL;
mmap_read_lock(mm);
- ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
if (locked)
mmap_read_unlock(mm);
@@ -2725,7 +2650,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
*/
if (gup_flags & FOLL_LONGTERM) {
mmap_read_lock(current->mm);
- ret = __gup_longterm_locked(current, current->mm,
+ ret = __gup_longterm_locked(current->mm,
start, nr_pages,
pages, NULL, gup_flags);
mmap_read_unlock(current->mm);
@@ -2968,10 +2893,8 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
/**
- * pin_user_pages_remote() - pin pages of a remote process (task != current)
+ * pin_user_pages_remote() - pin pages of a remote process
*
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -2992,7 +2915,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
*/
-long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -3002,7 +2925,7 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
@@ -3034,7 +2957,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
@@ -3079,7 +3002,7 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
diff --git a/mm/hmm.c b/mm/hmm.c
index 0809baee49d0..943cb2ba4442 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -75,7 +75,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
}
for (; addr < end; addr += PAGE_SIZE)
- if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
+ if (handle_mm_fault(vma, addr, fault_flags, NULL) &
+ VM_FAULT_ERROR)
return -EFAULT;
return -EBUSY;
}
@@ -249,7 +250,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
swp_entry_t entry = pte_to_swp_entry(pte);
/*
- * Never fault in device private pages pages, but just report
+ * Never fault in device private pages, but just report
* the PFN even if not present.
*/
if (hmm_is_device_private_entry(range, entry)) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 206f52b36ffb..2ccff8472cd4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -303,24 +303,6 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
-#ifdef CONFIG_DEBUG_VM
-static ssize_t debug_cow_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static ssize_t debug_cow_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- return single_hugepage_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static struct kobj_attribute debug_cow_attr =
- __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
-#endif /* CONFIG_DEBUG_VM */
-
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
@@ -329,9 +311,6 @@ static struct attribute *hugepage_attr[] = {
#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
#endif
-#ifdef CONFIG_DEBUG_VM
- &debug_cow_attr.attr,
-#endif
NULL,
};
@@ -640,7 +619,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e52c878940bb..a301c2d672bf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -19,6 +19,7 @@
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
@@ -133,7 +134,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
/*
* Subpool accounting for allocating and reserving pages.
* Return -ENOMEM if there are not enough resources to satisfy the
- * the request. Otherwise, return the number of pages by which the
+ * request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
* a subpool minimum size must be maintained.
@@ -1040,10 +1041,16 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
+ bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
+ if (nocma && is_migrate_cma_page(page))
+ continue;
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
if (!PageHWPoison(page))
break;
+ }
+
/*
* if 'non-isolated free hugepage' not found on the list,
* the allocation fails.
@@ -1093,15 +1100,6 @@ retry_cpuset:
return NULL;
}
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
- if (hugepage_movable_supported(h))
- return GFP_HIGHUSER_MOVABLE;
- else
- return GFP_HIGHUSER;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
@@ -1944,7 +1942,7 @@ out_unlock:
return page;
}
-struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
@@ -1986,31 +1984,9 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
}
/* page migration callback function */
-struct page *alloc_huge_page_node(struct hstate *h, int nid)
-{
- gfp_t gfp_mask = htlb_alloc_mask(h);
- struct page *page = NULL;
-
- if (nid != NUMA_NO_NODE)
- gfp_mask |= __GFP_THISNODE;
-
- spin_lock(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
- spin_unlock(&hugetlb_lock);
-
- if (!page)
- page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
-
- return page;
-}
-
-/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask)
+ nodemask_t *nmask, gfp_t gfp_mask)
{
- gfp_t gfp_mask = htlb_alloc_mask(h);
-
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
@@ -2038,7 +2014,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
return page;
@@ -2167,7 +2143,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the the freed pages across the
+ * free_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*
* Note that we decrement resv_huge_pages as we free the pages. If
@@ -3458,13 +3434,21 @@ static int __init default_hugepagesz_setup(char *s)
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
-static unsigned int cpuset_mems_nr(unsigned int *array)
+static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
+ nodemask_t *mpol_allowed;
+ unsigned int *array = h->free_huge_pages_node;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
- for_each_node_mask(node, cpuset_current_mems_allowed)
- nr += array[node];
+ mpol_allowed = policy_nodemask_current(gfp_mask);
+
+ for_each_node_mask(node, cpuset_current_mems_allowed) {
+ if (!mpol_allowed ||
+ (mpol_allowed && node_isset(node, *mpol_allowed)))
+ nr += array[node];
+ }
return nr;
}
@@ -3643,12 +3627,18 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
* we fall back to check against current free page availability as
* a best attempt and hopefully to minimize the impact of changing
* semantics that cpuset has.
+ *
+ * Apart from cpuset, we also have memory policy mechanism that
+ * also determines from which node the kernel will allocate memory
+ * in a NUMA system. So similar to cpuset, we also should consider
+ * the memory policy of the current task. Similar to the description
+ * above.
*/
if (delta > 0) {
if (gather_surplus_pages(h, delta) < 0)
goto out;
- if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ if (delta > allowed_mems_nr(h)) {
return_unused_surplus_pages(h, delta);
goto out;
}
@@ -3953,7 +3943,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
spin_unlock(ptl);
/*
* We just unmapped a page of PMDs by clearing a PUD.
@@ -4540,10 +4530,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
/*
@@ -5020,7 +5006,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
pages++;
spin_unlock(ptl);
shared_pmd = true;
@@ -5401,12 +5387,14 @@ out:
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
p4d_t *p4d = p4d_offset(pgd, *addr);
pud_t *pud = pud_offset(p4d, *addr);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -5424,7 +5412,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
return 0;
}
@@ -5694,12 +5683,14 @@ void __init hugetlb_cma_reserve(int order)
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
+ char name[20];
size = min(per_node, hugetlb_cma_size - reserved);
size = round_up(size, PAGE_SIZE << order);
+ snprintf(name, 20, "hugetlb%d", nid);
res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
- 0, false, "hugetlb",
+ 0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
diff --git a/mm/internal.h b/mm/internal.h
index 9886db20d94f..d11a9a8d2135 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -239,6 +239,7 @@ struct compact_control {
bool no_set_skip_hint; /* Don't mark blocks for skipping */
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
+ bool proactive_compaction; /* kcompactd proactive compaction */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
bool rescan; /* Rescanning the same pageblock */
@@ -612,5 +613,11 @@ static inline bool is_migrate_highatomic_page(struct page *page)
}
void setup_zone_pageset(struct zone *zone);
-extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+
+struct migration_target_control {
+ int nid; /* preferred node id */
+ nodemask_t *nmask;
+ gfp_t gfp_mask;
+};
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b52bd46ad146..15a9af791014 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1173,7 +1173,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
diff --git a/mm/ksm.c b/mm/ksm.c
index 217842a66912..0aa2247bddd7 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -480,7 +480,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ NULL);
else
ret = VM_FAULT_WRITE;
put_page(page);
diff --git a/mm/maccess.c b/mm/maccess.c
index f98ff91e32c6..3bd70405f2d8 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -205,15 +205,14 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs = force_uaccess_begin();
- set_fs(USER_DS);
if (access_ok(src, size)) {
pagefault_disable();
ret = __copy_from_user_inatomic(dst, src, size);
pagefault_enable();
}
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -233,15 +232,14 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault);
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs = force_uaccess_begin();
- set_fs(USER_DS);
if (access_ok(dst, size)) {
pagefault_disable();
ret = __copy_to_user_inatomic(dst, src, size);
pagefault_enable();
}
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -270,17 +268,17 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault);
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count)
{
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs;
long ret;
if (unlikely(count <= 0))
return 0;
- set_fs(USER_DS);
+ old_fs = force_uaccess_begin();
pagefault_disable();
ret = strncpy_from_user(dst, unsafe_addr, count);
pagefault_enable();
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
if (ret >= count) {
ret = count;
@@ -310,14 +308,14 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
*/
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs;
int ret;
- set_fs(USER_DS);
+ old_fs = force_uaccess_begin();
pagefault_disable();
ret = strnlen_user(unsafe_addr, count);
pagefault_enable();
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
return ret;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8d9ceea7fe4d..d59fd9af6e63 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -781,7 +781,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
if (mem_cgroup_disabled())
return;
- if (vmstat_item_in_bytes(idx))
+ if (memcg_stat_item_in_bytes(idx))
threshold <<= PAGE_SHIFT;
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
@@ -1488,6 +1488,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
seq_buf_printf(&s, "slab %llu\n",
(u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
+ seq_buf_printf(&s, "percpu %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
seq_buf_printf(&s, "sock %llu\n",
(u64)memcg_page_state(memcg, MEMCG_SOCK) *
PAGE_SIZE);
@@ -1528,12 +1530,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
memcg_events(memcg, PGMAJFAULT));
- seq_buf_printf(&s, "workingset_refault %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT));
- seq_buf_printf(&s, "workingset_activate %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_refault_anon %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
+ seq_buf_printf(&s, "workingset_refault_file %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
+ seq_buf_printf(&s, "workingset_activate_anon %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
+ seq_buf_printf(&s, "workingset_activate_file %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
+ seq_buf_printf(&s, "workingset_restore %lu\n",
+ memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
seq_buf_printf(&s, "workingset_restore %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE));
+ memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
@@ -2414,7 +2422,7 @@ static void high_work_func(struct work_struct *work)
*
* - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
* overage ratio to a delay.
- * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
* proposed penalty in order to reduce to a reasonable number of jiffies, and
* to produce a reasonable delay curve.
*
@@ -5129,13 +5137,18 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+ /* We charge the parent cgroup, never the current task */
+ WARN_ON_ONCE(!current->active_memcg);
+
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_local) {
kfree(pn);
return 1;
}
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_cpu) {
free_percpu(pn->lruvec_stat_local);
kfree(pn);
@@ -5209,11 +5222,16 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
}
- memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+ /* We charge the parent cgroup, never the current task */
+ WARN_ON_ONCE(!current->active_memcg);
+
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_local)
goto fail;
- memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_percpu)
goto fail;
@@ -5262,7 +5280,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
struct mem_cgroup *memcg;
long error = -ENOMEM;
+ memalloc_use_memcg(parent);
memcg = mem_cgroup_alloc();
+ memalloc_unuse_memcg();
if (IS_ERR(memcg))
return ERR_CAST(memcg);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b8ccb1fb9b..f1aa6433f404 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1648,9 +1648,12 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private)
{
- int nid = page_to_nid(p);
+ struct migration_target_control mtc = {
+ .nid = page_to_nid(p),
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
- return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
+ return alloc_migration_target(p, (unsigned long)&mtc);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index c39a13b09602..228efaca75d3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -71,6 +71,8 @@
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
#include <trace/events/kmem.h>
@@ -1800,7 +1802,7 @@ out_unlock:
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vmf_insert_pfn(), except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
@@ -1936,7 +1938,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vmf_insert_mixed(), except that it allows drivers to
+ * This is exactly like vmf_insert_mixed(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* Typically this function should be used by drivers to set caching- and
@@ -2715,7 +2717,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -3098,6 +3100,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
int locked;
int exclusive = 0;
vm_fault_t ret = 0;
+ void *shadow = NULL;
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
@@ -3149,13 +3152,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_page;
}
- /*
- * XXX: Move to lru_cache_add() when it
- * supports new vs putback
- */
- spin_lock_irq(&page_pgdat(page)->lru_lock);
- lru_note_cost_page(page);
- spin_unlock_irq(&page_pgdat(page)->lru_lock);
+ shadow = get_shadow_from_swap_cache(entry);
+ if (shadow)
+ workingset_refault(page, shadow);
lru_cache_add(page);
swap_readpage(page, true);
@@ -3266,10 +3265,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
- activate_page(page);
}
swap_free(entry);
@@ -3414,7 +3412,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3672,7 +3670,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -4360,6 +4358,67 @@ retry_pud:
return handle_pte_fault(&vmf);
}
+/**
+ * mm_account_fault - Do page fault accountings
+ *
+ * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
+ * of perf event counters, but we'll still do the per-task accounting to
+ * the task who triggered this page fault.
+ * @address: the faulted address.
+ * @flags: the fault flags.
+ * @ret: the fault retcode.
+ *
+ * This will take care of most of the page fault accountings. Meanwhile, it
+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
+ * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * still be in per-arch page fault handlers at the entry of page fault.
+ */
+static inline void mm_account_fault(struct pt_regs *regs,
+ unsigned long address, unsigned int flags,
+ vm_fault_t ret)
+{
+ bool major;
+
+ /*
+ * We don't do accounting for some specific faults:
+ *
+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
+ * includes arch_vma_access_permitted() failing before reaching here.
+ * So this is not a "this many hardware page faults" counter. We
+ * should use the hw profiling for that.
+ *
+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
+ * once they're completed.
+ */
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+ return;
+
+ /*
+ * We define the fault as a major fault when the final successful fault
+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
+ * handle it immediately previously).
+ */
+ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
+
+ if (major)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+
+ /*
+ * If the fault is done for GUP, regs will be NULL. We only do the
+ * accounting for the per thread fault counters who triggered the
+ * fault, and we skip the perf event updates.
+ */
+ if (!regs)
+ return;
+
+ if (major)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ else
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -4367,7 +4426,7 @@ retry_pud:
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+ unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
@@ -4408,6 +4467,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
+ mm_account_fault(regs, address, flags, ret);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -4681,7 +4742,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
- ret = get_user_pages_remote(tsk, mm, addr, 1,
+ ret = get_user_pages_remote(mm, addr, 1,
gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ac6961abaa10..c32ead89c911 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -350,6 +350,16 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
return err;
}
+#ifdef CONFIG_NUMA
+int __weak memory_add_physaddr_to_nid(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
@@ -844,8 +854,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
+ zone_pcp_update(zone);
init_per_zone_wmark_min();
@@ -1267,19 +1276,23 @@ found:
static struct page *new_node_page(struct page *page, unsigned long private)
{
- int nid = page_to_nid(page);
nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nid = page_to_nid(page),
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* try to allocate from a different node but reuse this node if there
* are no other online nodes to be used (e.g. we are offlining a part
* of the only existing node)
*/
- node_clear(nid, nmask);
+ node_clear(mtc.nid, nmask);
if (nodes_empty(nmask))
- node_set(nid, nmask);
+ node_set(mtc.nid, nmask);
- return new_page_nodemask(page, nid, &nmask);
+ return alloc_migration_target(page, (unsigned long)&mtc);
}
static int
@@ -1747,7 +1760,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
*/
rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
if (rc)
- goto done;
+ return rc;
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1771,9 +1784,8 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
try_offline_node(nid);
-done:
mem_hotplug_done();
- return rc;
+ return 0;
}
/**
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b9e85d467352..afaa09ff9f6c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -129,7 +129,7 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/**
* numa_map_to_online_node - Find closest online node
- * @nid: Node id to start the search
+ * @node: Node id to start the search
*
* Lookup the next closest node by distance if @nid is not online.
*/
@@ -1065,27 +1065,6 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
return 0;
}
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- else if (PageTransHuge(page)) {
- struct page *thp;
-
- thp = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- } else
- return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE, 0);
-}
-
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
@@ -1096,6 +1075,10 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct migration_target_control mtc = {
+ .nid = dest,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
nodes_clear(nmask);
node_set(source, nmask);
@@ -1110,8 +1093,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1632,11 +1615,11 @@ static int kernel_get_mempolicy(int __user *policy,
int pval;
nodemask_t nodes;
- addr = untagged_addr(addr);
-
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
+ addr = untagged_addr(addr);
+
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
@@ -1890,7 +1873,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
*/
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
diff --git a/mm/migrate.c b/mm/migrate.c
index d179657f8685..5053439be6ab 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1418,22 +1418,35 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
enum migrate_mode mode, int reason)
{
int retry = 1;
+ int thp_retry = 1;
int nr_failed = 0;
int nr_succeeded = 0;
+ int nr_thp_succeeded = 0;
+ int nr_thp_failed = 0;
+ int nr_thp_split = 0;
int pass = 0;
+ bool is_thp = false;
struct page *page;
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
- int rc;
+ int rc, nr_subpages;
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
- for(pass = 0; pass < 10 && retry; pass++) {
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
+ thp_retry = 0;
list_for_each_entry_safe(page, page2, from, lru) {
retry:
+ /*
+ * THP statistics is based on the source huge page.
+ * Capture required information that might get lost
+ * during migration.
+ */
+ is_thp = PageTransHuge(page);
+ nr_subpages = hpage_nr_pages(page);
cond_resched();
if (PageHuge(page))
@@ -1464,15 +1477,30 @@ retry:
unlock_page(page);
if (!rc) {
list_safe_reset_next(page, page2, lru);
+ nr_thp_split++;
goto retry;
}
}
+ if (is_thp) {
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ goto out;
+ }
nr_failed++;
goto out;
case -EAGAIN:
+ if (is_thp) {
+ thp_retry++;
+ break;
+ }
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ if (is_thp) {
+ nr_thp_succeeded++;
+ nr_succeeded += nr_subpages;
+ break;
+ }
nr_succeeded++;
break;
default:
@@ -1482,19 +1510,27 @@ retry:
* removed from migration page list and not
* retried in the next outer loop.
*/
+ if (is_thp) {
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ break;
+ }
nr_failed++;
break;
}
}
}
- nr_failed += retry;
+ nr_failed += retry + thp_retry;
+ nr_thp_failed += thp_retry;
rc = nr_failed;
out:
- if (nr_succeeded)
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- if (nr_failed)
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+ nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
@@ -1502,6 +1538,49 @@ out:
return rc;
}
+struct page *alloc_migration_target(struct page *page, unsigned long private)
+{
+ struct migration_target_control *mtc;
+ gfp_t gfp_mask;
+ unsigned int order = 0;
+ struct page *new_page = NULL;
+ int nid;
+ int zidx;
+
+ mtc = (struct migration_target_control *)private;
+ gfp_mask = mtc->gfp_mask;
+ nid = mtc->nid;
+ if (nid == NUMA_NO_NODE)
+ nid = page_to_nid(page);
+
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(compound_head(page));
+
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ }
+
+ if (PageTransHuge(page)) {
+ /*
+ * clear __GFP_RECLAIM to make the migration callback
+ * consistent with regular THP allocations.
+ */
+ gfp_mask &= ~__GFP_RECLAIM;
+ gfp_mask |= GFP_TRANSHUGE;
+ order = HPAGE_PMD_ORDER;
+ }
+ zidx = zone_idx(page_zone(page));
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
+ gfp_mask |= __GFP_HIGHMEM;
+
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+
+ if (new_page && PageTransHuge(new_page))
+ prep_transhuge_page(new_page);
+
+ return new_page;
+}
+
#ifdef CONFIG_NUMA
static int store_status(int __user *status, int start, int value, int nr)
@@ -1519,9 +1598,13 @@ static int do_move_pages_to_node(struct mm_struct *mm,
struct list_head *pagelist, int node)
{
int err;
+ struct migration_target_control mtc = {
+ .nid = node,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(pagelist);
return err;
@@ -2168,6 +2251,16 @@ static int migrate_vma_collect_hole(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma)) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ }
+ return 0;
+ }
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
@@ -2260,8 +2353,10 @@ again:
pte = *ptep;
if (pte_none(pte)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
goto next;
}
@@ -2619,7 +2714,7 @@ restore:
/**
* migrate_vma_setup() - prepare to migrate a range of memory
- * @args: contains the vma, start, and and pfns arrays for the migration
+ * @args: contains the vma, start, and pfns arrays for the migration
*
* Returns: negative errno on failures, 0 when 0 or more pages were migrated
* without an error.
@@ -2830,7 +2925,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
if (!is_zone_device_page(page))
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
get_page(page);
if (flush) {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 352bb9f3ecc0..4fc918163dd3 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -166,7 +166,7 @@ static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
/**
* mmu_interval_read_begin - Begin a read side critical section against a VA
* range
- * interval_sub: The interval subscription
+ * @interval_sub: The interval subscription
*
* mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a
* collision-retry scheme similar to seqcount for the VA range under
@@ -686,7 +686,7 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register);
/**
* mmu_notifier_register - Register a notifier on a mm
- * @mn: The notifier to attach
+ * @subscription: The notifier to attach
* @mm: The mm to attach the notifier to
*
* Must not hold mmap_lock nor any other VM related lock when calling
@@ -856,7 +856,7 @@ static void mmu_notifier_free_rcu(struct rcu_head *rcu)
/**
* mmu_notifier_put - Release the reference on the notifier
- * @mn: The notifier to act on
+ * @subscription: The notifier to act on
*
* This function must be paired with each mmu_notifier_get(), it releases the
* reference obtained by the get. If this is the last reference then process
@@ -965,7 +965,8 @@ static int __mmu_interval_notifier_insert(
* @interval_sub: Interval subscription to register
* @start: Starting virtual address to monitor
* @length: Length of the range to monitor
- * @mm : mm_struct to attach to
+ * @mm: mm_struct to attach to
+ * @ops: Interval notifier operations to be called on matching events
*
* This function subscribes the interval notifier for notifications from the
* mm. Upon return the ops related to mmu_interval_notifier will be called
diff --git a/mm/nommu.c b/mm/nommu.c
index 340ae7774c13..75a327149af1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1762,8 +1762,8 @@ EXPORT_SYMBOL_GPL(access_process_vm);
* @newsize: The proposed filesize of the inode
*
* Check the shared mappings on an inode on behalf of a shrinking truncate to
- * make sure that that any outstanding VMAs aren't broken and then shrink the
- * vm_regions that extend that beyond so that do_mmap() doesn't
+ * make sure that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend beyond so that do_mmap() doesn't
* automatically grant mappings that are too large.
*/
int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d30ce75f23fb..e90f25d6385d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,17 +196,17 @@ static bool is_dump_unreclaim_slabs(void)
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, unsigned long totalpages)
{
long points;
long adj;
if (oom_unkillable_task(p))
- return 0;
+ return LONG_MIN;
p = find_lock_task_mm(p);
if (!p)
- return 0;
+ return LONG_MIN;
/*
* Do not even consider tasks which are explicitly marked oom
@@ -218,7 +218,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
in_vfork(p)) {
task_unlock(p);
- return 0;
+ return LONG_MIN;
}
/*
@@ -233,11 +233,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
adj *= totalpages / 1000;
points += adj;
- /*
- * Never return 0 for an eligible task regardless of the root bonus and
- * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
- */
- return points > 0 ? points : 1;
+ return points;
}
static const char * const oom_constraint_text[] = {
@@ -310,7 +306,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
- unsigned long points;
+ long points;
if (oom_unkillable_task(task))
goto next;
@@ -336,12 +332,12 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
* killed first if it triggers an oom, then select it.
*/
if (oom_task_origin(task)) {
- points = ULONG_MAX;
+ points = LONG_MAX;
goto select;
}
points = oom_badness(task, oc->totalpages);
- if (!points || points < oc->chosen_points)
+ if (points == LONG_MIN || points < oc->chosen_points)
goto next;
select:
@@ -365,6 +361,8 @@ abort:
*/
static void select_bad_process(struct oom_control *oc)
{
+ oc->chosen_points = LONG_MIN;
+
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
else {
@@ -863,6 +861,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
p = find_lock_task_mm(victim);
if (!p) {
+ pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
+ message, task_pid_nr(victim), victim->comm);
put_task_struct(victim);
return;
} else if (victim != p) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 167732f4d124..8b7d0ecf30b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4282,7 +4282,7 @@ retry:
/*
* If an allocation failed after direct reclaim, it could be because
* pages are pinned on the per-cpu lists or in high alloc reserves.
- * Shrink them them and try again
+ * Shrink them and try again
*/
if (!page && !drained) {
unreserve_highatomic_pageblock(ac, false);
@@ -6192,7 +6192,7 @@ static int zone_batchsize(struct zone *zone)
* locking.
*
* Any new users of pcp->batch and pcp->high should ensure they can cope with
- * those fields changing asynchronously (acording the the above rule).
+ * those fields changing asynchronously (acording to the above rule).
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
@@ -8203,7 +8203,7 @@ void *__init alloc_large_system_hash(const char *tablename,
* race condition. So you can't expect this function should be exact.
*
* Returns a page without holding a reference. If the caller wants to
- * dereference that page (e.g., dumping), it has to make sure that that it
+ * dereference that page (e.g., dumping), it has to make sure that it
* cannot get removed (e.g., via memory unplug) concurrently.
*
*/
@@ -8347,6 +8347,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
+ struct migration_target_control mtc = {
+ .nid = zone_to_nid(cc->zone),
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
migrate_prep();
@@ -8373,8 +8377,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f6d07c5f0d34..242c03121d73 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -306,8 +306,3 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
return pfn < end_pfn ? -EBUSY : 0;
}
-
-struct page *alloc_migrate_target(struct page *page, unsigned long private)
-{
- return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
-}
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 0468ba500bd4..18b768ac7dca 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -6,6 +6,25 @@
#include <linux/percpu.h>
/*
+ * There are two chunk types: root and memcg-aware.
+ * Chunks of each type have separate slots list.
+ *
+ * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
+ * used to store memcg membership data of a percpu object. Obj_cgroups are
+ * ref-counted pointers to a memory cgroup with an ability to switch dynamically
+ * to the parent memory cgroup. This allows to reclaim a deleted memory cgroup
+ * without reclaiming of all outstanding objects, which hold a reference at it.
+ */
+enum pcpu_chunk_type {
+ PCPU_CHUNK_ROOT,
+#ifdef CONFIG_MEMCG_KMEM
+ PCPU_CHUNK_MEMCG,
+#endif
+ PCPU_NR_CHUNK_TYPES,
+ PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
+};
+
+/*
* pcpu_block_md is the metadata block struct.
* Each chunk's bitmap is split into a number of full blocks.
* All units are in terms of bits.
@@ -54,6 +73,9 @@ struct pcpu_chunk {
int end_offset; /* additional area required to
have the region end page
aligned */
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup **obj_cgroups; /* vector of object cgroups */
+#endif
int nr_pages; /* # of pages served by this chunk */
int nr_populated; /* # of populated pages */
@@ -63,7 +85,7 @@ struct pcpu_chunk {
extern spinlock_t pcpu_lock;
-extern struct list_head *pcpu_slot;
+extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_nr_empty_pop_pages;
@@ -106,6 +128,37 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
+#ifdef CONFIG_MEMCG_KMEM
+static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+{
+ if (chunk->obj_cgroups)
+ return PCPU_CHUNK_MEMCG;
+ return PCPU_CHUNK_ROOT;
+}
+
+static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
+{
+ return chunk_type == PCPU_CHUNK_MEMCG;
+}
+
+#else
+static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+{
+ return PCPU_CHUNK_ROOT;
+}
+
+static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
+{
+ return false;
+}
+#endif
+
+static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
+{
+ return &pcpu_chunk_lists[pcpu_nr_slots *
+ pcpu_is_memcg_chunk(chunk_type)];
+}
+
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 20d2b69a13b0..35c9941077ee 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -44,7 +44,8 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
/* nada */
}
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
@@ -52,7 +53,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
unsigned long flags;
int i;
- chunk = pcpu_alloc_chunk(gfp);
+ chunk = pcpu_alloc_chunk(type, gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 32558063c3f9..c8400a2adbc2 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -34,11 +34,15 @@ static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
+ enum pcpu_chunk_type type;
max_nr_alloc = 0;
- for (slot = 0; slot < pcpu_nr_slots; slot++)
- list_for_each_entry(chunk, &pcpu_slot[slot], list)
- max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
+ list)
+ max_nr_alloc = max(max_nr_alloc,
+ chunk->nr_alloc);
return max_nr_alloc;
}
@@ -129,6 +133,9 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("cur_min_alloc", cur_min_alloc);
P("cur_med_alloc", cur_med_alloc);
P("cur_max_alloc", cur_max_alloc);
+#ifdef CONFIG_MEMCG_KMEM
+ P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
+#endif
seq_putc(m, '\n');
}
@@ -137,6 +144,7 @@ static int percpu_stats_show(struct seq_file *m, void *v)
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
int *buffer;
+ enum pcpu_chunk_type type;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
@@ -202,18 +210,18 @@ alloc_buffer:
chunk_map_stats(m, pcpu_reserved_chunk, buffer);
}
- for (slot = 0; slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
- if (chunk == pcpu_first_chunk) {
- seq_puts(m, "Chunk: <- First Chunk\n");
- chunk_map_stats(m, chunk, buffer);
-
-
- } else {
- seq_puts(m, "Chunk:\n");
- chunk_map_stats(m, chunk, buffer);
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
+ for (slot = 0; slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
+ list) {
+ if (chunk == pcpu_first_chunk) {
+ seq_puts(m, "Chunk: <- First Chunk\n");
+ chunk_map_stats(m, chunk, buffer);
+ } else {
+ seq_puts(m, "Chunk:\n");
+ chunk_map_stats(m, chunk, buffer);
+ }
}
-
}
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index a2b395acef89..e46f7a6917f9 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -328,12 +328,13 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
pcpu_free_pages(chunk, pages, page_start, page_end);
}
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
- chunk = pcpu_alloc_chunk(gfp);
+ chunk = pcpu_alloc_chunk(type, gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index b626766160ce..f4709629e6de 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -37,9 +37,14 @@
* takes care of normal allocations.
*
* The allocator organizes chunks into lists according to free size and
- * tries to allocate from the fullest chunk first. Each chunk is managed
- * by a bitmap with metadata blocks. The allocation map is updated on
- * every allocation and free to reflect the current state while the boundary
+ * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT
+ * flag should be passed. All memcg-aware allocations are sharing one set
+ * of chunks and all unaccounted allocations and allocations performed
+ * by processes belonging to the root memory cgroup are using the second set.
+ *
+ * The allocator tries to allocate from the fullest chunk first. Each chunk
+ * is managed by a bitmap with metadata blocks. The allocation map is updated
+ * on every allocation and free to reflect the current state while the boundary
* map is only updated on allocation. Each metadata block contains
* information to help mitigate the need to iterate over large portions
* of the bitmap. The reverse mapping from page to chunk is stored in
@@ -81,6 +86,7 @@
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/memcontrol.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
@@ -160,7 +166,7 @@ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
-struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
+struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);
@@ -500,6 +506,9 @@ static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
bool move_front)
{
if (chunk != pcpu_reserved_chunk) {
+ struct list_head *pcpu_slot;
+
+ pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
if (move_front)
list_move(&chunk->list, &pcpu_slot[slot]);
else
@@ -1211,11 +1220,14 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
*
* This function determines the size of an allocation to free using
* the boundary bitmap and clears the allocation map.
+ *
+ * RETURNS:
+ * Number of freed bytes.
*/
-static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
+static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
- int bit_off, bits, end, oslot;
+ int bit_off, bits, end, oslot, freed;
lockdep_assert_held(&pcpu_lock);
pcpu_stats_area_dealloc(chunk);
@@ -1230,8 +1242,10 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
bits = end - bit_off;
bitmap_clear(chunk->alloc_map, bit_off, bits);
+ freed = bits * PCPU_MIN_ALLOC_SIZE;
+
/* update metadata */
- chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
+ chunk->free_bytes += freed;
/* update first free bit */
chunk_md->first_free = min(chunk_md->first_free, bit_off);
@@ -1239,6 +1253,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
pcpu_block_update_hint_free(chunk, bit_off, bits);
pcpu_chunk_relocate(chunk, oslot);
+
+ return freed;
}
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
@@ -1334,6 +1350,10 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
+#ifdef CONFIG_MEMCG_KMEM
+ /* first chunk isn't memcg-aware */
+ chunk->obj_cgroups = NULL;
+#endif
pcpu_init_md_blocks(chunk);
/* manage populated page bitmap */
@@ -1373,7 +1393,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
@@ -1401,6 +1421,16 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
if (!chunk->md_blocks)
goto md_blocks_fail;
+#ifdef CONFIG_MEMCG_KMEM
+ if (pcpu_is_memcg_chunk(type)) {
+ chunk->obj_cgroups =
+ pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
+ sizeof(struct obj_cgroup *), gfp);
+ if (!chunk->obj_cgroups)
+ goto objcg_fail;
+ }
+#endif
+
pcpu_init_md_blocks(chunk);
/* init metadata */
@@ -1408,6 +1438,10 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
return chunk;
+#ifdef CONFIG_MEMCG_KMEM
+objcg_fail:
+ pcpu_mem_free(chunk->md_blocks);
+#endif
md_blocks_fail:
pcpu_mem_free(chunk->bound_map);
bound_map_fail:
@@ -1422,6 +1456,9 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
+#ifdef CONFIG_MEMCG_KMEM
+ pcpu_mem_free(chunk->obj_cgroups);
+#endif
pcpu_mem_free(chunk->md_blocks);
pcpu_mem_free(chunk->bound_map);
pcpu_mem_free(chunk->alloc_map);
@@ -1498,7 +1535,8 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end);
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1540,6 +1578,87 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
+#ifdef CONFIG_MEMCG_KMEM
+static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+ struct obj_cgroup **objcgp)
+{
+ struct obj_cgroup *objcg;
+
+ if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) ||
+ memcg_kmem_bypass())
+ return PCPU_CHUNK_ROOT;
+
+ objcg = get_obj_cgroup_from_current();
+ if (!objcg)
+ return PCPU_CHUNK_ROOT;
+
+ if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
+ obj_cgroup_put(objcg);
+ return PCPU_FAIL_ALLOC;
+ }
+
+ *objcgp = objcg;
+ return PCPU_CHUNK_MEMCG;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+ if (!objcg)
+ return;
+
+ if (chunk) {
+ chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ size * num_possible_cpus());
+ rcu_read_unlock();
+ } else {
+ obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+ obj_cgroup_put(objcg);
+ }
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+ struct obj_cgroup *objcg;
+
+ if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
+ return;
+
+ objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+ chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
+
+ obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ -(size * num_possible_cpus()));
+ rcu_read_unlock();
+
+ obj_cgroup_put(objcg);
+}
+
+#else /* CONFIG_MEMCG_KMEM */
+static enum pcpu_chunk_type
+pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
+{
+ return PCPU_CHUNK_ROOT;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
/**
* pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
@@ -1561,6 +1680,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t pcpu_gfp;
bool is_atomic;
bool do_warn;
+ enum pcpu_chunk_type type;
+ struct list_head *pcpu_slot;
+ struct obj_cgroup *objcg = NULL;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
const char *err;
@@ -1595,16 +1717,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
return NULL;
}
+ type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
+ if (unlikely(type == PCPU_FAIL_ALLOC))
+ return NULL;
+ pcpu_slot = pcpu_chunk_list(type);
+
if (!is_atomic) {
/*
* pcpu_balance_workfn() allocates memory under this mutex,
* and it may wait for memory reclaim. Allow current task
* to become OOM victim, in case of memory pressure.
*/
- if (gfp & __GFP_NOFAIL)
+ if (gfp & __GFP_NOFAIL) {
mutex_lock(&pcpu_alloc_mutex);
- else if (mutex_lock_killable(&pcpu_alloc_mutex))
+ } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
return NULL;
+ }
}
spin_lock_irqsave(&pcpu_lock, flags);
@@ -1659,7 +1788,7 @@ restart:
}
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk(pcpu_gfp);
+ chunk = pcpu_create_chunk(type, pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1716,6 +1845,8 @@ area_found:
trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
chunk->base_addr, off, ptr);
+ pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
+
return ptr;
fail_unlock:
@@ -1737,6 +1868,9 @@ fail:
} else {
mutex_unlock(&pcpu_alloc_mutex);
}
+
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
+
return NULL;
}
@@ -1796,8 +1930,8 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
}
/**
- * pcpu_balance_workfn - manage the amount of free chunks and populated pages
- * @work: unused
+ * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @type: chunk type
*
* Reclaim all fully free chunks except for the first one. This is also
* responsible for maintaining the pool of empty populated pages. However,
@@ -1806,11 +1940,12 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
* allocation causes the failure as it is possible that requests can be
* serviced from already backed regions.
*/
-static void pcpu_balance_workfn(struct work_struct *work)
+static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
{
/* gfp flags passed to underlying allocators */
const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
+ struct list_head *pcpu_slot = pcpu_chunk_list(type);
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
int slot, nr_to_pop, ret;
@@ -1908,7 +2043,7 @@ retry_pop:
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
- chunk = pcpu_create_chunk(gfp);
+ chunk = pcpu_create_chunk(type, gfp);
if (chunk) {
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
@@ -1921,6 +2056,20 @@ retry_pop:
}
/**
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @work: unused
+ *
+ * Call __pcpu_balance_workfn() for each chunk type.
+ */
+static void pcpu_balance_workfn(struct work_struct *work)
+{
+ enum pcpu_chunk_type type;
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ __pcpu_balance_workfn(type);
+}
+
+/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
@@ -1934,8 +2083,9 @@ void free_percpu(void __percpu *ptr)
void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
- int off;
+ int size, off;
bool need_balance = false;
+ struct list_head *pcpu_slot;
if (!ptr)
return;
@@ -1949,7 +2099,11 @@ void free_percpu(void __percpu *ptr)
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
- pcpu_free_area(chunk, off);
+ size = pcpu_free_area(chunk, off);
+
+ pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
+
+ pcpu_memcg_free_hook(chunk, off, size);
/* if there are more than one fully free chunks, wake up grim reaper */
if (chunk->free_bytes == pcpu_unit_size) {
@@ -2260,6 +2414,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int map_size;
unsigned long tmp_addr;
size_t alloc_size;
+ enum pcpu_chunk_type type;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -2377,13 +2532,18 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]),
- SMP_CACHE_BYTES);
- if (!pcpu_slot)
+ pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
+ sizeof(pcpu_chunk_lists[0]) *
+ PCPU_NR_CHUNK_TYPES,
+ SMP_CACHE_BYTES);
+ if (!pcpu_chunk_lists)
panic("%s: Failed to allocate %zu bytes\n", __func__,
- pcpu_nr_slots * sizeof(pcpu_slot[0]));
- for (i = 0; i < pcpu_nr_slots; i++)
- INIT_LIST_HEAD(&pcpu_slot[i]);
+ pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
+ PCPU_NR_CHUNK_TYPES);
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ for (i = 0; i < pcpu_nr_slots; i++)
+ INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
/*
* The end of the static region needs to be aligned with the
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index cc85ce81914a..29c052099aff 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -105,7 +105,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
* current/current->mm
*/
mmap_read_lock(mm);
- pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
+ pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
NULL, &locked);
if (locked)
diff --git a/mm/rmap.c b/mm/rmap.c
index 5fe2dedce1fc..6cce9ef06753 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1469,7 +1469,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* do this outside rmap routines.
*/
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
* page. There is no way of knowing exactly
diff --git a/mm/shmem.c b/mm/shmem.c
index eb6b36d89722..271548ca20f3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1434,7 +1434,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
list_add(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) {
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ NULL) == 0) {
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
info->swapped++;
@@ -1685,7 +1686,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* Swap in the page pointed to by *pagep.
* Caller has to make sure that *pagep contains a valid swapped page.
* Returns 0 and the page in pagep if success. On failure, returns the
- * the error code and NULL in *pagep.
+ * error code and NULL in *pagep.
*/
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a513f3237155..f9ccd5dc13f3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -419,7 +419,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
/*
* On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
* @slab_caches_to_rcu_destroy list. The slab pages are freed
- * through RCU and and the associated kmem_cache are dereferenced
+ * through RCU and the associated kmem_cache are dereferenced
* while freeing the pages, so the kmem_caches should be freed only
* after the pending RCU operations are finished. As rcu_barrier()
* is a pretty slow operation, we batch all pending destructions
diff --git a/mm/swap.c b/mm/swap.c
index de257c0a89b1..9285e60c7d6e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -476,23 +476,24 @@ void lru_cache_add(struct page *page)
EXPORT_SYMBOL(lru_cache_add);
/**
- * lru_cache_add_active_or_unevictable
+ * lru_cache_add_inactive_or_unevictable
* @page: the page to be added to LRU
* @vma: vma in which page is mapped for determining reclaimability
*
- * Place @page on the active or unevictable LRU list, depending on its
+ * Place @page on the inactive or unevictable LRU list, depending on its
* evictability. Note that if the page is not evictable, it goes
* directly back onto it's zone's unevictable list, it does NOT use a
* per cpu pagevec.
*/
-void lru_cache_add_active_or_unevictable(struct page *page,
+void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
+ bool unevictable;
+
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
- SetPageActive(page);
- else if (!TestSetPageMlocked(page)) {
+ unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
+ if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
/*
* We use the irq-unsafe __mod_zone_page_stat because this
* counter is not modified from interrupt context, and the pte
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e82f4f8b1f63..b73aabdfd35a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -106,16 +106,32 @@ void show_swap_cache_info(void)
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
+void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+ struct address_space *address_space = swap_address_space(entry);
+ pgoff_t idx = swp_offset(entry);
+ struct page *page;
+
+ page = find_get_entry(address_space, idx);
+ if (xa_is_value(page))
+ return page;
+ if (page)
+ put_page(page);
+ return NULL;
+}
+
/*
* add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
+int add_to_swap_cache(struct page *page, swp_entry_t entry,
+ gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
unsigned long i, nr = hpage_nr_pages(page);
+ void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -125,16 +141,25 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
SetPageSwapCache(page);
do {
+ unsigned long nr_shadows = 0;
+
xas_lock_irq(&xas);
xas_create_range(&xas);
if (xas_error(&xas))
goto unlock;
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ old = xas_load(&xas);
+ if (xa_is_value(old)) {
+ nr_shadows++;
+ if (shadowp)
+ *shadowp = old;
+ }
set_page_private(page + i, entry.val + i);
xas_store(&xas, page);
xas_next(&xas);
}
+ address_space->nrexceptional -= nr_shadows;
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
ADD_CACHE_INFO(add_total, nr);
@@ -154,7 +179,8 @@ unlock:
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
+void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry, void *shadow)
{
struct address_space *address_space = swap_address_space(entry);
int i, nr = hpage_nr_pages(page);
@@ -166,12 +192,14 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageWriteback(page), page);
for (i = 0; i < nr; i++) {
- void *entry = xas_store(&xas, NULL);
+ void *entry = xas_store(&xas, shadow);
VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
ClearPageSwapCache(page);
+ if (shadow)
+ address_space->nrexceptional += nr;
address_space->nrpages -= nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
ADD_CACHE_INFO(del_total, nr);
@@ -208,7 +236,7 @@ int add_to_swap(struct page *page)
* Add it to the swap cache.
*/
err = add_to_swap_cache(page, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -246,13 +274,44 @@ void delete_from_swap_cache(struct page *page)
struct address_space *address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page, entry);
+ __delete_from_swap_cache(page, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
put_swap_page(page, entry);
page_ref_sub(page, hpage_nr_pages(page));
}
+void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end)
+{
+ unsigned long curr = begin;
+ void *old;
+
+ for (;;) {
+ unsigned long nr_shadows = 0;
+ swp_entry_t entry = swp_entry(type, curr);
+ struct address_space *address_space = swap_address_space(entry);
+ XA_STATE(xas, &address_space->i_pages, curr);
+
+ xa_lock_irq(&address_space->i_pages);
+ xas_for_each(&xas, old, end) {
+ if (!xa_is_value(old))
+ continue;
+ xas_store(&xas, NULL);
+ nr_shadows++;
+ }
+ address_space->nrexceptional -= nr_shadows;
+ xa_unlock_irq(&address_space->i_pages);
+
+ /* search the next swapcache until we meet end */
+ curr >>= SWAP_ADDRESS_SPACE_SHIFT;
+ curr++;
+ curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+ if (curr > end)
+ break;
+ }
+}
+
/*
* If we are the only user, then try to free up the swap cache.
*
@@ -361,6 +420,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
{
struct swap_info_struct *si;
struct page *page;
+ void *shadow = NULL;
*new_page_allocated = false;
@@ -429,7 +489,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
__SetPageSwapBacked(page);
/* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) {
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
put_swap_page(page, entry);
goto fail_unlock;
}
@@ -439,10 +499,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
goto fail_unlock;
}
- /* XXX: Move to lru_cache_add() when it supports new vs putback */
- spin_lock_irq(&page_pgdat(page)->lru_lock);
- lru_note_cost_page(page);
- spin_unlock_irq(&page_pgdat(page)->lru_lock);
+ if (shadow)
+ workingset_refault(page, shadow);
/* Caller will initiate read into locked page */
SetPageWorkingset(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6c26916e95fd..e653eea1eb88 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -696,6 +696,7 @@ static void add_to_avail_list(struct swap_info_struct *p)
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
unsigned int nr_entries)
{
+ unsigned long begin = offset;
unsigned long end = offset + nr_entries - 1;
void (*swap_slot_free_notify)(struct block_device *, unsigned long);
@@ -721,6 +722,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify(si->bdev, offset);
offset++;
}
+ clear_shadow_from_swap_cache(si->type, begin, end);
}
static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
@@ -1915,7 +1917,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
page_add_anon_rmap(page, vma, addr, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
}
swap_free(entry);
/*
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 660717a1ea5c..b3de3c4eefba 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -43,7 +43,7 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
/*
* Reject: object partially overlaps the stack (passing the
- * the check above means at least one end is within the stack,
+ * check above means at least one end is within the stack,
* so if this check fails, the other end is outside the stack).
*/
if (obj < stack || stackend < obj + len)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index b80419320c7d..9a3d451402d7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
inc_mm_counter(dst_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_active_or_unevictable(page, dst_vma);
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 72da290b171b..738115ed75e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -854,6 +854,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
{
unsigned long flags;
int refcount;
+ void *shadow = NULL;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
@@ -896,13 +897,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page, swap);
+ if (reclaimed && !mapping_exiting(mapping))
+ shadow = workingset_eviction(page, target_memcg);
+ __delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
- workingset_eviction(page, target_memcg);
} else {
void (*freepage)(struct page *);
- void *shadow = NULL;
freepage = mapping->a_ops->freepage;
/*
@@ -998,8 +999,6 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
if (referenced_ptes) {
- if (PageSwapBacked(page))
- return PAGEREF_ACTIVATE;
/*
* All mapped pages start out with page table
* references from the instantiating fault, so we need
@@ -1022,7 +1021,7 @@ static enum page_references page_check_references(struct page *page,
/*
* Activate file-backed executable pages after first usage.
*/
- if (vm_flags & VM_EXEC)
+ if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
@@ -2685,7 +2684,10 @@ again:
if (!sc->force_deactivate) {
unsigned long refaults;
- if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
sc->may_deactivate |= DEACTIVATE_ANON;
else
sc->may_deactivate &= ~DEACTIVATE_ANON;
@@ -2696,8 +2698,8 @@ again:
* rid of any stale active pages quickly.
*/
refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE);
- if (refaults != target_lruvec->refaults ||
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
sc->may_deactivate |= DEACTIVATE_FILE;
else
@@ -2796,7 +2798,7 @@ again:
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
- * If kswapd scans pages marked marked for immediate
+ * If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
* faster than they are written so also forcibly stall.
@@ -2974,8 +2976,10 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
unsigned long refaults;
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
- target_lruvec->refaults = refaults;
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
+ target_lruvec->refaults[0] = refaults;
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
+ target_lruvec->refaults[1] = refaults;
}
/*
@@ -3369,7 +3373,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
/*
* Check for watermark boosts top-down as the higher zones
* are more likely to be boosted. Both watermarks and boosts
- * should not be checked at the time time as reclaim would
+ * should not be checked at the same time as reclaim would
* start prematurely when there is no boosting and a lower
* zone is balanced.
*/
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 2b866cbab11d..727a26d1ec1d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1096,6 +1096,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
+/*
+ * Calculates external fragmentation within a zone wrt the given order.
+ * It is defined as the percentage of pages found in blocks of size
+ * less than 1 << order. It returns values in range [0, 100].
+ */
+unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ if (info.free_pages == 0)
+ return 0;
+
+ return div_u64((info.free_pages -
+ (info.free_blocks_suitable << order)) * 100,
+ info.free_pages);
+}
+
/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
@@ -1167,9 +1185,12 @@ const char * const vmstat_text[] = {
"nr_isolated_anon",
"nr_isolated_file",
"workingset_nodes",
- "workingset_refault",
- "workingset_activate",
- "workingset_restore",
+ "workingset_refault_anon",
+ "workingset_refault_file",
+ "workingset_activate_anon",
+ "workingset_activate_file",
+ "workingset_restore_anon",
+ "workingset_restore_file",
"workingset_nodereclaim",
"nr_anon_pages",
"nr_mapped",
@@ -1256,6 +1277,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
"pgmigrate_fail",
+ "thp_migration_success",
+ "thp_migration_fail",
+ "thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
"compact_migrate_scanned",
diff --git a/mm/workingset.c b/mm/workingset.c
index b199726924dd..8cbe4e3cbe5c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -6,6 +6,7 @@
*/
#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
@@ -280,6 +281,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
*/
void workingset_refault(struct page *page, void *shadow)
{
+ bool file = page_is_file_lru(page);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
@@ -346,27 +348,34 @@ void workingset_refault(struct page *page, void *shadow)
memcg = page_memcg(page);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
/*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if
- * all the memory was available to the page cache. Whether
- * cache can compete with anon or not depends on having swap.
+ * all the memory was available to the workingset. Whether
+ * workingset competition needs to consider anon or not depends
+ * on having swap.
*/
workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
- if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ if (!file) {
workingset_size += lruvec_page_state(eviction_lruvec,
- NR_INACTIVE_ANON);
+ NR_INACTIVE_FILE);
+ }
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
+ if (file) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_ANON);
+ }
}
if (refault_distance > workingset_size)
goto out;
SetPageActive(page);
workingset_age_nonresident(lruvec, hpage_nr_pages(page));
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
/* Page was active prior to eviction */
if (workingset) {
@@ -375,7 +384,7 @@ void workingset_refault(struct page *page, void *shadow)
spin_lock_irq(&page_pgdat(page)->lru_lock);
lru_note_cost_page(page);
spin_unlock_irq(&page_pgdat(page)->lru_lock);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
}
out:
rcu_read_unlock();
diff --git a/mm/zpool.c b/mm/zpool.c
index 863669212070..3744a2d1a624 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -239,15 +239,15 @@ const char *zpool_get_type(struct zpool *zpool)
}
/**
- * zpool_malloc_support_movable() - Check if the zpool support
- * allocate movable memory
+ * zpool_malloc_support_movable() - Check if the zpool supports
+ * allocating movable memory
* @zpool: The zpool to check
*
- * This returns if the zpool support allocate movable memory.
+ * This returns if the zpool supports allocating movable memory.
*
* Implementations must guarantee this to be thread-safe.
*
- * Returns: true if if the zpool support allocate movable memory, false if not
+ * Returns: true if the zpool supports allocating movable memory, false if not
*/
bool zpool_malloc_support_movable(struct zpool *zpool)
{
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 952a01e45c6a..c36fdff9a371 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -79,7 +79,7 @@
/*
* Object location (<PFN>, <obj_idx>) is encoded as
- * as single (unsigned long) handle value.
+ * a single (unsigned long) handle value.
*
* Note that object index <obj_idx> starts from 0.
*