aboutsummaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-07-28 16:36:48 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-07-28 16:36:48 -0700
commit1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9 (patch)
tree6d227487ca2cf391589c73af1c40ec7b7126feec /include
parentMerge tag 'dmaengine-4.8-rc1' of git://git.infradead.org/users/vkoul/slave-dma (diff)
parentmm, compaction: simplify contended compaction handling (diff)
downloadlinux-dev-1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9.tar.xz
linux-dev-1c88e19b0f6a8471ee50d5062721ba30b8fd4ba9.zip
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton: "The rest of MM" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (101 commits) mm, compaction: simplify contended compaction handling mm, compaction: introduce direct compaction priority mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations mm, page_alloc: make THP-specific decisions more generic mm, page_alloc: restructure direct compaction handling in slowpath mm, page_alloc: don't retry initial attempt in slowpath mm, page_alloc: set alloc_flags only once in slowpath lib/stackdepot.c: use __GFP_NOWARN for stack allocations mm, kasan: switch SLUB to stackdepot, enable memory quarantine for SLUB mm, kasan: account for object redzone in SLUB's nearest_obj() mm: fix use-after-free if memory allocation failed in vma_adjust() zsmalloc: Delete an unnecessary check before the function call "iput" mm/memblock.c: fix index adjustment error in __next_mem_range_rev() mem-hotplug: alloc new page from a nearest neighbor node when mem-offline mm: optimize copy_page_to/from_iter_iovec mm: add cond_resched() to generic_swapfile_activate() Revert "mm, mempool: only set __GFP_NOMEMALLOC if there are free elements" mm, compaction: don't isolate PageWriteback pages in MIGRATE_SYNC_LIGHT mode mm: hwpoison: remove incorrect comments make __section_nr() more efficient ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/backing-dev.h2
-rw-r--r--include/linux/compaction.h33
-rw-r--r--include/linux/gfp.h14
-rw-r--r--include/linux/huge_mm.h2
-rw-r--r--include/linux/kasan.h2
-rw-r--r--include/linux/kdb.h2
-rw-r--r--include/linux/memblock.h1
-rw-r--r--include/linux/memcontrol.h70
-rw-r--r--include/linux/memremap.h2
-rw-r--r--include/linux/mm.h17
-rw-r--r--include/linux/mm_inline.h19
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/mmzone.h170
-rw-r--r--include/linux/oom.h26
-rw-r--r--include/linux/sched.h27
-rw-r--r--include/linux/slab_def.h3
-rw-r--r--include/linux/slub_def.h14
-rw-r--r--include/linux/swap.h23
-rw-r--r--include/linux/topology.h2
-rw-r--r--include/linux/vm_event_item.h14
-rw-r--r--include/linux/vmstat.h111
-rw-r--r--include/linux/writeback.h2
-rw-r--r--include/trace/events/compaction.h12
-rw-r--r--include/trace/events/mmflags.h1
-rw-r--r--include/trace/events/vmscan.h63
-rw-r--r--include/trace/events/writeback.h10
26 files changed, 418 insertions, 226 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c82794f20110..491a91717788 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -197,7 +197,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
}
long congestion_wait(int sync, long timeout);
-long wait_iff_congested(struct zone *zone, int sync, long timeout);
+long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout);
int pdflush_proc_obsolete(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 1a02dab16646..d4e106b5dc27 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -1,6 +1,18 @@
#ifndef _LINUX_COMPACTION_H
#define _LINUX_COMPACTION_H
+/*
+ * Determines how hard direct compaction should try to succeed.
+ * Lower value means higher priority, analogically to reclaim priority.
+ */
+enum compact_priority {
+ COMPACT_PRIO_SYNC_LIGHT,
+ MIN_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+ DEF_COMPACT_PRIORITY = COMPACT_PRIO_SYNC_LIGHT,
+ COMPACT_PRIO_ASYNC,
+ INIT_COMPACT_PRIORITY = COMPACT_PRIO_ASYNC
+};
+
/* Return values for compact_zone() and try_to_compact_pages() */
/* When adding new states, please adjust include/trace/events/compaction.h */
enum compact_result {
@@ -43,14 +55,6 @@ enum compact_result {
COMPACT_PARTIAL,
};
-/* Used to signal whether compaction detected need_sched() or lock contention */
-/* No contention detected */
-#define COMPACT_CONTENDED_NONE 0
-/* Either need_sched() was true or fatal signal pending */
-#define COMPACT_CONTENDED_SCHED 1
-/* Zone lock or lru_lock was contended in async compaction */
-#define COMPACT_CONTENDED_LOCK 2
-
struct alloc_context; /* in mm/internal.h */
#ifdef CONFIG_COMPACTION
@@ -64,9 +68,8 @@ extern int sysctl_compact_unevictable_allowed;
extern int fragmentation_index(struct zone *zone, unsigned int order);
extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
- unsigned int order,
- unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, int *contended);
+ unsigned int order, unsigned int alloc_flags,
+ const struct alloc_context *ac, enum compact_priority prio);
extern void compact_pgdat(pg_data_t *pgdat, int order);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern enum compact_result compaction_suitable(struct zone *zone, int order,
@@ -151,14 +154,6 @@ extern void kcompactd_stop(int nid);
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
#else
-static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
- unsigned int order, int alloc_flags,
- const struct alloc_context *ac,
- enum migrate_mode mode, int *contended)
-{
- return COMPACT_CONTINUE;
-}
-
static inline void compact_pgdat(pg_data_t *pgdat, int order)
{
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c29e9d347bc6..f8041f9de31e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -237,9 +237,11 @@ struct vm_area_struct;
* are expected to be movable via page reclaim or page migration. Typically,
* pages on the LRU would also be allocated with GFP_HIGHUSER_MOVABLE.
*
- * GFP_TRANSHUGE is used for THP allocations. They are compound allocations
- * that will fail quickly if memory is not available and will not wake
- * kswapd on failure.
+ * GFP_TRANSHUGE and GFP_TRANSHUGE_LIGHT are used for THP allocations. They are
+ * compound allocations that will generally fail quickly if memory is not
+ * available and will not wake kswapd/kcompactd on failure. The _LIGHT
+ * version does not attempt reclaim/compaction at all and is by default used
+ * in page fault path, while the non-light is used by khugepaged.
*/
#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
@@ -254,9 +256,9 @@ struct vm_area_struct;
#define GFP_DMA32 __GFP_DMA32
#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE)
-#define GFP_TRANSHUGE ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
- __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & \
- ~__GFP_RECLAIM)
+#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
+ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
+#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 92ce91c03cd0..6f14de45b5ce 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -11,7 +11,7 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags);
-extern int madvise_free_huge_pmd(struct mmu_gather *tlb,
+extern bool madvise_free_huge_pmd(struct mmu_gather *tlb,
struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr, unsigned long next);
extern int zap_huge_pmd(struct mmu_gather *tlb,
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index ac4b3c46a84d..c9cf374445d8 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -77,6 +77,7 @@ void kasan_free_shadow(const struct vm_struct *vm);
size_t ksize(const void *);
static inline void kasan_unpoison_slab(const void *ptr) { ksize(ptr); }
+size_t kasan_metadata_size(struct kmem_cache *cache);
#else /* CONFIG_KASAN */
@@ -121,6 +122,7 @@ static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
static inline void kasan_free_shadow(const struct vm_struct *vm) {}
static inline void kasan_unpoison_slab(const void *ptr) { }
+static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
#endif /* CONFIG_KASAN */
diff --git a/include/linux/kdb.h b/include/linux/kdb.h
index a19bcf9e762e..410decacff8f 100644
--- a/include/linux/kdb.h
+++ b/include/linux/kdb.h
@@ -177,7 +177,7 @@ extern int kdb_get_kbd_char(void);
static inline
int kdb_process_cpu(const struct task_struct *p)
{
- unsigned int cpu = task_thread_info(p)->cpu;
+ unsigned int cpu = task_cpu(p);
if (cpu > num_possible_cpus())
cpu = 0;
return cpu;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6c14b6179727..2925da23505d 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -332,6 +332,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
void memblock_enforce_memory_limit(phys_addr_t memory_limit);
+void memblock_mem_limit_remove_map(phys_addr_t limit);
bool memblock_is_memory(phys_addr_t addr);
int memblock_is_map_memory(phys_addr_t addr);
int memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 71aff733a497..5d8ca6e02e39 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -52,7 +52,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
/* default hierarchy stats */
- MEMCG_KERNEL_STACK = MEM_CGROUP_STAT_NSTATS,
+ MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS,
MEMCG_SLAB_RECLAIMABLE,
MEMCG_SLAB_UNRECLAIMABLE,
MEMCG_SOCK,
@@ -60,7 +60,7 @@ enum mem_cgroup_stat_index {
};
struct mem_cgroup_reclaim_cookie {
- struct zone *zone;
+ pg_data_t *pgdat;
int priority;
unsigned int generation;
};
@@ -118,7 +118,7 @@ struct mem_cgroup_reclaim_iter {
/*
* per-zone information in memory controller.
*/
-struct mem_cgroup_per_zone {
+struct mem_cgroup_per_node {
struct lruvec lruvec;
unsigned long lru_size[NR_LRU_LISTS];
@@ -132,10 +132,6 @@ struct mem_cgroup_per_zone {
/* use container_of */
};
-struct mem_cgroup_per_node {
- struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
struct mem_cgroup_threshold {
struct eventfd_ctx *eventfd;
unsigned long threshold;
@@ -314,8 +310,46 @@ void mem_cgroup_uncharge_list(struct list_head *page_list);
void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
-struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
-struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
+static struct mem_cgroup_per_node *
+mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
+{
+ return memcg->nodeinfo[nid];
+}
+
+/**
+ * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone
+ * @node: node of the wanted lruvec
+ * @memcg: memcg of the wanted lruvec
+ *
+ * Returns the lru list vector holding pages for a given @node or a given
+ * @memcg and @zone. This can be the node lruvec, if the memory controller
+ * is disabled.
+ */
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
+ struct mem_cgroup *memcg)
+{
+ struct mem_cgroup_per_node *mz;
+ struct lruvec *lruvec;
+
+ if (mem_cgroup_disabled()) {
+ lruvec = node_lruvec(pgdat);
+ goto out;
+ }
+
+ mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+ lruvec = &mz->lruvec;
+out:
+ /*
+ * Since a node can be onlined after the mem_cgroup was created,
+ * we have to be prepared to initialize lruvec->pgdat here;
+ * and if offlined then reonlined, we need to reinitialize it.
+ */
+ if (unlikely(lruvec->pgdat != pgdat))
+ lruvec->pgdat = pgdat;
+ return lruvec;
+}
+
+struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *);
bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -404,9 +438,9 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
static inline
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
- struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup_per_node *mz;
- mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
return mz->lru_size[lru];
}
@@ -477,7 +511,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
mem_cgroup_update_page_stat(page, idx, -1);
}
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
@@ -568,16 +602,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new)
{
}
-static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
- struct mem_cgroup *memcg)
+static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
+ struct mem_cgroup *memcg)
{
- return &zone->lruvec;
+ return node_lruvec(pgdat);
}
static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
- struct zone *zone)
+ struct pglist_data *pgdat)
{
- return &zone->lruvec;
+ return &pgdat->lruvec;
}
static inline bool mm_match_cgroup(struct mm_struct *mm,
@@ -681,7 +715,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
}
static inline
-unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned)
{
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index bcaa634139a9..93416196ba64 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -26,7 +26,7 @@ struct vmem_altmap {
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
-#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE)
+#ifdef CONFIG_ZONE_DEVICE
struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start);
#else
static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 192c1bbe5fcd..08ed53eeedd5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -933,6 +933,11 @@ static inline struct zone *page_zone(const struct page *page)
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}
+static inline pg_data_t *page_pgdat(const struct page *page)
+{
+ return NODE_DATA(page_to_nid(page));
+}
+
#ifdef SECTION_IN_PAGE_FLAGS
static inline void set_page_section(struct page *page, unsigned long section)
{
@@ -973,11 +978,21 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
{
return page->mem_cgroup;
}
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return READ_ONCE(page->mem_cgroup);
+}
#else
static inline struct mem_cgroup *page_memcg(struct page *page)
{
return NULL;
}
+static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return NULL;
+}
#endif
/*
@@ -2284,6 +2299,8 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
}
#endif /* __HAVE_ARCH_GATE_AREA */
+extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);
+
#ifdef CONFIG_SYSCTL
extern int sysctl_drop_caches;
int drop_caches_sysctl_handler(struct ctl_table *, int,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 5bd29ba4f174..71613e8a720f 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -23,25 +23,30 @@ static inline int page_is_file_cache(struct page *page)
}
static __always_inline void __update_lru_size(struct lruvec *lruvec,
- enum lru_list lru, int nr_pages)
+ enum lru_list lru, enum zone_type zid,
+ int nr_pages)
{
- __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
+ __mod_zone_page_state(&pgdat->node_zones[zid],
+ NR_ZONE_LRU_BASE + lru, nr_pages);
}
static __always_inline void update_lru_size(struct lruvec *lruvec,
- enum lru_list lru, int nr_pages)
+ enum lru_list lru, enum zone_type zid,
+ int nr_pages)
{
+ __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
-#else
- __update_lru_size(lruvec, lru, nr_pages);
#endif
}
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
- update_lru_size(lruvec, lru, hpage_nr_pages(page));
+ update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
}
@@ -49,7 +54,7 @@ static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
list_del(&page->lru);
- update_lru_size(lruvec, lru, -hpage_nr_pages(page));
+ update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
}
/**
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 79472b22d23f..903200f4ec41 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -118,7 +118,7 @@ struct page {
*/
union {
struct list_head lru; /* Pageout list, eg. active_list
- * protected by zone->lru_lock !
+ * protected by zone_lru_lock !
* Can be used as a generic list
* by the page owner.
*/
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 19425e988bdc..f2e4e90621ec 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -93,7 +93,7 @@ struct free_area {
struct pglist_data;
/*
- * zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
+ * zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
* So add a wild amount of padding here to ensure that they fall into separate
* cachelines. There are very few zone structures in the machine, so space
* consumption is not a concern here.
@@ -110,36 +110,20 @@ struct zone_padding {
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
- NR_ALLOC_BATCH,
- NR_LRU_BASE,
- NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
- NR_ACTIVE_ANON, /* " " " " " */
- NR_INACTIVE_FILE, /* " " " " " */
- NR_ACTIVE_FILE, /* " " " " " */
- NR_UNEVICTABLE, /* " " " " " */
+ NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
+ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
+ NR_ZONE_ACTIVE_ANON,
+ NR_ZONE_INACTIVE_FILE,
+ NR_ZONE_ACTIVE_FILE,
+ NR_ZONE_UNEVICTABLE,
+ NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */
NR_MLOCK, /* mlock()ed pages found and moved off LRU */
- NR_ANON_PAGES, /* Mapped anonymous pages */
- NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
- only modified from process context */
- NR_FILE_PAGES,
- NR_FILE_DIRTY,
- NR_WRITEBACK,
NR_SLAB_RECLAIMABLE,
NR_SLAB_UNRECLAIMABLE,
NR_PAGETABLE, /* used for pagetables */
- NR_KERNEL_STACK,
+ NR_KERNEL_STACK_KB, /* measured in KiB */
/* Second 128 byte cacheline */
- NR_UNSTABLE_NFS, /* NFS unstable pages */
NR_BOUNCE,
- NR_VMSCAN_WRITE,
- NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
- NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
- NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
- NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
- NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
- NR_DIRTIED, /* page dirtyings since bootup */
- NR_WRITTEN, /* page writings since bootup */
- NR_PAGES_SCANNED, /* pages scanned since last reclaim */
#if IS_ENABLED(CONFIG_ZSMALLOC)
NR_ZSPAGES, /* allocated in zsmalloc */
#endif
@@ -151,14 +135,40 @@ enum zone_stat_item {
NUMA_LOCAL, /* allocation from local node */
NUMA_OTHER, /* allocation from other node */
#endif
+ NR_FREE_CMA_PAGES,
+ NR_VM_ZONE_STAT_ITEMS };
+
+enum node_stat_item {
+ NR_LRU_BASE,
+ NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
+ NR_ACTIVE_ANON, /* " " " " " */
+ NR_INACTIVE_FILE, /* " " " " " */
+ NR_ACTIVE_FILE, /* " " " " " */
+ NR_UNEVICTABLE, /* " " " " " */
+ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
+ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
+ NR_PAGES_SCANNED, /* pages scanned since last reclaim */
WORKINGSET_REFAULT,
WORKINGSET_ACTIVATE,
WORKINGSET_NODERECLAIM,
- NR_ANON_THPS,
+ NR_ANON_MAPPED, /* Mapped anonymous pages */
+ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables.
+ only modified from process context */
+ NR_FILE_PAGES,
+ NR_FILE_DIRTY,
+ NR_WRITEBACK,
+ NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */
+ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
NR_SHMEM_THPS,
NR_SHMEM_PMDMAPPED,
- NR_FREE_CMA_PAGES,
- NR_VM_ZONE_STAT_ITEMS };
+ NR_ANON_THPS,
+ NR_UNSTABLE_NFS, /* NFS unstable pages */
+ NR_VMSCAN_WRITE,
+ NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */
+ NR_DIRTIED, /* page dirtyings since bootup */
+ NR_WRITTEN, /* page writings since bootup */
+ NR_VM_NODE_STAT_ITEMS
+};
/*
* We do arithmetic on the LRU lists in various places in the code,
@@ -215,7 +225,7 @@ struct lruvec {
/* Evictions & activations on the inactive file list */
atomic_long_t inactive_age;
#ifdef CONFIG_MEMCG
- struct zone *zone;
+ struct pglist_data *pgdat;
#endif
};
@@ -267,6 +277,11 @@ struct per_cpu_pageset {
#endif
};
+struct per_cpu_nodestat {
+ s8 stat_threshold;
+ s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
+};
+
#endif /* !__GENERATING_BOUNDS.H */
enum zone_type {
@@ -348,22 +363,9 @@ struct zone {
#ifdef CONFIG_NUMA
int node;
#endif
-
- /*
- * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
- * this zone's LRU. Maintained by the pageout code.
- */
- unsigned int inactive_ratio;
-
struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
- /*
- * This is a per-zone reserve of pages that are not available
- * to userspace allocations.
- */
- unsigned long totalreserve_pages;
-
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
@@ -372,14 +374,6 @@ struct zone {
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_NUMA
- /*
- * zone reclaim becomes active if more unmapped pages exist.
- */
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
-#endif /* CONFIG_NUMA */
-
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@@ -472,24 +466,21 @@ struct zone {
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;
+ /* Write-intensive fields used from the page allocator */
ZONE_PADDING(_pad1_)
+
/* free areas of different sizes */
struct free_area free_area[MAX_ORDER];
/* zone flags, see below */
unsigned long flags;
- /* Write-intensive fields used from the page allocator */
+ /* Primarily protects free_area */
spinlock_t lock;
+ /* Write-intensive fields used by compaction and vmstats. */
ZONE_PADDING(_pad2_)
- /* Write-intensive fields used by page reclaim */
-
- /* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
- struct lruvec lruvec;
-
/*
* When free pages are below this point, additional steps are taken
* when reading the number of free pages to avoid per-cpu counter
@@ -527,19 +518,18 @@ struct zone {
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
-enum zone_flags {
- ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
- ZONE_CONGESTED, /* zone has many dirty pages backed by
+enum pgdat_flags {
+ PGDAT_CONGESTED, /* pgdat has many dirty pages backed by
* a congested BDI
*/
- ZONE_DIRTY, /* reclaim scanning has recently found
+ PGDAT_DIRTY, /* reclaim scanning has recently found
* many dirty file pages at the tail
* of the LRU.
*/
- ZONE_WRITEBACK, /* reclaim scanning has recently found
+ PGDAT_WRITEBACK, /* reclaim scanning has recently found
* many pages under writeback
*/
- ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
+ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */
};
static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -663,8 +653,9 @@ typedef struct pglist_data {
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
- int kswapd_max_order;
- enum zone_type classzone_idx;
+ int kswapd_order;
+ enum zone_type kswapd_classzone_idx;
+
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_classzone_idx;
@@ -681,6 +672,23 @@ typedef struct pglist_data {
/* Number of pages migrated during the rate limiting time interval */
unsigned long numabalancing_migrate_nr_pages;
#endif
+ /*
+ * This is a per-node reserve of pages that are not available
+ * to userspace allocations.
+ */
+ unsigned long totalreserve_pages;
+
+#ifdef CONFIG_NUMA
+ /*
+ * zone reclaim becomes active if more unmapped pages exist.
+ */
+ unsigned long min_unmapped_pages;
+ unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
+
+ /* Write-intensive fields used by page reclaim */
+ ZONE_PADDING(_pad1_)
+ spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
@@ -695,6 +703,23 @@ typedef struct pglist_data {
struct list_head split_queue;
unsigned long split_queue_len;
#endif
+
+ /* Fields commonly accessed by the page reclaim scanner */
+ struct lruvec lruvec;
+
+ /*
+ * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+ * this node's LRU. Maintained by the pageout code.
+ */
+ unsigned int inactive_ratio;
+
+ unsigned long flags;
+
+ ZONE_PADDING(_pad2_)
+
+ /* Per-node vmstats */
+ struct per_cpu_nodestat __percpu *per_cpu_nodestats;
+ atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
@@ -708,6 +733,15 @@ typedef struct pglist_data {
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
+static inline spinlock_t *zone_lru_lock(struct zone *zone)
+{
+ return &zone->zone_pgdat->lru_lock;
+}
+
+static inline struct lruvec *node_lruvec(struct pglist_data *pgdat)
+{
+ return &pgdat->lruvec;
+}
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
@@ -760,12 +794,12 @@ extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
extern void lruvec_init(struct lruvec *lruvec);
-static inline struct zone *lruvec_zone(struct lruvec *lruvec)
+static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
#ifdef CONFIG_MEMCG
- return lruvec->zone;
+ return lruvec->pgdat;
#else
- return container_of(lruvec, struct zone, lruvec);
+ return container_of(lruvec, struct pglist_data, lruvec);
#endif
}
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 606137b3b778..5bc0457ee3a8 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -73,9 +73,9 @@ static inline bool oom_task_origin(const struct task_struct *p)
extern void mark_oom_victim(struct task_struct *tsk);
#ifdef CONFIG_MMU
-extern void try_oom_reaper(struct task_struct *tsk);
+extern void wake_oom_reaper(struct task_struct *tsk);
#else
-static inline void try_oom_reaper(struct task_struct *tsk)
+static inline void wake_oom_reaper(struct task_struct *tsk)
{
}
#endif
@@ -107,27 +107,7 @@ extern void oom_killer_enable(void);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
-static inline bool task_will_free_mem(struct task_struct *task)
-{
- struct signal_struct *sig = task->signal;
-
- /*
- * A coredumping process may sleep for an extended period in exit_mm(),
- * so the oom killer cannot assume that the process will promptly exit
- * and release memory.
- */
- if (sig->flags & SIGNAL_GROUP_COREDUMP)
- return false;
-
- if (!(task->flags & PF_EXITING))
- return false;
-
- /* Make sure that the whole thread group is going down */
- if (!thread_group_empty(task) && !(sig->flags & SIGNAL_GROUP_EXIT))
- return false;
-
- return true;
-}
+bool task_will_free_mem(struct task_struct *task);
/* sysctls */
extern int sysctl_oom_dump_tasks;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d99218a1e043..553af2923824 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -523,6 +523,7 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_HAS_UPROBES 19 /* has uprobes */
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_REAPED 21 /* mm has been already reaped */
+#define MMF_OOM_NOT_REAPABLE 22 /* mm couldn't be reaped */
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
@@ -1949,6 +1950,32 @@ static inline int tsk_nr_cpus_allowed(struct task_struct *p)
#define TNF_FAULT_LOCAL 0x08
#define TNF_MIGRATE_FAIL 0x10
+static inline bool in_vfork(struct task_struct *tsk)
+{
+ bool ret;
+
+ /*
+ * need RCU to access ->real_parent if CLONE_VM was used along with
+ * CLONE_PARENT.
+ *
+ * We check real_parent->mm == tsk->mm because CLONE_VFORK does not
+ * imply CLONE_VM
+ *
+ * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus
+ * ->real_parent is not necessarily the task doing vfork(), so in
+ * theory we can't rely on task_lock() if we want to dereference it.
+ *
+ * And in this case we can't trust the real_parent->mm == tsk->mm
+ * check, it can be false negative. But we do not care, if init or
+ * another oom-unkillable task does this it should blame itself.
+ */
+ rcu_read_lock();
+ ret = tsk->vfork_done && tsk->real_parent->mm == tsk->mm;
+ rcu_read_unlock();
+
+ return ret;
+}
+
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags);
extern pid_t task_numa_group_id(struct task_struct *p);
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 339ba027ade9..4ad2c5a26399 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -88,7 +88,8 @@ struct kmem_cache {
};
static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
- void *x) {
+ void *x)
+{
void *object = x - (x - page->s_mem) % cache->size;
void *last_object = page->s_mem + (cache->num - 1) * cache->size;
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 5624c1f3eb0a..75f56c2ef2d4 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -104,6 +104,10 @@ struct kmem_cache {
unsigned int *random_seq;
#endif
+#ifdef CONFIG_KASAN
+ struct kasan_cache kasan_info;
+#endif
+
struct kmem_cache_node *node[MAX_NUMNODES];
};
@@ -119,15 +123,17 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason);
+void *fixup_red_left(struct kmem_cache *s, void *p);
+
static inline void *nearest_obj(struct kmem_cache *cache, struct page *page,
void *x) {
void *object = x - (x - page_address(page)) % cache->size;
void *last_object = page_address(page) +
(page->objects - 1) * cache->size;
- if (unlikely(object > last_object))
- return last_object;
- else
- return object;
+ void *result = (unlikely(object > last_object)) ? last_object : object;
+
+ result = fixup_red_left(cache, result);
+ return result;
}
#endif /* _LINUX_SLUB_DEF_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0af2bb2028fd..b17cc4830fa6 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -157,15 +157,6 @@ enum {
#define SWAP_CLUSTER_MAX 32UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
-/*
- * Ratio between zone->managed_pages and the "gap" that above the per-zone
- * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that
- * do not meet the (high_wmark + gap) watermark, even which already met the
- * high_wmark, in order to provide better per-zone lru behavior. We are ok to
- * spend not more than 1% of the memory for this zone balancing "gap".
- */
-#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100
-
#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
@@ -317,6 +308,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
@@ -324,9 +316,9 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
bool may_swap);
-extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
+extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
- struct zone *zone,
+ pg_data_t *pgdat,
unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
@@ -334,13 +326,14 @@ extern int remove_mapping(struct address_space *mapping, struct page *page);
extern unsigned long vm_total_pages;
#ifdef CONFIG_NUMA
-extern int zone_reclaim_mode;
+extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
-extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
#else
-#define zone_reclaim_mode 0
-static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
+#define node_reclaim_mode 0
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
{
return 0;
}
diff --git a/include/linux/topology.h b/include/linux/topology.h
index afce69296ac0..cb0775e1ee4b 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -54,7 +54,7 @@ int arch_update_cpu_topology(void);
/*
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
* (in whatever arch specific measurement units returned by node_distance())
- * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim()
+ * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
* on nodes within this distance.
*/
#define RECLAIM_DISTANCE 30
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 42604173f122..4d6ec58a8d45 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -23,21 +23,23 @@
enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
FOR_ALL_ZONES(PGALLOC),
+ FOR_ALL_ZONES(ALLOCSTALL),
+ FOR_ALL_ZONES(PGSCAN_SKIP),
PGFREE, PGACTIVATE, PGDEACTIVATE,
PGFAULT, PGMAJFAULT,
PGLAZYFREED,
- FOR_ALL_ZONES(PGREFILL),
- FOR_ALL_ZONES(PGSTEAL_KSWAPD),
- FOR_ALL_ZONES(PGSTEAL_DIRECT),
- FOR_ALL_ZONES(PGSCAN_KSWAPD),
- FOR_ALL_ZONES(PGSCAN_DIRECT),
+ PGREFILL,
+ PGSTEAL_KSWAPD,
+ PGSTEAL_DIRECT,
+ PGSCAN_KSWAPD,
+ PGSCAN_DIRECT,
PGSCAN_DIRECT_THROTTLE,
#ifdef CONFIG_NUMA
PGSCAN_ZONE_RECLAIM_FAILED,
#endif
PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL,
KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
- PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+ PAGEOUTRUN, PGROTATED,
DROP_PAGECACHE, DROP_SLAB,
#ifdef CONFIG_NUMA_BALANCING
NUMA_PTE_UPDATES,
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index d2da8e053210..613771909b6e 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -101,25 +101,42 @@ static inline void vm_events_fold_cpu(int cpu)
#define count_vm_vmacache_event(x) do {} while (0)
#endif
-#define __count_zone_vm_events(item, zone, delta) \
- __count_vm_events(item##_NORMAL - ZONE_NORMAL + \
- zone_idx(zone), delta)
+#define __count_zid_vm_events(item, zid, delta) \
+ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)
/*
- * Zone based page accounting with per cpu differentials.
+ * Zone and node-based page accounting with per cpu differentials.
*/
-extern atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
+extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
static inline void zone_page_state_add(long x, struct zone *zone,
enum zone_stat_item item)
{
atomic_long_add(x, &zone->vm_stat[item]);
- atomic_long_add(x, &vm_stat[item]);
+ atomic_long_add(x, &vm_zone_stat[item]);
+}
+
+static inline void node_page_state_add(long x, struct pglist_data *pgdat,
+ enum node_stat_item item)
+{
+ atomic_long_add(x, &pgdat->vm_stat[item]);
+ atomic_long_add(x, &vm_node_stat[item]);
}
static inline unsigned long global_page_state(enum zone_stat_item item)
{
- long x = atomic_long_read(&vm_stat[item]);
+ long x = atomic_long_read(&vm_zone_stat[item]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
+static inline unsigned long global_node_page_state(enum node_stat_item item)
+{
+ long x = atomic_long_read(&vm_node_stat[item]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -160,32 +177,61 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
return x;
}
-#ifdef CONFIG_NUMA
+static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
+ enum node_stat_item item)
+{
+ long x = atomic_long_read(&pgdat->vm_stat[item]);
-extern unsigned long node_page_state(int node, enum zone_stat_item item);
+#ifdef CONFIG_SMP
+ int cpu;
+ for_each_online_cpu(cpu)
+ x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item];
-#else
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
-#define node_page_state(node, item) global_page_state(item)
+#ifdef CONFIG_NUMA
+extern unsigned long sum_zone_node_page_state(int node,
+ enum zone_stat_item item);
+extern unsigned long node_page_state(struct pglist_data *pgdat,
+ enum node_stat_item item);
+#else
+#define sum_zone_node_page_state(node, item) global_page_state(item)
+#define node_page_state(node, item) global_node_page_state(item)
#endif /* CONFIG_NUMA */
#define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d)
#define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d))
+#define add_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, __d)
+#define sub_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, -(__d))
#ifdef CONFIG_SMP
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);
+void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
+void __inc_node_page_state(struct page *, enum node_stat_item);
+void __dec_node_page_state(struct page *, enum node_stat_item);
+
void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);
-extern void inc_zone_state(struct zone *, enum zone_stat_item);
+void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
+void inc_node_page_state(struct page *, enum node_stat_item);
+void dec_node_page_state(struct page *, enum node_stat_item);
+
+extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
+extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
+extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
@@ -213,16 +259,34 @@ static inline void __mod_zone_page_state(struct zone *zone,
zone_page_state_add(delta, zone, item);
}
+static inline void __mod_node_page_state(struct pglist_data *pgdat,
+ enum node_stat_item item, int delta)
+{
+ node_page_state_add(delta, pgdat, item);
+}
+
static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
atomic_long_inc(&zone->vm_stat[item]);
- atomic_long_inc(&vm_stat[item]);
+ atomic_long_inc(&vm_zone_stat[item]);
+}
+
+static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ atomic_long_inc(&pgdat->vm_stat[item]);
+ atomic_long_inc(&vm_node_stat[item]);
}
static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
atomic_long_dec(&zone->vm_stat[item]);
- atomic_long_dec(&vm_stat[item]);
+ atomic_long_dec(&vm_zone_stat[item]);
+}
+
+static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+ atomic_long_dec(&pgdat->vm_stat[item]);
+ atomic_long_dec(&vm_node_stat[item]);
}
static inline void __inc_zone_page_state(struct page *page,
@@ -231,12 +295,26 @@ static inline void __inc_zone_page_state(struct page *page,
__inc_zone_state(page_zone(page), item);
}
+static inline void __inc_node_page_state(struct page *page,
+ enum node_stat_item item)
+{
+ __inc_node_state(page_pgdat(page), item);
+}
+
+
static inline void __dec_zone_page_state(struct page *page,
enum zone_stat_item item)
{
__dec_zone_state(page_zone(page), item);
}
+static inline void __dec_node_page_state(struct page *page,
+ enum node_stat_item item)
+{
+ __dec_node_state(page_pgdat(page), item);
+}
+
+
/*
* We only use atomic operations to update counters. So there is no need to
* disable interrupts.
@@ -245,7 +323,12 @@ static inline void __dec_zone_page_state(struct page *page,
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state
+#define inc_node_page_state __inc_node_page_state
+#define dec_node_page_state __dec_node_page_state
+#define mod_node_page_state __mod_node_page_state
+
#define inc_zone_state __inc_zone_state
+#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state
#define set_pgdat_percpu_threshold(pgdat, callback) { }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 717e6149e753..fc1e16c25a29 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -320,7 +320,7 @@ void laptop_mode_timer_fn(unsigned long data);
static inline void laptop_sync_completion(void) { }
#endif
void throttle_vm_writeout(gfp_t gfp_mask);
-bool zone_dirty_ok(struct zone *zone);
+bool node_dirty_ok(struct pglist_data *pgdat);
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom);
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 36e2d6fb1360..c2ba402ab256 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -226,26 +226,26 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages,
TP_PROTO(
int order,
gfp_t gfp_mask,
- enum migrate_mode mode),
+ int prio),
- TP_ARGS(order, gfp_mask, mode),
+ TP_ARGS(order, gfp_mask, prio),
TP_STRUCT__entry(
__field(int, order)
__field(gfp_t, gfp_mask)
- __field(enum migrate_mode, mode)
+ __field(int, prio)
),
TP_fast_assign(
__entry->order = order;
__entry->gfp_mask = gfp_mask;
- __entry->mode = mode;
+ __entry->prio = prio;
),
- TP_printk("order=%d gfp_mask=0x%x mode=%d",
+ TP_printk("order=%d gfp_mask=0x%x priority=%d",
__entry->order,
__entry->gfp_mask,
- (int)__entry->mode)
+ __entry->prio)
);
DECLARE_EVENT_CLASS(mm_compaction_suitable_template,
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 43cedbf0c759..5a81ab48a2fb 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -11,6 +11,7 @@
#define __def_gfpflag_names \
{(unsigned long)GFP_TRANSHUGE, "GFP_TRANSHUGE"}, \
+ {(unsigned long)GFP_TRANSHUGE_LIGHT, "GFP_TRANSHUGE_LIGHT"}, \
{(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"},\
{(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \
{(unsigned long)GFP_USER, "GFP_USER"}, \
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 0101ef37f1ee..c88fd0934e7e 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -55,21 +55,23 @@ TRACE_EVENT(mm_vmscan_kswapd_sleep,
TRACE_EVENT(mm_vmscan_kswapd_wake,
- TP_PROTO(int nid, int order),
+ TP_PROTO(int nid, int zid, int order),
- TP_ARGS(nid, order),
+ TP_ARGS(nid, zid, order),
TP_STRUCT__entry(
__field( int, nid )
+ __field( int, zid )
__field( int, order )
),
TP_fast_assign(
__entry->nid = nid;
+ __entry->zid = zid;
__entry->order = order;
),
- TP_printk("nid=%d order=%d", __entry->nid, __entry->order)
+ TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order)
);
TRACE_EVENT(mm_vmscan_wakeup_kswapd,
@@ -98,47 +100,50 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd,
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+ TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
- TP_ARGS(order, may_writepage, gfp_flags),
+ TP_ARGS(order, may_writepage, gfp_flags, classzone_idx),
TP_STRUCT__entry(
__field( int, order )
__field( int, may_writepage )
__field( gfp_t, gfp_flags )
+ __field( int, classzone_idx )
),
TP_fast_assign(
__entry->order = order;
__entry->may_writepage = may_writepage;
__entry->gfp_flags = gfp_flags;
+ __entry->classzone_idx = classzone_idx;
),
- TP_printk("order=%d may_writepage=%d gfp_flags=%s",
+ TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d",
__entry->order,
__entry->may_writepage,
- show_gfp_flags(__entry->gfp_flags))
+ show_gfp_flags(__entry->gfp_flags),
+ __entry->classzone_idx)
);
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+ TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
- TP_ARGS(order, may_writepage, gfp_flags)
+ TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
);
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+ TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
- TP_ARGS(order, may_writepage, gfp_flags)
+ TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
);
DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin,
- TP_PROTO(int order, int may_writepage, gfp_t gfp_flags),
+ TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx),
- TP_ARGS(order, may_writepage, gfp_flags)
+ TP_ARGS(order, may_writepage, gfp_flags, classzone_idx)
);
DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_end_template,
@@ -266,16 +271,18 @@ TRACE_EVENT(mm_shrink_slab_end,
DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
- TP_PROTO(int order,
+ TP_PROTO(int classzone_idx,
+ int order,
unsigned long nr_requested,
unsigned long nr_scanned,
unsigned long nr_taken,
isolate_mode_t isolate_mode,
int file),
- TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
+ TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file),
TP_STRUCT__entry(
+ __field(int, classzone_idx)
__field(int, order)
__field(unsigned long, nr_requested)
__field(unsigned long, nr_scanned)
@@ -285,6 +292,7 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
),
TP_fast_assign(
+ __entry->classzone_idx = classzone_idx;
__entry->order = order;
__entry->nr_requested = nr_requested;
__entry->nr_scanned = nr_scanned;
@@ -293,8 +301,9 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
__entry->file = file;
),
- TP_printk("isolate_mode=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
+ TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_taken=%lu file=%d",
__entry->isolate_mode,
+ __entry->classzone_idx,
__entry->order,
__entry->nr_requested,
__entry->nr_scanned,
@@ -304,27 +313,29 @@ DECLARE_EVENT_CLASS(mm_vmscan_lru_isolate_template,
DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_lru_isolate,
- TP_PROTO(int order,
+ TP_PROTO(int classzone_idx,
+ int order,
unsigned long nr_requested,
unsigned long nr_scanned,
unsigned long nr_taken,
isolate_mode_t isolate_mode,
int file),
- TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
+ TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
);
DEFINE_EVENT(mm_vmscan_lru_isolate_template, mm_vmscan_memcg_isolate,
- TP_PROTO(int order,
+ TP_PROTO(int classzone_idx,
+ int order,
unsigned long nr_requested,
unsigned long nr_scanned,
unsigned long nr_taken,
isolate_mode_t isolate_mode,
int file),
- TP_ARGS(order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
+ TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_taken, isolate_mode, file)
);
@@ -352,15 +363,14 @@ TRACE_EVENT(mm_vmscan_writepage,
TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
- TP_PROTO(struct zone *zone,
+ TP_PROTO(int nid,
unsigned long nr_scanned, unsigned long nr_reclaimed,
int priority, int file),
- TP_ARGS(zone, nr_scanned, nr_reclaimed, priority, file),
+ TP_ARGS(nid, nr_scanned, nr_reclaimed, priority, file),
TP_STRUCT__entry(
__field(int, nid)
- __field(int, zid)
__field(unsigned long, nr_scanned)
__field(unsigned long, nr_reclaimed)
__field(int, priority)
@@ -368,16 +378,15 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive,
),
TP_fast_assign(
- __entry->nid = zone_to_nid(zone);
- __entry->zid = zone_idx(zone);
+ __entry->nid = nid;
__entry->nr_scanned = nr_scanned;
__entry->nr_reclaimed = nr_reclaimed;
__entry->priority = priority;
__entry->reclaim_flags = trace_shrink_flags(file);
),
- TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
- __entry->nid, __entry->zid,
+ TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s",
+ __entry->nid,
__entry->nr_scanned, __entry->nr_reclaimed,
__entry->priority,
show_reclaim_flags(__entry->reclaim_flags))
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 531f5811ff6b..2ccd9ccbf9ef 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -412,11 +412,11 @@ TRACE_EVENT(global_dirty_state,
),
TP_fast_assign(
- __entry->nr_dirty = global_page_state(NR_FILE_DIRTY);
- __entry->nr_writeback = global_page_state(NR_WRITEBACK);
- __entry->nr_unstable = global_page_state(NR_UNSTABLE_NFS);
- __entry->nr_dirtied = global_page_state(NR_DIRTIED);
- __entry->nr_written = global_page_state(NR_WRITTEN);
+ __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY);
+ __entry->nr_writeback = global_node_page_state(NR_WRITEBACK);
+ __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS);
+ __entry->nr_dirtied = global_node_page_state(NR_DIRTIED);
+ __entry->nr_written = global_node_page_state(NR_WRITTEN);
__entry->background_thresh = background_thresh;
__entry->dirty_thresh = dirty_thresh;
__entry->dirty_limit = global_wb_domain.dirty_limit;