aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig8
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c157
-rw-r--r--mm/cma.c40
-rw-r--r--mm/cma.h7
-rw-r--r--mm/cma_debug.c7
-rw-r--r--mm/compaction.c187
-rw-r--r--mm/debug.c83
-rw-r--r--mm/debug_vm_pgtable.c668
-rw-r--r--mm/filemap.c229
-rw-r--r--mm/frontswap.c2
-rw-r--r--mm/gup.c184
-rw-r--r--mm/hmm.c21
-rw-r--r--mm/huge_memory.c35
-rw-r--r--mm/hugetlb.c116
-rw-r--r--mm/internal.h9
-rw-r--r--mm/ioremap.c289
-rw-r--r--mm/kasan/Makefile2
-rw-r--r--mm/kasan/common.c41
-rw-r--r--mm/kasan/generic.c43
-rw-r--r--mm/kasan/generic_report.c1
-rw-r--r--mm/kasan/kasan.h23
-rw-r--r--mm/kasan/quarantine.c1
-rw-r--r--mm/kasan/report.c54
-rw-r--r--mm/kasan/tags.c37
-rw-r--r--mm/khugepaged.c77
-rw-r--r--mm/ksm.c9
-rw-r--r--mm/list_lru.c6
-rw-r--r--mm/maccess.c22
-rw-r--r--mm/memblock.c57
-rw-r--r--mm/memcontrol.c794
-rw-r--r--mm/memory-failure.c7
-rw-r--r--mm/memory.c104
-rw-r--r--mm/memory_hotplug.c41
-rw-r--r--mm/mempolicy.c41
-rw-r--r--mm/migrate.c139
-rw-r--r--mm/mm_init.c22
-rw-r--r--mm/mmap.c46
-rw-r--r--mm/mmu_notifier.c9
-rw-r--r--mm/mremap.c17
-rw-r--r--mm/nommu.c10
-rw-r--r--mm/oom_kill.c26
-rw-r--r--mm/page-writeback.c6
-rw-r--r--mm/page_alloc.c236
-rw-r--r--mm/page_counter.c6
-rw-r--r--mm/page_io.c19
-rw-r--r--mm/page_isolation.c5
-rw-r--r--mm/percpu-internal.h55
-rw-r--r--mm/percpu-km.c5
-rw-r--r--mm/percpu-stats.c36
-rw-r--r--mm/percpu-vm.c5
-rw-r--r--mm/percpu.c210
-rw-r--r--mm/pgalloc-track.h51
-rw-r--r--mm/process_vm_access.c2
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/shmem.c134
-rw-r--r--mm/shuffle.c46
-rw-r--r--mm/shuffle.h17
-rw-r--r--mm/slab.c103
-rw-r--r--mm/slab.h395
-rw-r--r--mm/slab_common.c705
-rw-r--r--mm/slob.c12
-rw-r--r--mm/slub.c592
-rw-r--r--mm/sparse-vmemmap.c56
-rw-r--r--mm/sparse.c31
-rw-r--r--mm/swap.c17
-rw-r--r--mm/swap_slots.c45
-rw-r--r--mm/swap_state.c80
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/usercopy.c2
-rw-r--r--mm/userfaultfd.c2
-rw-r--r--mm/util.c52
-rw-r--r--mm/vmalloc.c176
-rw-r--r--mm/vmscan.c69
-rw-r--r--mm/vmstat.c68
-rw-r--r--mm/workingset.c29
-rw-r--r--mm/zpool.c8
-rw-r--r--mm/zsmalloc.c2
78 files changed, 3973 insertions, 2985 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f2104cc0d35c..6c974888f86f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -88,13 +88,9 @@ config NEED_MULTIPLE_NODES
def_bool y
depends on DISCONTIGMEM || NUMA
-config HAVE_MEMORY_PRESENT
- def_bool y
- depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
-
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
-# allocations when memory_present() is called. If this cannot
+# allocations when sparse_init() is called. If this cannot
# be done on your architecture, select this option. However,
# statically allocating the mem_section[] array can potentially
# consume vast quantities of .bss, so be careful.
@@ -387,7 +383,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
This option specifies the initial value of this option. The default
of 1 says that all excess pages should be trimmed.
- See Documentation/nommu-mmap.txt for more information.
+ See Documentation/mm/nommu-mmap.rst for more information.
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
diff --git a/mm/Makefile b/mm/Makefile
index 6e9d46b2efc9..d5649f1c12c0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -38,7 +38,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o
+ pgtable-generic.o rmap.o vmalloc.o ioremap.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d382272bcc31..8e8b00627bb2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -281,7 +281,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
#define INIT_BW (100 << (20 - PAGE_SHIFT))
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
- int blkcg_id, gfp_t gfp)
+ gfp_t gfp)
{
int i, err;
@@ -308,15 +308,9 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
wb->dirty_sleep = jiffies;
- wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested) {
- err = -ENOMEM;
- goto out_put_bdi;
- }
-
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
- goto out_put_cong;
+ goto out_put_bdi;
for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
err = percpu_counter_init(&wb->stat[i], 0, gfp);
@@ -330,8 +324,6 @@ out_destroy_stat:
while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
-out_put_cong:
- wb_congested_put(wb->congested);
out_put_bdi:
if (wb != &bdi->wb)
bdi_put(bdi);
@@ -374,7 +366,6 @@ static void wb_exit(struct bdi_writeback *wb)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
- wb_congested_put(wb->congested);
if (wb != &wb->bdi->wb)
bdi_put(wb->bdi);
}
@@ -384,99 +375,12 @@ static void wb_exit(struct bdi_writeback *wb)
#include <linux/memcontrol.h>
/*
- * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
- * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected.
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
+ * bdi->cgwb_tree is also RCU protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;
-/**
- * wb_congested_get_create - get or create a wb_congested
- * @bdi: associated bdi
- * @blkcg_id: ID of the associated blkcg
- * @gfp: allocation mask
- *
- * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one.
- * The returned wb_congested has its reference count incremented. Returns
- * NULL on failure.
- */
-struct bdi_writeback_congested *
-wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
-{
- struct bdi_writeback_congested *new_congested = NULL, *congested;
- struct rb_node **node, *parent;
- unsigned long flags;
-retry:
- spin_lock_irqsave(&cgwb_lock, flags);
-
- node = &bdi->cgwb_congested_tree.rb_node;
- parent = NULL;
-
- while (*node != NULL) {
- parent = *node;
- congested = rb_entry(parent, struct bdi_writeback_congested,
- rb_node);
- if (congested->blkcg_id < blkcg_id)
- node = &parent->rb_left;
- else if (congested->blkcg_id > blkcg_id)
- node = &parent->rb_right;
- else
- goto found;
- }
-
- if (new_congested) {
- /* !found and storage for new one already allocated, insert */
- congested = new_congested;
- rb_link_node(&congested->rb_node, parent, node);
- rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
- spin_unlock_irqrestore(&cgwb_lock, flags);
- return congested;
- }
-
- spin_unlock_irqrestore(&cgwb_lock, flags);
-
- /* allocate storage for new one and retry */
- new_congested = kzalloc(sizeof(*new_congested), gfp);
- if (!new_congested)
- return NULL;
-
- refcount_set(&new_congested->refcnt, 1);
- new_congested->__bdi = bdi;
- new_congested->blkcg_id = blkcg_id;
- goto retry;
-
-found:
- refcount_inc(&congested->refcnt);
- spin_unlock_irqrestore(&cgwb_lock, flags);
- kfree(new_congested);
- return congested;
-}
-
-/**
- * wb_congested_put - put a wb_congested
- * @congested: wb_congested to put
- *
- * Put @congested and destroy it if the refcnt reaches zero.
- */
-void wb_congested_put(struct bdi_writeback_congested *congested)
-{
- unsigned long flags;
-
- if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags))
- return;
-
- /* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->__bdi) {
- rb_erase(&congested->rb_node,
- &congested->__bdi->cgwb_congested_tree);
- congested->__bdi = NULL;
- }
-
- spin_unlock_irqrestore(&cgwb_lock, flags);
- kfree(congested);
-}
-
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
@@ -558,7 +462,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
goto out_put;
}
- ret = wb_init(wb, bdi, blkcg_css->id, gfp);
+ ret = wb_init(wb, bdi, gfp);
if (ret)
goto err_free;
@@ -696,11 +600,10 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
int ret;
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
- bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
init_rwsem(&bdi->wb_switch_rwsem);
- ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+ ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
@@ -769,21 +672,6 @@ void wb_blkcg_offline(struct blkcg *blkcg)
spin_unlock_irq(&cgwb_lock);
}
-static void cgwb_bdi_exit(struct backing_dev_info *bdi)
-{
- struct rb_node *rbn;
-
- spin_lock_irq(&cgwb_lock);
- while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
- struct bdi_writeback_congested *congested =
- rb_entry(rbn, struct bdi_writeback_congested, rb_node);
-
- rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->__bdi = NULL; /* mark @congested unlinked */
- }
- spin_unlock_irq(&cgwb_lock);
-}
-
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
spin_lock_irq(&cgwb_lock);
@@ -810,29 +698,11 @@ subsys_initcall(cgwb_init);
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
- int err;
-
- bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
- if (!bdi->wb_congested)
- return -ENOMEM;
-
- refcount_set(&bdi->wb_congested->refcnt, 1);
-
- err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
- if (err) {
- wb_congested_put(bdi->wb_congested);
- return err;
- }
- return 0;
+ return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}
static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
-static void cgwb_bdi_exit(struct backing_dev_info *bdi)
-{
- wb_congested_put(bdi->wb_congested);
-}
-
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
@@ -1023,7 +893,6 @@ static void release_bdi(struct kref *ref)
bdi_unregister(bdi);
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
- cgwb_bdi_exit(bdi);
kfree(bdi);
}
@@ -1047,29 +916,29 @@ static wait_queue_head_t congestion_wqh[2] = {
};
static atomic_t nr_wb_congested[2];
-void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
wait_queue_head_t *wqh = &congestion_wqh[sync];
enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
- if (test_and_clear_bit(bit, &congested->state))
+ if (test_and_clear_bit(bit, &bdi->wb.congested))
atomic_dec(&nr_wb_congested[sync]);
smp_mb__after_atomic();
if (waitqueue_active(wqh))
wake_up(wqh);
}
-EXPORT_SYMBOL(clear_wb_congested);
+EXPORT_SYMBOL(clear_bdi_congested);
-void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
+void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
- if (!test_and_set_bit(bit, &congested->state))
+ if (!test_and_set_bit(bit, &bdi->wb.congested))
atomic_inc(&nr_wb_congested[sync]);
}
-EXPORT_SYMBOL(set_wb_congested);
+EXPORT_SYMBOL(set_bdi_congested);
/**
* congestion_wait - wait for a backing_dev to become uncongested
diff --git a/mm/cma.c b/mm/cma.c
index 26ecff818881..7f415d7cda9f 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -52,7 +52,7 @@ unsigned long cma_get_size(const struct cma *cma)
const char *cma_get_name(const struct cma *cma)
{
- return cma->name ? cma->name : "(undefined)";
+ return cma->name;
}
static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
@@ -93,17 +93,15 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
mutex_unlock(&cma->lock);
}
-static int __init cma_activate_area(struct cma *cma)
+static void __init cma_activate_area(struct cma *cma)
{
unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
unsigned i = cma->count >> pageblock_order;
struct zone *zone;
cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
- if (!cma->bitmap) {
- cma->count = 0;
- return -ENOMEM;
- }
+ if (!cma->bitmap)
+ goto out_error;
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -133,25 +131,22 @@ static int __init cma_activate_area(struct cma *cma)
spin_lock_init(&cma->mem_head_lock);
#endif
- return 0;
+ return;
not_in_zone:
- pr_err("CMA area %s could not be activated\n", cma->name);
bitmap_free(cma->bitmap);
+out_error:
cma->count = 0;
- return -EINVAL;
+ pr_err("CMA area %s could not be activated\n", cma->name);
+ return;
}
static int __init cma_init_reserved_areas(void)
{
int i;
- for (i = 0; i < cma_area_count; i++) {
- int ret = cma_activate_area(&cma_areas[i]);
-
- if (ret)
- return ret;
- }
+ for (i = 0; i < cma_area_count; i++)
+ cma_activate_area(&cma_areas[i]);
return 0;
}
@@ -202,13 +197,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
* subsystems (like slab allocator) are available.
*/
cma = &cma_areas[cma_area_count];
- if (name) {
- cma->name = name;
- } else {
- cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count);
- if (!cma->name)
- return -ENOMEM;
- }
+
+ if (name)
+ snprintf(cma->name, CMA_MAX_NAME, name);
+ else
+ snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+
cma->base_pfn = PFN_DOWN(base);
cma->count = size >> PAGE_SHIFT;
cma->order_per_bit = order_per_bit;
@@ -425,7 +419,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
struct page *page = NULL;
int ret = -ENOMEM;
- if (!cma || !cma->count)
+ if (!cma || !cma->count || !cma->bitmap)
return NULL;
pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
diff --git a/mm/cma.h b/mm/cma.h
index 33c0b517733c..20f6e24bc477 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -2,6 +2,10 @@
#ifndef __MM_CMA_H__
#define __MM_CMA_H__
+#include <linux/debugfs.h>
+
+#define CMA_MAX_NAME 64
+
struct cma {
unsigned long base_pfn;
unsigned long count;
@@ -11,8 +15,9 @@ struct cma {
#ifdef CONFIG_CMA_DEBUGFS
struct hlist_head mem_head;
spinlock_t mem_head_lock;
+ struct debugfs_u32_array dfs_bitmap;
#endif
- const char *name;
+ char name[CMA_MAX_NAME];
};
extern struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 4e6cbe2f586e..d5bf8aa34fdc 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -164,7 +164,6 @@ static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
char name[16];
- int u32s;
scnprintf(name, sizeof(name), "cma-%s", cma->name);
@@ -180,8 +179,10 @@ static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops);
debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops);
- u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
- debugfs_create_u32_array("bitmap", 0444, tmp, (u32 *)cma->bitmap, u32s);
+ cma->dfs_bitmap.array = (u32 *)cma->bitmap;
+ cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma),
+ BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", 0444, tmp, &cma->dfs_bitmap);
}
static int __init cma_debugfs_init(void)
diff --git a/mm/compaction.c b/mm/compaction.c
index 86375605faa9..b89581bf859c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
+
+/*
+ * Page order with-respect-to which proactive compaction
+ * calculates external fragmentation, which is used as
+ * the "fragmentation score" of a node/zone.
+ */
+#if defined CONFIG_TRANSPARENT_HUGEPAGE
+#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER
+#elif defined CONFIG_HUGETLBFS
+#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER
+#else
+#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#endif
+
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
@@ -136,7 +154,7 @@ EXPORT_SYMBOL(__ClearPageMovable);
/*
* Compaction is deferred when compaction fails to result in a page
- * allocation success. 1 << compact_defer_limit compactions are skipped up
+ * allocation success. 1 << compact_defer_shift, compactions are skipped up
* to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
*/
void defer_compaction(struct zone *zone, int order)
@@ -1459,7 +1477,7 @@ static void isolate_freepages(struct compact_control *cc)
* this pfn aligned down to the pageblock boundary, because we do
* block_start_pfn -= pageblock_nr_pages in the for loop.
* For ending point, take care when isolating in last pageblock of a
- * a zone which ends in the middle of a pageblock.
+ * zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
@@ -1857,6 +1875,76 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
+static bool kswapd_is_running(pg_data_t *pgdat)
+{
+ return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+}
+
+/*
+ * A zone's fragmentation score is the external fragmentation wrt to the
+ * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value
+ * in the range [0, 100].
+ *
+ * The scaling factor ensures that proactive compaction focuses on larger
+ * zones like ZONE_NORMAL, rather than smaller, specialized zones like
+ * ZONE_DMA32. For smaller zones, the score value remains close to zero,
+ * and thus never exceeds the high threshold for proactive compaction.
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+ unsigned long score;
+
+ score = zone->present_pages *
+ extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+ return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
+}
+
+/*
+ * The per-node proactive (background) compaction process is started by its
+ * corresponding kcompactd thread when the node's fragmentation score
+ * exceeds the high threshold. The compaction process remains active till
+ * the node's score falls below the low threshold, or one of the back-off
+ * conditions is met.
+ */
+static unsigned int fragmentation_score_node(pg_data_t *pgdat)
+{
+ unsigned int score = 0;
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone;
+
+ zone = &pgdat->node_zones[zoneid];
+ score += fragmentation_score_zone(zone);
+ }
+
+ return score;
+}
+
+static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+{
+ unsigned int wmark_low;
+
+ /*
+ * Cap the low watermak to avoid excessive compaction
+ * activity in case a user sets the proactivess tunable
+ * close to 100 (maximum).
+ */
+ wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
+ return low ? wmark_low : min(wmark_low + 10, 100U);
+}
+
+static bool should_proactive_compact_node(pg_data_t *pgdat)
+{
+ int wmark_high;
+
+ if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
+ return false;
+
+ wmark_high = fragmentation_score_wmark(pgdat, false);
+ return fragmentation_score_node(pgdat) > wmark_high;
+}
+
static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
@@ -1883,6 +1971,25 @@ static enum compact_result __compact_finished(struct compact_control *cc)
return COMPACT_PARTIAL_SKIPPED;
}
+ if (cc->proactive_compaction) {
+ int score, wmark_low;
+ pg_data_t *pgdat;
+
+ pgdat = cc->zone->zone_pgdat;
+ if (kswapd_is_running(pgdat))
+ return COMPACT_PARTIAL_SKIPPED;
+
+ score = fragmentation_score_zone(cc->zone);
+ wmark_low = fragmentation_score_wmark(pgdat, true);
+
+ if (score > wmark_low)
+ ret = COMPACT_CONTINUE;
+ else
+ ret = COMPACT_SUCCESS;
+
+ goto out;
+ }
+
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
@@ -1941,6 +2048,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
}
}
+out:
if (cc->contended || fatal_signal_pending(current))
ret = COMPACT_CONTENDED;
@@ -2421,6 +2529,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
return rc;
}
+/*
+ * Compact all zones within a node till each zone's fragmentation score
+ * reaches within proactive compaction thresholds (as determined by the
+ * proactiveness tunable).
+ *
+ * It is possible that the function returns before reaching score targets
+ * due to various back-off conditions, such as, contention on per-node or
+ * per-zone locks.
+ */
+static void proactive_compact_node(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = -1,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+ .whole_zone = true,
+ .gfp_mask = GFP_KERNEL,
+ .proactive_compaction = true,
+ };
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ cc.zone = zone;
+
+ compact_zone(&cc, NULL);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+}
/* Compact all zones within a node */
static void compact_node(int nid)
@@ -2468,6 +2611,13 @@ static void compact_nodes(void)
int sysctl_compact_memory;
/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+
+/*
* This is the entry point for compacting all nodes via
* /proc/sys/vm/compact_memory
*/
@@ -2646,6 +2796,7 @@ static int kcompactd(void *p)
{
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
+ unsigned int proactive_defer = 0;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -2661,12 +2812,34 @@ static int kcompactd(void *p)
unsigned long pflags;
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
- wait_event_freezable(pgdat->kcompactd_wait,
- kcompactd_work_requested(pgdat));
+ if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat),
+ msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+
+ psi_memstall_enter(&pflags);
+ kcompactd_do_work(pgdat);
+ psi_memstall_leave(&pflags);
+ continue;
+ }
- psi_memstall_enter(&pflags);
- kcompactd_do_work(pgdat);
- psi_memstall_leave(&pflags);
+ /* kcompactd wait timeout */
+ if (should_proactive_compact_node(pgdat)) {
+ unsigned int prev_score, score;
+
+ if (proactive_defer) {
+ proactive_defer--;
+ continue;
+ }
+ prev_score = fragmentation_score_node(pgdat);
+ proactive_compact_node(pgdat);
+ score = fragmentation_score_node(pgdat);
+ /*
+ * Defer proactive compaction if the fragmentation
+ * score did not go down i.e. no progress made.
+ */
+ proactive_defer = score < prev_score ?
+ 0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+ }
}
return 0;
diff --git a/mm/debug.c b/mm/debug.c
index 4f376514744d..ca8d1cacdecc 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -69,8 +69,19 @@ void __dump_page(struct page *page, const char *reason)
}
if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
- /* Corrupt page, cannot call page_mapping */
- mapping = page->mapping;
+ /*
+ * Corrupt page, so we cannot call page_mapping. Instead, do a
+ * safe subset of the steps that page_mapping() does. Caution:
+ * this will be misleading for tail pages, PageSwapCache pages,
+ * and potentially other situations. (See the page_mapping()
+ * implementation for what's missing here.)
+ */
+ unsigned long tmp = (unsigned long)page->mapping;
+
+ if (tmp & PAGE_MAPPING_ANON)
+ mapping = NULL;
+ else
+ mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
head = page;
compound = false;
} else {
@@ -84,86 +95,76 @@ void __dump_page(struct page *page, const char *reason)
*/
mapcount = PageSlab(head) ? 0 : page_mapcount(page);
- if (compound)
+ pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
+ page, page_ref_count(head), mapcount, mapping,
+ page_to_pgoff(page), page_to_pfn(page));
+ if (compound) {
if (hpage_pincount_available(page)) {
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
- "index:%#lx head:%px order:%u "
- "compound_mapcount:%d compound_pincount:%d\n",
- page, page_ref_count(head), mapcount,
- mapping, page_to_pgoff(page), head,
- compound_order(head), compound_mapcount(page),
- compound_pincount(page));
+ pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
+ head, compound_order(head),
+ head_mapcount(head),
+ head_pincount(head));
} else {
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
- "index:%#lx head:%px order:%u "
- "compound_mapcount:%d\n",
- page, page_ref_count(head), mapcount,
- mapping, page_to_pgoff(page), head,
- compound_order(head), compound_mapcount(page));
+ pr_warn("head:%p order:%u compound_mapcount:%d\n",
+ head, compound_order(head),
+ head_mapcount(head));
}
- else
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p index:%#lx\n",
- page, page_ref_count(page), mapcount,
- mapping, page_to_pgoff(page));
+ }
if (PageKsm(page))
type = "ksm ";
else if (PageAnon(page))
type = "anon ";
else if (mapping) {
- const struct inode *host;
+ struct inode *host;
const struct address_space_operations *a_ops;
- const struct hlist_node *dentry_first;
- const struct dentry *dentry_ptr;
+ struct hlist_node *dentry_first;
+ struct dentry *dentry_ptr;
struct dentry dentry;
/*
* mapping can be invalid pointer and we don't want to crash
* accessing it, so probe everything depending on it carefully
*/
- if (copy_from_kernel_nofault(&host, &mapping->host,
- sizeof(struct inode *)) ||
- copy_from_kernel_nofault(&a_ops, &mapping->a_ops,
- sizeof(struct address_space_operations *))) {
- pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n");
+ if (get_kernel_nofault(host, &mapping->host) ||
+ get_kernel_nofault(a_ops, &mapping->a_ops)) {
+ pr_warn("failed to read mapping contents, not a valid kernel address?\n");
goto out_mapping;
}
if (!host) {
- pr_warn("mapping->a_ops:%ps\n", a_ops);
+ pr_warn("aops:%ps\n", a_ops);
goto out_mapping;
}
- if (copy_from_kernel_nofault(&dentry_first,
- &host->i_dentry.first, sizeof(struct hlist_node *))) {
- pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n",
- a_ops, host);
+ if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) {
+ pr_warn("aops:%ps with invalid host inode %px\n",
+ a_ops, host);
goto out_mapping;
}
if (!dentry_first) {
- pr_warn("mapping->a_ops:%ps\n", a_ops);
+ pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino);
goto out_mapping;
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
- if (copy_from_kernel_nofault(&dentry, dentry_ptr,
- sizeof(struct dentry))) {
- pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n",
- a_ops, dentry_ptr);
+ if (get_kernel_nofault(dentry, dentry_ptr)) {
+ pr_warn("aops:%ps with invalid dentry %px\n", a_ops,
+ dentry_ptr);
} else {
/*
* if dentry is corrupted, the %pd handler may still
* crash, but it's unlikely that we reach here with a
* corrupted struct page
*/
- pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n",
- a_ops, &dentry);
+ pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
+ a_ops, host->i_ino, &dentry);
}
}
out_mapping:
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
+ pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
page_cma ? " CMA" : "");
hex_only:
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 61ab16fb2e36..086309fb9b6f 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -8,7 +8,7 @@
*
* Author: Anshuman Khandual <anshuman.khandual@arm.com>
*/
-#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__
+#define pr_fmt(fmt) "debug_vm_pgtable: [%-25s]: " fmt, __func__
#include <linux/gfp.h>
#include <linux/highmem.h>
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/pfn_t.h>
#include <linux/printk.h>
+#include <linux/pgtable.h>
#include <linux/random.h>
#include <linux/spinlock.h>
#include <linux/swap.h>
@@ -28,6 +29,13 @@
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * expectations that are being validated here. All future changes in here
+ * or the documentation need to be in sync.
+ */
#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
@@ -46,6 +54,7 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
{
pte_t pte = pfn_pte(pfn, prot);
+ pr_debug("Validating PTE basic\n");
WARN_ON(!pte_same(pte, pte));
WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
@@ -55,6 +64,57 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
}
+static void __init pte_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pte_t *ptep,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ pr_debug("Validating PTE advanced\n");
+ pte = pfn_pte(pfn, prot);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_set_wrprotect(mm, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(pte_write(pte));
+
+ pte = pfn_pte(pfn, prot);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_get_and_clear(mm, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(!pte_none(pte));
+
+ pte = pfn_pte(pfn, prot);
+ pte = pte_wrprotect(pte);
+ pte = pte_mkclean(pte);
+ set_pte_at(mm, vaddr, ptep, pte);
+ pte = pte_mkwrite(pte);
+ pte = pte_mkdirty(pte);
+ ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+ pte = ptep_get(ptep);
+ WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
+
+ pte = pfn_pte(pfn, prot);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_get_and_clear_full(mm, vaddr, ptep, 1);
+ pte = ptep_get(ptep);
+ WARN_ON(!pte_none(pte));
+
+ pte = pte_mkyoung(pte);
+ set_pte_at(mm, vaddr, ptep, pte);
+ ptep_test_and_clear_young(vma, vaddr, ptep);
+ pte = ptep_get(ptep);
+ WARN_ON(pte_young(pte));
+}
+
+static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ pr_debug("Validating PTE saved write\n");
+ WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
+ WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
+}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -63,6 +123,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
if (!has_transparent_hugepage())
return;
+ pr_debug("Validating PMD basic\n");
WARN_ON(!pmd_same(pmd, pmd));
WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
@@ -77,6 +138,95 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
}
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pmd_t *pmdp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD advanced\n");
+ /* Align the address wrt HPAGE_PMD_SIZE */
+ vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+
+ pmd = pfn_pmd(pfn, prot);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_set_wrprotect(mm, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_write(pmd));
+
+ pmd = pfn_pmd(pfn, prot);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+
+ pmd = pfn_pmd(pfn, prot);
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_mkclean(pmd);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmd = pmd_mkwrite(pmd);
+ pmd = pmd_mkdirty(pmd);
+ pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
+
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+
+ pmd = pmd_mkyoung(pmd);
+ set_pmd_at(mm, vaddr, pmdp, pmd);
+ pmdp_test_and_clear_young(vma, vaddr, pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_young(pmd));
+}
+
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ pr_debug("Validating PMD leaf\n");
+ /*
+ * PMD based THP is a leaf entry.
+ */
+ pmd = pmd_mkhuge(pmd);
+ WARN_ON(!pmd_leaf(pmd));
+}
+
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ return;
+
+ pr_debug("Validating PMD huge\n");
+ /*
+ * X86 defined pmd_set_huge() verifies that the given
+ * PMD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pmdp, __pmd(0));
+ WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pmd_clear_huge(pmdp));
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+}
+
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ pr_debug("Validating PMD saved write\n");
+ WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
+ WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
+}
+
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -85,6 +235,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
if (!has_transparent_hugepage())
return;
+ pr_debug("Validating PUD basic\n");
WARN_ON(!pud_same(pud, pud));
WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
@@ -100,18 +251,130 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
*/
WARN_ON(!pud_bad(pud_mkhuge(pud)));
}
+
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+ pud_t pud = pfn_pud(pfn, prot);
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD advanced\n");
+ /* Align the address wrt HPAGE_PUD_SIZE */
+ vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_set_wrprotect(mm, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_write(pud));
+
+#ifndef __PAGETABLE_PMD_FOLDED
+ pud = pfn_pud(pfn, prot);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_huge_get_and_clear(mm, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+
+ pud = pfn_pud(pfn, prot);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+ pud = pfn_pud(pfn, prot);
+ pud = pud_wrprotect(pud);
+ pud = pud_mkclean(pud);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pud = pud_mkwrite(pud);
+ pud = pud_mkdirty(pud);
+ pudp_set_access_flags(vma, vaddr, pudp, pud, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+
+ pud = pud_mkyoung(pud);
+ set_pud_at(mm, vaddr, pudp, pud);
+ pudp_test_and_clear_young(vma, vaddr, pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_young(pud));
+}
+
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud = pfn_pud(pfn, prot);
+
+ pr_debug("Validating PUD leaf\n");
+ /*
+ * PUD based THP is a leaf entry.
+ */
+ pud = pud_mkhuge(pud);
+ WARN_ON(!pud_leaf(pud));
+}
+
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ return;
+
+ pr_debug("Validating PUD huge\n");
+ /*
+ * X86 defined pud_set_huge() verifies that the given
+ * PUD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pudp, __pud(0));
+ WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pud_clear_huge(pudp));
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+}
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+}
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pmd_t *pmdp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+}
+static void __init pud_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma, pud_t *pudp,
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
+{
+}
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
{
p4d_t p4d;
+ pr_debug("Validating P4D basic\n");
memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t));
WARN_ON(!p4d_same(p4d, p4d));
}
@@ -120,6 +383,7 @@ static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
{
pgd_t pgd;
+ pr_debug("Validating PGD basic\n");
memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t));
WARN_ON(!pgd_same(pgd, pgd));
}
@@ -132,6 +396,7 @@ static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
if (mm_pmd_folded(mm))
return;
+ pr_debug("Validating PUD clear\n");
pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
WRITE_ONCE(*pudp, pud);
pud_clear(pudp);
@@ -146,6 +411,8 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
if (mm_pmd_folded(mm))
return;
+
+ pr_debug("Validating PUD populate\n");
/*
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
@@ -172,6 +439,7 @@ static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
if (mm_pud_folded(mm))
return;
+ pr_debug("Validating P4D clear\n");
p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
WRITE_ONCE(*p4dp, p4d);
p4d_clear(p4dp);
@@ -187,6 +455,7 @@ static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
if (mm_pud_folded(mm))
return;
+ pr_debug("Validating P4D populate\n");
/*
* This entry points to next level page table page.
* Hence this must not qualify as p4d_bad().
@@ -205,6 +474,7 @@ static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
if (mm_p4d_folded(mm))
return;
+ pr_debug("Validating PGD clear\n");
pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
WRITE_ONCE(*pgdp, pgd);
pgd_clear(pgdp);
@@ -220,6 +490,7 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
if (mm_p4d_folded(mm))
return;
+ pr_debug("Validating PGD populate\n");
/*
* This entry points to next level page table page.
* Hence this must not qualify as pgd_bad().
@@ -248,6 +519,7 @@ static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
{
pte_t pte = ptep_get(ptep);
+ pr_debug("Validating PTE clear\n");
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
set_pte_at(mm, vaddr, ptep, pte);
barrier();
@@ -260,6 +532,7 @@ static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
{
pmd_t pmd = READ_ONCE(*pmdp);
+ pr_debug("Validating PMD clear\n");
pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
WRITE_ONCE(*pmdp, pmd);
pmd_clear(pmdp);
@@ -272,6 +545,7 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
{
pmd_t pmd;
+ pr_debug("Validating PMD populate\n");
/*
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
@@ -282,6 +556,344 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
WARN_ON(pmd_bad(pmd));
}
+static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
+ return;
+
+ pr_debug("Validating PTE special\n");
+ WARN_ON(!pte_special(pte_mkspecial(pte)));
+}
+
+static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
+ pr_debug("Validating PTE protnone\n");
+ WARN_ON(!pte_protnone(pte));
+ WARN_ON(!pte_present(pte));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
+ pr_debug("Validating PMD protnone\n");
+ WARN_ON(!pmd_protnone(pmd));
+ WARN_ON(!pmd_present(pmd));
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ pr_debug("Validating PTE devmap\n");
+ WARN_ON(!pte_devmap(pte_mkdevmap(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ pr_debug("Validating PMD devmap\n");
+ WARN_ON(!pmd_devmap(pmd_mkdevmap(pmd)));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud = pfn_pud(pfn, prot);
+
+ pr_debug("Validating PUD devmap\n");
+ WARN_ON(!pud_devmap(pud_mkdevmap(pud)));
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ pr_debug("Validating PTE soft dirty\n");
+ WARN_ON(!pte_soft_dirty(pte_mksoft_dirty(pte)));
+ WARN_ON(pte_soft_dirty(pte_clear_soft_dirty(pte)));
+}
+
+static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ pr_debug("Validating PTE swap soft dirty\n");
+ WARN_ON(!pte_swp_soft_dirty(pte_swp_mksoft_dirty(pte)));
+ WARN_ON(pte_swp_soft_dirty(pte_swp_clear_soft_dirty(pte)));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return;
+
+ pr_debug("Validating PMD soft dirty\n");
+ WARN_ON(!pmd_soft_dirty(pmd_mksoft_dirty(pmd)));
+ WARN_ON(pmd_soft_dirty(pmd_clear_soft_dirty(pmd)));
+}
+
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) ||
+ !IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION))
+ return;
+
+ pr_debug("Validating PMD swap soft dirty\n");
+ WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
+ WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
+}
+#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+ swp_entry_t swp;
+ pte_t pte;
+
+ pr_debug("Validating PTE swap\n");
+ pte = pfn_pte(pfn, prot);
+ swp = __pte_to_swp_entry(pte);
+ pte = __swp_entry_to_pte(swp);
+ WARN_ON(pfn != pte_pfn(pte));
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+ swp_entry_t swp;
+ pmd_t pmd;
+
+ pr_debug("Validating PMD swap\n");
+ pmd = pfn_pmd(pfn, prot);
+ swp = __pmd_to_swp_entry(pmd);
+ pmd = __swp_entry_to_pmd(swp);
+ WARN_ON(pfn != pmd_pfn(pmd));
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+static void __init swap_migration_tests(void)
+{
+ struct page *page;
+ swp_entry_t swp;
+
+ if (!IS_ENABLED(CONFIG_MIGRATION))
+ return;
+
+ pr_debug("Validating swap migration\n");
+ /*
+ * swap_migration_tests() requires a dedicated page as it needs to
+ * be locked before creating a migration entry from it. Locking the
+ * page that actually maps kernel text ('start_kernel') can be real
+ * problematic. Lets allocate a dedicated page explicitly for this
+ * purpose that will be freed subsequently.
+ */
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ pr_err("page allocation failed\n");
+ return;
+ }
+
+ /*
+ * make_migration_entry() expects given page to be
+ * locked, otherwise it stumbles upon a BUG_ON().
+ */
+ __SetPageLocked(page);
+ swp = make_migration_entry(page, 1);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(!is_write_migration_entry(swp));
+
+ make_migration_entry_read(&swp);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(is_write_migration_entry(swp));
+
+ swp = make_migration_entry(page, 0);
+ WARN_ON(!is_migration_entry(swp));
+ WARN_ON(is_write_migration_entry(swp));
+ __ClearPageLocked(page);
+ __free_page(page);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ struct page *page;
+ pte_t pte;
+
+ pr_debug("Validating HugeTLB basic\n");
+ /*
+ * Accessing the page associated with the pfn is safe here,
+ * as it was previously derived from a real kernel symbol.
+ */
+ page = pfn_to_page(pfn);
+ pte = mk_huge_pte(page, prot);
+
+ WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(pte)));
+ WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(pte))));
+ WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(pte))));
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+ pte = pfn_pte(pfn, prot);
+
+ WARN_ON(!pte_huge(pte_mkhuge(pte)));
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+}
+
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pte_t *ptep, unsigned long pfn,
+ unsigned long vaddr, pgprot_t prot)
+{
+ struct page *page = pfn_to_page(pfn);
+ pte_t pte = ptep_get(ptep);
+ unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
+
+ pr_debug("Validating HugeTLB advanced\n");
+ pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
+ set_huge_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
+ huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
+ pte = huge_ptep_get(ptep);
+ WARN_ON(!huge_pte_none(pte));
+
+ pte = mk_huge_pte(page, prot);
+ set_huge_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ huge_ptep_set_wrprotect(mm, vaddr, ptep);
+ pte = huge_ptep_get(ptep);
+ WARN_ON(huge_pte_write(pte));
+
+ pte = mk_huge_pte(page, prot);
+ set_huge_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ huge_ptep_get_and_clear(mm, vaddr, ptep);
+ pte = huge_ptep_get(ptep);
+ WARN_ON(!huge_pte_none(pte));
+
+ pte = mk_huge_pte(page, prot);
+ pte = huge_pte_wrprotect(pte);
+ set_huge_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ pte = huge_pte_mkwrite(pte);
+ pte = huge_pte_mkdirty(pte);
+ huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+ pte = huge_ptep_get(ptep);
+ WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pte_t *ptep, unsigned long pfn,
+ unsigned long vaddr, pgprot_t prot)
+{
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PMD based THP\n");
+ /*
+ * pmd_trans_huge() and pmd_present() must return positive after
+ * MMU invalidation with pmd_mkinvalid(). This behavior is an
+ * optimization for transparent huge page. pmd_trans_huge() must
+ * be true if pmd_page() returns a valid THP to avoid taking the
+ * pmd_lock when others walk over non transhuge pmds (i.e. there
+ * are no THP allocated). Especially when splitting a THP and
+ * removing the present bit from the pmd, pmd_trans_huge() still
+ * needs to return true. pmd_present() should be true whenever
+ * pmd_trans_huge() returns true.
+ */
+ pmd = pfn_pmd(pfn, prot);
+ WARN_ON(!pmd_trans_huge(pmd_mkhuge(pmd)));
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+ WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(pmd))));
+ WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(pmd))));
+#endif /* __HAVE_ARCH_PMDP_INVALIDATE */
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud;
+
+ if (!has_transparent_hugepage())
+ return;
+
+ pr_debug("Validating PUD based THP\n");
+ pud = pfn_pud(pfn, prot);
+ WARN_ON(!pud_trans_huge(pud_mkhuge(pud)));
+
+ /*
+ * pud_mkinvalid() has been dropped for now. Enable back
+ * these tests when it comes back with a modified pud_present().
+ *
+ * WARN_ON(!pud_trans_huge(pud_mkinvalid(pud_mkhuge(pud))));
+ * WARN_ON(!pud_present(pud_mkinvalid(pud_mkhuge(pud))));
+ */
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static unsigned long __init get_random_vaddr(void)
{
unsigned long random_vaddr, random_pages, total_user_pages;
@@ -296,6 +908,7 @@ static unsigned long __init get_random_vaddr(void)
static int __init debug_vm_pgtable(void)
{
+ struct vm_area_struct *vma;
struct mm_struct *mm;
pgd_t *pgdp;
p4d_t *p4dp, *saved_p4dp;
@@ -303,11 +916,11 @@ static int __init debug_vm_pgtable(void)
pmd_t *pmdp, *saved_pmdp, pmd;
pte_t *ptep;
pgtable_t saved_ptep;
- pgprot_t prot;
+ pgprot_t prot, protnone;
phys_addr_t paddr;
unsigned long vaddr, pte_aligned, pmd_aligned;
unsigned long pud_aligned, p4d_aligned, pgd_aligned;
- spinlock_t *uninitialized_var(ptl);
+ spinlock_t *ptl = NULL;
pr_info("Validating architecture page table helpers\n");
prot = vm_get_page_prot(VMFLAGS);
@@ -319,6 +932,18 @@ static int __init debug_vm_pgtable(void)
}
/*
+ * __P000 (or even __S000) will help create page table entries with
+ * PROT_NONE permission as required for pxx_protnone_tests().
+ */
+ protnone = __P000;
+
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ pr_err("vma allocation failed\n");
+ return 1;
+ }
+
+ /*
* PFN for mapping at PTE level is determined from a standard kernel
* text symbol. But pfns for higher page table levels are derived by
* masking lower bits of this real pfn. These derived pfns might not
@@ -366,6 +991,20 @@ static int __init debug_vm_pgtable(void)
p4d_clear_tests(mm, p4dp);
pgd_clear_tests(mm, pgdp);
+ pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+ pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
+ pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+ hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+
+ pmd_leaf_tests(pmd_aligned, prot);
+ pud_leaf_tests(pud_aligned, prot);
+
+ pmd_huge_tests(pmdp, pmd_aligned, prot);
+ pud_huge_tests(pudp, pud_aligned, prot);
+
+ pte_savedwrite_tests(pte_aligned, prot);
+ pmd_savedwrite_tests(pmd_aligned, prot);
+
pte_unmap_unlock(ptep, ptl);
pmd_populate_tests(mm, pmdp, saved_ptep);
@@ -373,11 +1012,34 @@ static int __init debug_vm_pgtable(void)
p4d_populate_tests(mm, p4dp, saved_pudp);
pgd_populate_tests(mm, pgdp, saved_p4dp);
+ pte_special_tests(pte_aligned, prot);
+ pte_protnone_tests(pte_aligned, protnone);
+ pmd_protnone_tests(pmd_aligned, protnone);
+
+ pte_devmap_tests(pte_aligned, prot);
+ pmd_devmap_tests(pmd_aligned, prot);
+ pud_devmap_tests(pud_aligned, prot);
+
+ pte_soft_dirty_tests(pte_aligned, prot);
+ pmd_soft_dirty_tests(pmd_aligned, prot);
+ pte_swap_soft_dirty_tests(pte_aligned, prot);
+ pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+
+ pte_swap_tests(pte_aligned, prot);
+ pmd_swap_tests(pmd_aligned, prot);
+
+ swap_migration_tests();
+ hugetlb_basic_tests(pte_aligned, prot);
+
+ pmd_thp_tests(pmd_aligned, prot);
+ pud_thp_tests(pud_aligned, prot);
+
p4d_free(mm, saved_p4dp);
pud_free(mm, saved_pudp);
pmd_free(mm, saved_pmdp);
pte_free(mm, saved_ptep);
+ vm_area_free(vma);
mm_dec_nr_puds(mm);
mm_dec_nr_pmds(mm);
mm_dec_nr_ptes(mm);
diff --git a/mm/filemap.c b/mm/filemap.c
index 385759c4ce4b..8e75bce0346d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -41,6 +41,7 @@
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
+#include <linux/page_idle.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -987,44 +988,46 @@ void __init pagecache_init(void)
page_writeback_init();
}
-/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
-struct wait_page_key {
- struct page *page;
- int bit_nr;
- int page_match;
-};
-
-struct wait_page_queue {
- struct page *page;
- int bit_nr;
- wait_queue_entry_t wait;
-};
-
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
+ int ret;
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
= container_of(wait, struct wait_page_queue, wait);
- if (wait_page->page != key->page)
- return 0;
- key->page_match = 1;
-
- if (wait_page->bit_nr != key->bit_nr)
+ if (!wake_page_match(wait_page, key))
return 0;
/*
- * Stop walking if it's locked.
- * Is this safe if put_and_wait_on_page_locked() is in use?
- * Yes: the waker must hold a reference to this page, and if PG_locked
- * has now already been set by another task, that task must also hold
- * a reference to the *same usage* of this page; so there is no need
- * to walk on to wake even the put_and_wait_on_page_locked() callers.
+ * If it's an exclusive wait, we get the bit for it, and
+ * stop walking if we can't.
+ *
+ * If it's a non-exclusive wait, then the fact that this
+ * wake function was called means that the bit already
+ * was cleared, and we don't care if somebody then
+ * re-took it.
*/
- if (test_bit(key->bit_nr, &key->page->flags))
- return -1;
+ ret = 0;
+ if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ ret = 1;
+ }
+ wait->flags |= WQ_FLAG_WOKEN;
- return autoremove_wake_function(wait, mode, sync, key);
+ wake_up_state(wait->private, mode);
+
+ /*
+ * Ok, we have successfully done what we're waiting for,
+ * and we can unconditionally remove the wait entry.
+ *
+ * Note that this has to be the absolute last thing we do,
+ * since after list_del_init(&wait->entry) the wait entry
+ * might be de-allocated and the process might even have
+ * exited.
+ */
+ list_del_init_careful(&wait->entry);
+ return ret;
}
static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1103,16 +1106,31 @@ enum behavior {
*/
};
+/*
+ * Attempt to check (or get) the page bit, and mark the
+ * waiter woken if successful.
+ */
+static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+ struct wait_queue_entry *wait)
+{
+ if (wait->flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_and_set_bit(bit_nr, &page->flags))
+ return false;
+ } else if (test_bit(bit_nr, &page->flags))
+ return false;
+
+ wait->flags |= WQ_FLAG_WOKEN;
+ return true;
+}
+
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, enum behavior behavior)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
- bool bit_is_set;
bool thrashing = false;
bool delayacct = false;
unsigned long pflags;
- int ret = 0;
if (bit_nr == PG_locked &&
!PageUptodate(page) && PageWorkingset(page)) {
@@ -1130,48 +1148,47 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
wait_page.page = page;
wait_page.bit_nr = bit_nr;
- for (;;) {
- spin_lock_irq(&q->lock);
+ /*
+ * Do one last check whether we can get the
+ * page bit synchronously.
+ *
+ * Do the SetPageWaiters() marking before that
+ * to let any waker we _just_ missed know they
+ * need to wake us up (otherwise they'll never
+ * even go to the slow case that looks at the
+ * page queue), and add ourselves to the wait
+ * queue if we need to sleep.
+ *
+ * This part needs to be done under the queue
+ * lock to avoid races.
+ */
+ spin_lock_irq(&q->lock);
+ SetPageWaiters(page);
+ if (!trylock_page_bit_common(page, bit_nr, wait))
+ __add_wait_queue_entry_tail(q, wait);
+ spin_unlock_irq(&q->lock);
- if (likely(list_empty(&wait->entry))) {
- __add_wait_queue_entry_tail(q, wait);
- SetPageWaiters(page);
- }
+ /*
+ * From now on, all the logic will be based on
+ * the WQ_FLAG_WOKEN flag, and the and the page
+ * bit testing (and setting) will be - or has
+ * already been - done by the wake function.
+ *
+ * We can drop our reference to the page.
+ */
+ if (behavior == DROP)
+ put_page(page);
+ for (;;) {
set_current_state(state);
- spin_unlock_irq(&q->lock);
-
- bit_is_set = test_bit(bit_nr, &page->flags);
- if (behavior == DROP)
- put_page(page);
-
- if (likely(bit_is_set))
- io_schedule();
-
- if (behavior == EXCLUSIVE) {
- if (!test_and_set_bit_lock(bit_nr, &page->flags))
- break;
- } else if (behavior == SHARED) {
- if (!test_bit(bit_nr, &page->flags))
- break;
- }
-
- if (signal_pending_state(state, current)) {
- ret = -EINTR;
+ if (signal_pending_state(state, current))
break;
- }
- if (behavior == DROP) {
- /*
- * We can no longer safely access page->flags:
- * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
- * there is a risk of waiting forever on a page reused
- * for something that keeps it locked indefinitely.
- * But best check for -EINTR above before breaking.
- */
+ if (wait->flags & WQ_FLAG_WOKEN)
break;
- }
+
+ io_schedule();
}
finish_wait(q, wait);
@@ -1190,7 +1207,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
* bother with signals either.
*/
- return ret;
+ return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
void wait_on_page_bit(struct page *page, int bit_nr)
@@ -1207,6 +1224,44 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
+static int __wait_on_page_locked_async(struct page *page,
+ struct wait_page_queue *wait, bool set)
+{
+ struct wait_queue_head *q = page_waitqueue(page);
+ int ret = 0;
+
+ wait->page = page;
+ wait->bit_nr = PG_locked;
+
+ spin_lock_irq(&q->lock);
+ __add_wait_queue_entry_tail(q, &wait->wait);
+ SetPageWaiters(page);
+ if (set)
+ ret = !trylock_page(page);
+ else
+ ret = PageLocked(page);
+ /*
+ * If we were succesful now, we know we're still on the
+ * waitqueue as we're still under the lock. This means it's
+ * safe to remove and return success, we know the callback
+ * isn't going to trigger.
+ */
+ if (!ret)
+ __remove_wait_queue(q, &wait->wait);
+ else
+ ret = -EIOCBQUEUED;
+ spin_unlock_irq(&q->lock);
+ return ret;
+}
+
+static int wait_on_page_locked_async(struct page *page,
+ struct wait_page_queue *wait)
+{
+ if (!PageLocked(page))
+ return 0;
+ return __wait_on_page_locked_async(compound_head(page), wait, false);
+}
+
/**
* put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
* @page: The page to wait for.
@@ -1369,6 +1424,11 @@ int __lock_page_killable(struct page *__page)
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
+int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+{
+ return __wait_on_page_locked_async(page, wait, true);
+}
+
/*
* Return values:
* 1 - page is locked; mmap_lock is still held.
@@ -1589,6 +1649,9 @@ EXPORT_SYMBOL(find_lock_entry);
* * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
* page is already in cache. If the page was allocated, unlock it before
* returning so the caller can do the same dance.
+ * * %FGP_WRITE - The page will be written
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
+ * * %FGP_NOWAIT - Don't get blocked by page lock
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
@@ -1630,6 +1693,11 @@ repeat:
if (fgp_flags & FGP_ACCESSED)
mark_page_accessed(page);
+ else if (fgp_flags & FGP_WRITE) {
+ /* Clear idle flag for buffer write */
+ if (page_is_idle(page))
+ clear_page_idle(page);
+ }
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
@@ -2028,7 +2096,7 @@ find_page:
page = find_get_page(mapping, index);
if (!page) {
- if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+ if (iocb->ki_flags & IOCB_NOIO)
goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
@@ -2047,17 +2115,25 @@ find_page:
index, last_index - index);
}
if (!PageUptodate(page)) {
- if (iocb->ki_flags & IOCB_NOWAIT) {
- put_page(page);
- goto would_block;
- }
-
/*
* See comment in do_read_cache_page on why
* wait_on_page_locked is used to avoid unnecessarily
* serialisations and why it's safe.
*/
- error = wait_on_page_locked_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ) {
+ if (written) {
+ put_page(page);
+ goto out;
+ }
+ error = wait_on_page_locked_async(page,
+ iocb->ki_waitq);
+ } else {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ put_page(page);
+ goto would_block;
+ }
+ error = wait_on_page_locked_killable(page);
+ }
if (unlikely(error))
goto readpage_error;
if (PageUptodate(page))
@@ -2145,7 +2221,10 @@ page_ok:
page_not_up_to_date:
/* Get exclusive access to the page ... */
- error = lock_page_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ)
+ error = lock_page_async(page, iocb->ki_waitq);
+ else
+ error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
@@ -2164,7 +2243,7 @@ page_not_up_to_date_locked:
}
readpage:
- if (iocb->ki_flags & IOCB_NOIO) {
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
unlock_page(page);
put_page(page);
goto would_block;
@@ -2806,7 +2885,7 @@ filler:
* Case a, the page will be up to date when the page is unlocked.
* There is no need to serialise on the page lock here as the page
* is pinned so the lock gives no additional protection. Even if the
- * the page is truncated, the data is still valid if PageUptodate as
+ * page is truncated, the data is still valid if PageUptodate as
* it's a race vs truncate race.
* Case b, the page will not be up to date
* Case c, the page may be truncated but in itself, the data may still
diff --git a/mm/frontswap.c b/mm/frontswap.c
index bfa3a339253e..9d977b1fc016 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -446,7 +446,7 @@ static int __frontswap_shrink(unsigned long target_pages,
void frontswap_shrink(unsigned long target_pages)
{
unsigned long pages_to_unuse = 0;
- int uninitialized_var(type), ret;
+ int type, ret;
/*
* we don't want to hold swap_lock while doing a very
diff --git a/mm/gup.c b/mm/gup.c
index 6f47697f8fb0..39e58df6925d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -859,7 +859,7 @@ unmap:
* does not include FOLL_NOWAIT, the mmap_lock may be released. If it
* is, *@locked will be set to 0 and -EBUSY returned.
*/
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+static int faultin_page(struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *locked)
{
unsigned int fault_flags = 0;
@@ -884,7 +884,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_TRIED;
}
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -893,13 +893,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
BUG();
}
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
-
if (ret & VM_FAULT_RETRY) {
if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*locked = 0;
@@ -969,7 +962,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
/**
* __get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -1028,7 +1020,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1110,8 +1102,7 @@ retry:
page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
- ret = faultin_page(tsk, vma, start, &foll_flags,
- locked);
+ ret = faultin_page(vma, start, &foll_flags, locked);
switch (ret) {
case 0:
goto retry;
@@ -1185,8 +1176,6 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
/**
* fixup_user_fault() - manually resolve a user page fault
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
@@ -1214,7 +1203,7 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
* This function will not return with an unlocked mmap_lock. So it has not the
* same semantics wrt the @mm->mmap_lock as does filemap_fault().
*/
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked)
{
@@ -1238,7 +1227,7 @@ retry:
fatal_signal_pending(current))
return -EINTR;
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1255,12 +1244,6 @@ retry:
goto retry;
}
- if (tsk) {
- if (major)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
@@ -1269,8 +1252,7 @@ EXPORT_SYMBOL_GPL(fixup_user_fault);
* Please note that this function, unlike __get_user_pages will not
* return 0 for nr_pages > 0 without FOLL_NOWAIT
*/
-static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1303,7 +1285,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
pages_done = 0;
lock_dropped = false;
for (;;) {
- ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+ ret = __get_user_pages(mm, start, nr_pages, flags, pages,
vmas, locked);
if (!locked)
/* VM_FAULT_RETRY couldn't trigger, bypass */
@@ -1363,7 +1345,7 @@ retry:
}
*locked = 1;
- ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+ ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
pages, NULL, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
@@ -1404,7 +1386,8 @@ retry:
*
* This takes care of mlocking the pages too if VM_LOCKED is set.
*
- * return 0 on success, negative error code on error.
+ * Return either number of pages pinned in the vma, or a negative error
+ * code on error.
*
* vma->vm_mm->mmap_lock must be held.
*
@@ -1449,7 +1432,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
}
@@ -1533,7 +1516,7 @@ struct page *get_dump_page(unsigned long addr)
struct vm_area_struct *vma;
struct page *page;
- if (__get_user_pages(current, current->mm, addr, 1,
+ if (__get_user_pages(current->mm, addr, 1,
FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
NULL) < 1)
return NULL;
@@ -1542,8 +1525,7 @@ struct page *get_dump_page(unsigned long addr)
}
#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
-static long __get_user_pages_locked(struct task_struct *tsk,
- struct mm_struct *mm, unsigned long start,
+static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
struct vm_area_struct **vmas, int *locked,
unsigned int foll_flags)
@@ -1608,59 +1590,7 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
}
#ifdef CONFIG_CMA
-static struct page *new_non_cma_page(struct page *page, unsigned long private)
-{
- /*
- * We want to make sure we allocate the new page from the same node
- * as the source page.
- */
- int nid = page_to_nid(page);
- /*
- * Trying to allocate a page for migration. Ignore allocation
- * failure warnings. We don't force __GFP_THISNODE here because
- * this node here is the node where we have CMA reservation and
- * in some case these nodes will have really less non movable
- * allocation memory.
- */
- gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
-#ifdef CONFIG_HUGETLB_PAGE
- if (PageHuge(page)) {
- struct hstate *h = page_hstate(page);
- /*
- * We don't want to dequeue from the pool because pool pages will
- * mostly be from the CMA region.
- */
- return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
- }
-#endif
- if (PageTransHuge(page)) {
- struct page *thp;
- /*
- * ignore allocation failure warnings
- */
- gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
-
- /*
- * Remove the movable mask so that we don't allocate from
- * CMA area again.
- */
- thp_gfpmask &= ~__GFP_MOVABLE;
- thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- }
-
- return __alloc_pages_node(nid, gfp_mask, 0);
-}
-
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
- struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1673,6 +1603,10 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
bool migrate_allow = true;
LIST_HEAD(cma_page_list);
long ret = nr_pages;
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
+ };
check_again:
for (i = 0; i < nr_pages;) {
@@ -1718,8 +1652,8 @@ check_again:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
- if (migrate_pages(&cma_page_list, new_non_cma_page,
- NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+ if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
/*
* some of the pages failed migration. Do get_user_pages
* without migration.
@@ -1734,7 +1668,7 @@ check_again:
* again migrating any new CMA pages which we failed to isolate
* earlier.
*/
- ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ ret = __get_user_pages_locked(mm, start, nr_pages,
pages, vmas, NULL,
gup_flags);
@@ -1748,8 +1682,7 @@ check_again:
return ret;
}
#else
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
- struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1764,8 +1697,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
* allows us to process the FOLL_LONGTERM flag.
*/
-static long __gup_longterm_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1790,11 +1722,10 @@ static long __gup_longterm_locked(struct task_struct *tsk,
flags = memalloc_nocma_save();
}
- rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages,
vmas_tmp, NULL, gup_flags);
if (gup_flags & FOLL_LONGTERM) {
- memalloc_nocma_restore(flags);
if (rc < 0)
goto out;
@@ -1805,32 +1736,31 @@ static long __gup_longterm_locked(struct task_struct *tsk,
goto out;
}
- rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+ rc = check_and_migrate_cma_pages(mm, start, rc, pages,
vmas_tmp, gup_flags);
+out:
+ memalloc_nocma_restore(flags);
}
-out:
if (vmas_tmp != vmas)
kfree(vmas_tmp);
return rc;
}
#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
-static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
unsigned int flags)
{
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
NULL, flags);
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
#ifdef CONFIG_MMU
-static long __get_user_pages_remote(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1849,20 +1779,18 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* This will check the vmas (even if our vmas arg is NULL)
* and return -ENOTSUPP if DAX isn't allowed in this case:
*/
- return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
vmas, gup_flags | FOLL_TOUCH |
FOLL_REMOTE);
}
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
/**
* get_user_pages_remote() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -1921,7 +1849,7 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* should use get_user_pages_remote because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1933,13 +1861,13 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(get_user_pages_remote);
#else /* CONFIG_MMU */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1947,8 +1875,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
return 0;
}
-static long __get_user_pages_remote(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1968,11 +1895,10 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
+ * This is the same as get_user_pages_remote(), just with a less-flexible
+ * calling convention where we assume that the mm being operated on belongs to
+ * the current task, and doesn't allow passing of a locked parameter. We also
+ * obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -1985,7 +1911,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -1995,7 +1921,7 @@ EXPORT_SYMBOL(get_user_pages);
*
* mmap_read_lock(mm);
* do_something()
- * get_user_pages(tsk, mm, ..., pages, NULL);
+ * get_user_pages(mm, ..., pages, NULL);
* mmap_read_unlock(mm);
*
* to:
@@ -2003,7 +1929,7 @@ EXPORT_SYMBOL(get_user_pages);
* int locked = 1;
* mmap_read_lock(mm);
* do_something()
- * get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ * get_user_pages_locked(mm, ..., pages, &locked);
* if (locked)
* mmap_read_unlock(mm);
*
@@ -2041,7 +1967,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
@@ -2051,12 +1977,12 @@ EXPORT_SYMBOL(get_user_pages_locked);
* get_user_pages_unlocked() is suitable to replace the form:
*
* mmap_read_lock(mm);
- * get_user_pages(tsk, mm, ..., pages, NULL);
+ * get_user_pages(mm, ..., pages, NULL);
* mmap_read_unlock(mm);
*
* with:
*
- * get_user_pages_unlocked(tsk, mm, ..., pages);
+ * get_user_pages_unlocked(mm, ..., pages);
*
* It is functionally equivalent to get_user_pages_fast so
* get_user_pages_fast should be used instead if specific gup_flags
@@ -2079,7 +2005,7 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
return -EINVAL;
mmap_read_lock(mm);
- ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
if (locked)
mmap_read_unlock(mm);
@@ -2724,7 +2650,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
*/
if (gup_flags & FOLL_LONGTERM) {
mmap_read_lock(current->mm);
- ret = __gup_longterm_locked(current, current->mm,
+ ret = __gup_longterm_locked(current->mm,
start, nr_pages,
pages, NULL, gup_flags);
mmap_read_unlock(current->mm);
@@ -2967,10 +2893,8 @@ int pin_user_pages_fast_only(unsigned long start, int nr_pages,
EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
/**
- * pin_user_pages_remote() - pin pages of a remote process (task != current)
+ * pin_user_pages_remote() - pin pages of a remote process
*
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -2991,7 +2915,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
* see Documentation/core-api/pin_user_pages.rst for details.
*/
-long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -3001,7 +2925,7 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
@@ -3033,7 +2957,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
@@ -3078,7 +3002,7 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
diff --git a/mm/hmm.c b/mm/hmm.c
index e9a545751108..943cb2ba4442 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -75,7 +75,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
}
for (; addr < end; addr += PAGE_SIZE)
- if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
+ if (handle_mm_fault(vma, addr, fault_flags, NULL) &
+ VM_FAULT_ERROR)
return -EFAULT;
return -EBUSY;
}
@@ -165,12 +166,19 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
return hmm_pfns_fill(addr, end, range, 0);
}
+static inline unsigned long hmm_pfn_flags_order(unsigned long order)
+{
+ return order << HMM_PFN_ORDER_SHIFT;
+}
+
static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
pmd_t pmd)
{
if (pmd_protnone(pmd))
return 0;
- return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
+ return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
+ HMM_PFN_VALID) |
+ hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -242,7 +250,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
swp_entry_t entry = pte_to_swp_entry(pte);
/*
- * Never fault in device private pages pages, but just report
+ * Never fault in device private pages, but just report
* the PFN even if not present.
*/
if (hmm_is_device_private_entry(range, entry)) {
@@ -389,7 +397,9 @@ static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
{
if (!pud_present(pud))
return 0;
- return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
+ return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
+ HMM_PFN_VALID) |
+ hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
}
static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
@@ -474,7 +484,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
i = (start - range->start) >> PAGE_SHIFT;
pfn_req_flags = range->hmm_pfns[i];
- cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
+ hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 78c84bee7e29..2ccff8472cd4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -303,24 +303,6 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
-#ifdef CONFIG_DEBUG_VM
-static ssize_t debug_cow_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static ssize_t debug_cow_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- return single_hugepage_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static struct kobj_attribute debug_cow_attr =
- __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
-#endif /* CONFIG_DEBUG_VM */
-
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
@@ -329,9 +311,6 @@ static struct attribute *hugepage_attr[] = {
#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
#endif
-#ifdef CONFIG_DEBUG_VM
- &debug_cow_attr.attr,
-#endif
NULL,
};
@@ -640,7 +619,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1722,19 +1701,13 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
}
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd)
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
bool force_flush = false;
- if ((old_addr & ~HPAGE_PMD_MASK) ||
- (new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE)
- return false;
-
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have release it.
@@ -2069,8 +2042,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* free), userland could trigger a small page size TLB miss on the
* small sized TLB while the hugepage TLB entry is still established in
* the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
- * 383 on page 93. Intel should be safe but is also warns that it's
+ * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that it's
* only safe if the permission and cache attributes of the two entries
* loaded in the two TLB is identical (which should be the case here).
* But it is generally safer to never allow small and huge TLB entries
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 590111ea6975..a301c2d672bf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -19,6 +19,7 @@
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
@@ -31,6 +32,7 @@
#include <linux/cma.h>
#include <asm/page.h>
+#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <linux/io.h>
@@ -132,7 +134,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
/*
* Subpool accounting for allocating and reserving pages.
* Return -ENOMEM if there are not enough resources to satisfy the
- * the request. Otherwise, return the number of pages by which the
+ * request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
* a subpool minimum size must be maintained.
@@ -1039,10 +1041,16 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
+ bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
+ if (nocma && is_migrate_cma_page(page))
+ continue;
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
if (!PageHWPoison(page))
break;
+ }
+
/*
* if 'non-isolated free hugepage' not found on the list,
* the allocation fails.
@@ -1092,15 +1100,6 @@ retry_cpuset:
return NULL;
}
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
- if (hugepage_movable_supported(h))
- return GFP_HIGHUSER_MOVABLE;
- else
- return GFP_HIGHUSER;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
@@ -1943,7 +1942,7 @@ out_unlock:
return page;
}
-struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
@@ -1985,31 +1984,9 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
}
/* page migration callback function */
-struct page *alloc_huge_page_node(struct hstate *h, int nid)
-{
- gfp_t gfp_mask = htlb_alloc_mask(h);
- struct page *page = NULL;
-
- if (nid != NUMA_NO_NODE)
- gfp_mask |= __GFP_THISNODE;
-
- spin_lock(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
- spin_unlock(&hugetlb_lock);
-
- if (!page)
- page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
-
- return page;
-}
-
-/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask)
+ nodemask_t *nmask, gfp_t gfp_mask)
{
- gfp_t gfp_mask = htlb_alloc_mask(h);
-
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
@@ -2037,7 +2014,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
return page;
@@ -2166,7 +2143,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the the freed pages across the
+ * free_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*
* Note that we decrement resv_huge_pages as we free the pages. If
@@ -3457,13 +3434,21 @@ static int __init default_hugepagesz_setup(char *s)
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
-static unsigned int cpuset_mems_nr(unsigned int *array)
+static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
+ nodemask_t *mpol_allowed;
+ unsigned int *array = h->free_huge_pages_node;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
- for_each_node_mask(node, cpuset_current_mems_allowed)
- nr += array[node];
+ mpol_allowed = policy_nodemask_current(gfp_mask);
+
+ for_each_node_mask(node, cpuset_current_mems_allowed) {
+ if (!mpol_allowed ||
+ (mpol_allowed && node_isset(node, *mpol_allowed)))
+ nr += array[node];
+ }
return nr;
}
@@ -3642,12 +3627,18 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
* we fall back to check against current free page availability as
* a best attempt and hopefully to minimize the impact of changing
* semantics that cpuset has.
+ *
+ * Apart from cpuset, we also have memory policy mechanism that
+ * also determines from which node the kernel will allocate memory
+ * in a NUMA system. So similar to cpuset, we also should consider
+ * the memory policy of the current task. Similar to the description
+ * above.
*/
if (delta > 0) {
if (gather_surplus_pages(h, delta) < 0)
goto out;
- if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ if (delta > allowed_mems_nr(h)) {
return_unused_surplus_pages(h, delta);
goto out;
}
@@ -3952,7 +3943,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
spin_unlock(ptl);
/*
* We just unmapped a page of PMDs by clearing a PUD.
@@ -4539,10 +4530,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
/*
@@ -5019,7 +5006,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
pages++;
spin_unlock(ptl);
shared_pmd = true;
@@ -5313,25 +5300,21 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr;
+ unsigned long a_start, a_end;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
- unsigned long a_start = check_addr & PUD_MASK;
- unsigned long a_end = a_start + PUD_SIZE;
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ a_start = ALIGN_DOWN(*start, PUD_SIZE);
+ a_end = ALIGN(*end, PUD_SIZE);
- /*
- * If sharing is possible, adjust start/end if necessary.
- */
- if (range_in_vma(vma, a_start, a_end)) {
- if (a_start < *start)
- *start = a_start;
- if (a_end > *end)
- *end = a_end;
- }
- }
+ /*
+ * Intersect the range with the vma range, since pmd sharing won't be
+ * across vma after all
+ */
+ *start = max(vma->vm_start, a_start);
+ *end = min(vma->vm_end, a_end);
}
/*
@@ -5404,12 +5387,14 @@ out:
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
p4d_t *p4d = p4d_offset(pgd, *addr);
pud_t *pud = pud_offset(p4d, *addr);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -5427,7 +5412,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
return 0;
}
@@ -5697,12 +5683,14 @@ void __init hugetlb_cma_reserve(int order)
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
+ char name[20];
size = min(per_node, hugetlb_cma_size - reserved);
size = round_up(size, PAGE_SIZE << order);
+ snprintf(name, 20, "hugetlb%d", nid);
res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
- 0, false, "hugetlb",
+ 0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
diff --git a/mm/internal.h b/mm/internal.h
index 9886db20d94f..d11a9a8d2135 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -239,6 +239,7 @@ struct compact_control {
bool no_set_skip_hint; /* Don't mark blocks for skipping */
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
+ bool proactive_compaction; /* kcompactd proactive compaction */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
bool rescan; /* Rescanning the same pageblock */
@@ -612,5 +613,11 @@ static inline bool is_migrate_highatomic_page(struct page *page)
}
void setup_zone_pageset(struct zone *zone);
-extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+
+struct migration_target_control {
+ int nid; /* preferred node id */
+ nodemask_t *nmask;
+ gfp_t gfp_mask;
+};
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/ioremap.c b/mm/ioremap.c
new file mode 100644
index 000000000000..5fa1ab41d152
--- /dev/null
+++ b/mm/ioremap.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <asm/cacheflush.h>
+
+#include "pgalloc-track.h"
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static int __read_mostly ioremap_p4d_capable;
+static int __read_mostly ioremap_pud_capable;
+static int __read_mostly ioremap_pmd_capable;
+static int __read_mostly ioremap_huge_disabled;
+
+static int __init set_nohugeiomap(char *str)
+{
+ ioremap_huge_disabled = 1;
+ return 0;
+}
+early_param("nohugeiomap", set_nohugeiomap);
+
+void __init ioremap_huge_init(void)
+{
+ if (!ioremap_huge_disabled) {
+ if (arch_ioremap_p4d_supported())
+ ioremap_p4d_capable = 1;
+ if (arch_ioremap_pud_supported())
+ ioremap_pud_capable = 1;
+ if (arch_ioremap_pmd_supported())
+ ioremap_pmd_capable = 1;
+ }
+}
+
+static inline int ioremap_p4d_enabled(void)
+{
+ return ioremap_p4d_capable;
+}
+
+static inline int ioremap_pud_enabled(void)
+{
+ return ioremap_pud_capable;
+}
+
+static inline int ioremap_pmd_enabled(void)
+{
+ return ioremap_pmd_capable;
+}
+
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static inline int ioremap_p4d_enabled(void) { return 0; }
+static inline int ioremap_pud_enabled(void) { return 0; }
+static inline int ioremap_pmd_enabled(void) { return 0; }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_pmd_enabled())
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_pud_enabled())
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_p4d_enabled())
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+int ioremap_page_range(unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
+ &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ flush_cache_vmap(start, end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+#ifdef CONFIG_GENERIC_IOREMAP
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
+{
+ unsigned long offset, vaddr;
+ phys_addr_t last_addr;
+ struct vm_struct *area;
+
+ /* Disallow wrap-around or zero size */
+ last_addr = addr + size - 1;
+ if (!size || last_addr < addr)
+ return NULL;
+
+ /* Page-align mappings */
+ offset = addr & (~PAGE_MASK);
+ addr -= offset;
+ size = PAGE_ALIGN(size + offset);
+
+ area = get_vm_area_caller(size, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ vaddr = (unsigned long)area->addr;
+
+ if (ioremap_page_range(vaddr, vaddr + size, addr, __pgprot(prot))) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ return (void __iomem *)(vaddr + offset);
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+void iounmap(volatile void __iomem *addr)
+{
+ vunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+EXPORT_SYMBOL(iounmap);
+#endif /* CONFIG_GENERIC_IOREMAP */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index d532c2587731..370d970e5ab5 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -16,7 +16,7 @@ CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE)
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack)
-CC_FLAGS_KASAN_RUNTIME += $(call cc-option, -fno-stack-protector)
+CC_FLAGS_KASAN_RUNTIME += -fno-stack-protector
# Disable branch tracing to avoid recursion.
CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 757d4074fe28..950fd372a07e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -40,7 +40,7 @@
#include "kasan.h"
#include "../slab.h"
-static inline depot_stack_handle_t save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
@@ -50,10 +50,10 @@ static inline depot_stack_handle_t save_stack(gfp_t flags)
return stack_depot_save(entries, nr_entries, flags);
}
-static inline void set_track(struct kasan_track *track, gfp_t flags)
+void kasan_set_track(struct kasan_track *track, gfp_t flags)
{
track->pid = current->pid;
- track->stack = save_stack(flags);
+ track->stack = kasan_save_stack(flags);
}
void kasan_enable_current(void)
@@ -180,21 +180,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
kasan_unpoison_shadow(base, watermark - base);
}
-/*
- * Clear all poison for the region between the current SP and a provided
- * watermark value, as is sometimes required prior to hand-crafted asm function
- * returns in the middle of functions.
- */
-void kasan_unpoison_stack_above_sp_to(const void *watermark)
-{
- const void *sp = __builtin_frame_address(0);
- size_t size = watermark - sp;
-
- if (WARN_ON(sp > watermark))
- return;
- kasan_unpoison_shadow(sp, size);
-}
-
void kasan_alloc_pages(struct page *page, unsigned int order)
{
u8 tag;
@@ -298,24 +283,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
return (void *)object + cache->kasan_info.free_meta_offset;
}
-
-static void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- u8 idx = 0;
-
- alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- idx = alloc_meta->free_track_idx;
- alloc_meta->free_pointer_tag[idx] = tag;
- alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
-#endif
-
- set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
-}
-
void kasan_poison_slab(struct page *page)
{
unsigned long i;
@@ -491,7 +458,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
KASAN_KMALLOC_REDZONE);
if (cache->flags & SLAB_KASAN)
- set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+ kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags);
return set_tag(object, tag);
}
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 098a7dbaced6..248264b9cb76 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -324,3 +324,46 @@ DEFINE_ASAN_SET_SHADOW(f2);
DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
+
+void kasan_record_aux_stack(void *addr)
+{
+ struct page *page = kasan_addr_to_page(addr);
+ struct kmem_cache *cache;
+ struct kasan_alloc_meta *alloc_info;
+ void *object;
+
+ if (!(page && PageSlab(page)))
+ return;
+
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, addr);
+ alloc_info = get_alloc_info(cache, object);
+
+ /*
+ * record the last two call_rcu() call stacks.
+ */
+ alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
+ alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+}
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_free_meta *free_meta;
+
+ free_meta = get_free_info(cache, object);
+ kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+
+ /*
+ * the object was freed and has free track set
+ */
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
+ return NULL;
+ return &get_free_info(cache, object)->free_track;
+}
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
index e200acb2d292..a38c7a9e192a 100644
--- a/mm/kasan/generic_report.c
+++ b/mm/kasan/generic_report.c
@@ -80,6 +80,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
break;
case KASAN_FREE_PAGE:
case KASAN_KMALLOC_FREE:
+ case KASAN_KMALLOC_FREETRACK:
bug_type = "use-after-free";
break;
case KASAN_ALLOCA_LEFT:
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index cfade6413528..ac499456740f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -17,15 +17,17 @@
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
#else
#define KASAN_FREE_PAGE KASAN_TAG_INVALID
#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
#endif
-#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID 0xF9 /* unallocated space in vmapped page */
+#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
+#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
/*
* Stack redzone shadow values
@@ -104,7 +106,15 @@ struct kasan_track {
struct kasan_alloc_meta {
struct kasan_track alloc_track;
+#ifdef CONFIG_KASAN_GENERIC
+ /*
+ * call_rcu() call stack is stored into struct kasan_alloc_meta.
+ * The free stack is stored into struct kasan_free_meta.
+ */
+ depot_stack_handle_t aux_stack[2];
+#else
struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#endif
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
u8 free_track_idx;
@@ -119,6 +129,9 @@ struct kasan_free_meta {
* Otherwise it might be used for the allocator freelist.
*/
struct qlist_node quarantine_link;
+#ifdef CONFIG_KASAN_GENERIC
+ struct kasan_track free_track;
+#endif
};
struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
@@ -159,6 +172,12 @@ void kasan_report_invalid_free(void *object, unsigned long ip);
struct page *kasan_addr_to_page(const void *addr);
+depot_stack_handle_t kasan_save_stack(gfp_t flags);
+void kasan_set_track(struct kasan_track *track, gfp_t flags);
+void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag);
+
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 978bc4a3eb51..4c5375810449 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -145,6 +145,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
___cache_free(cache, object, _THIS_IP_);
if (IS_ENABLED(CONFIG_SLAB))
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 51ec45407a0b..4f49fa6cd1aa 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -106,15 +106,20 @@ static void end_report(unsigned long *flags)
kasan_enable_current();
}
+static void print_stack(depot_stack_handle_t stack)
+{
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ nr_entries = stack_depot_fetch(stack, &entries);
+ stack_trace_print(entries, nr_entries, 0);
+}
+
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
- unsigned long *entries;
- unsigned int nr_entries;
-
- nr_entries = stack_depot_fetch(track->stack, &entries);
- stack_trace_print(entries, nr_entries, 0);
+ print_stack(track->stack);
} else {
pr_err("(stack is not available)\n");
}
@@ -160,26 +165,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
-static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- int i = 0;
-
- alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- break;
- }
- if (i == KASAN_NR_FREE_STACKS)
- i = alloc_meta->free_track_idx;
-#endif
-
- return &alloc_meta->free_track[i];
-}
-
static void describe_object(struct kmem_cache *cache, void *object,
const void *addr, u8 tag)
{
@@ -191,8 +176,23 @@ static void describe_object(struct kmem_cache *cache, void *object,
print_track(&alloc_info->alloc_track, "Allocated");
pr_err("\n");
free_track = kasan_get_free_track(cache, object, tag);
- print_track(free_track, "Freed");
- pr_err("\n");
+ if (free_track) {
+ print_track(free_track, "Freed");
+ pr_err("\n");
+ }
+
+#ifdef CONFIG_KASAN_GENERIC
+ if (alloc_info->aux_stack[0]) {
+ pr_err("Last call_rcu():\n");
+ print_stack(alloc_info->aux_stack[0]);
+ pr_err("\n");
+ }
+ if (alloc_info->aux_stack[1]) {
+ pr_err("Second to last call_rcu():\n");
+ print_stack(alloc_info->aux_stack[1]);
+ pr_err("\n");
+ }
+#endif
}
describe_object_addr(cache, object, addr);
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 8a959fdd30e3..e02a36a51f42 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -161,3 +161,40 @@ void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
kasan_poison_shadow((void *)addr, size, tag);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 700f5160f3e4..15a9af791014 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -431,7 +431,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -1100,9 +1100,6 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
@@ -1176,7 +1173,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
@@ -1412,7 +1409,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = find_vma(mm, haddr);
- struct page *hpage = NULL;
+ struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd, _pmd;
spinlock_t *ptl;
@@ -1432,9 +1429,17 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
return;
+ hpage = find_lock_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, haddr));
+ if (!hpage)
+ return;
+
+ if (!PageHead(hpage))
+ goto drop_hpage;
+
pmd = mm_find_pmd(mm, haddr);
if (!pmd)
- return;
+ goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
@@ -1453,30 +1458,11 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
page = vm_normal_page(vma, addr, *pte);
- if (!page || !PageCompound(page))
- goto abort;
-
- if (!hpage) {
- hpage = compound_head(page);
- /*
- * The mapping of the THP should not change.
- *
- * Note that uprobe, debugger, or MAP_PRIVATE may
- * change the page table, but the new page will
- * not pass PageCompound() check.
- */
- if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
- goto abort;
- }
-
/*
- * Confirm the page maps to the correct subpage.
- *
- * Note that uprobe, debugger, or MAP_PRIVATE may change
- * the page table, but the new page will not pass
- * PageCompound() check.
+ * Note that uprobe, debugger, or MAP_PRIVATE may change the
+ * page table, but the new page will not be a subpage of hpage.
*/
- if (WARN_ON(hpage + i != page))
+ if (hpage + i != page)
goto abort;
count++;
}
@@ -1495,21 +1481,26 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
pte_unmap_unlock(start_pte, ptl);
/* step 3: set proper refcount and mm_counters. */
- if (hpage) {
+ if (count) {
page_ref_sub(hpage, count);
add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
}
/* step 4: collapse pmd */
ptl = pmd_lock(vma->vm_mm, pmd);
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ _pmd = pmdp_collapse_flush(vma, haddr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
+
+drop_hpage:
+ unlock_page(hpage);
+ put_page(hpage);
return;
abort:
pte_unmap_unlock(start_pte, ptl);
+ goto drop_hpage;
}
static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
@@ -1538,6 +1529,7 @@ out:
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1566,7 +1558,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
@@ -1576,17 +1569,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (mmap_write_trylock(vma->vm_mm)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mmap_write_unlock(vma->vm_mm);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
diff --git a/mm/ksm.c b/mm/ksm.c
index 4102034cd55a..0aa2247bddd7 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -480,7 +480,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ NULL);
else
ret = VM_FAULT_WRITE;
put_page(page);
@@ -2387,7 +2388,7 @@ next_mm:
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
- struct page *uninitialized_var(page);
+ struct page *page;
while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
@@ -2452,10 +2453,6 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
if (vma_is_dax(vma))
return 0;
-#ifdef VM_SAO
- if (*vm_flags & VM_SAO)
- return 0;
-#endif
#ifdef VM_SPARC_ADI
if (*vm_flags & VM_SPARC_ADI)
return 0;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 9222910ab1cb..e825804b3928 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -373,14 +373,14 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
/*
* This is called when shrinker has already been unregistered,
- * and nobody can use it. So, there is no need to use kvfree_rcu().
+ * and nobody can use it. So, there is no need to use kvfree_rcu_local().
*/
memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
kvfree(memcg_lrus);
}
-static void kvfree_rcu(struct rcu_head *head)
+static void kvfree_rcu_local(struct rcu_head *head)
{
struct list_lru_memcg *mlru;
@@ -419,7 +419,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
- call_rcu(&old->rcu, kvfree_rcu);
+ call_rcu(&old->rcu, kvfree_rcu_local);
return 0;
}
diff --git a/mm/maccess.c b/mm/maccess.c
index f98ff91e32c6..3bd70405f2d8 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -205,15 +205,14 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs = force_uaccess_begin();
- set_fs(USER_DS);
if (access_ok(src, size)) {
pagefault_disable();
ret = __copy_from_user_inatomic(dst, src, size);
pagefault_enable();
}
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -233,15 +232,14 @@ EXPORT_SYMBOL_GPL(copy_from_user_nofault);
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
long ret = -EFAULT;
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs = force_uaccess_begin();
- set_fs(USER_DS);
if (access_ok(dst, size)) {
pagefault_disable();
ret = __copy_to_user_inatomic(dst, src, size);
pagefault_enable();
}
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
if (ret)
return -EFAULT;
@@ -270,17 +268,17 @@ EXPORT_SYMBOL_GPL(copy_to_user_nofault);
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count)
{
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs;
long ret;
if (unlikely(count <= 0))
return 0;
- set_fs(USER_DS);
+ old_fs = force_uaccess_begin();
pagefault_disable();
ret = strncpy_from_user(dst, unsafe_addr, count);
pagefault_enable();
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
if (ret >= count) {
ret = count;
@@ -310,14 +308,14 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
*/
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs;
int ret;
- set_fs(USER_DS);
+ old_fs = force_uaccess_begin();
pagefault_disable();
ret = strnlen_user(unsafe_addr, count);
pagefault_enable();
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
return ret;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 39aceafc57f6..45f198750be9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -44,19 +44,20 @@
* in the system, for instance when the memory is restricted with
* ``mem=`` command line parameter
* * ``reserved`` - describes the regions that were allocated
- * * ``physmap`` - describes the actual physical memory regardless of
- * the possible restrictions; the ``physmap`` type is only available
- * on some architectures.
+ * * ``physmem`` - describes the actual physical memory available during
+ * boot regardless of the possible restrictions and memory hot(un)plug;
+ * the ``physmem`` type is only available on some architectures.
*
* Each region is represented by :c:type:`struct memblock_region` that
* defines the region extents, its attributes and NUMA node id on NUMA
* systems. Every memory type is described by the :c:type:`struct
* memblock_type` which contains an array of memory regions along with
- * the allocator metadata. The memory types are nicely wrapped with
- * :c:type:`struct memblock`. This structure is statically initialzed
- * at build time. The region arrays for the "memory" and "reserved"
- * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the
- * "physmap" type to %INIT_PHYSMEM_REGIONS.
+ * the allocator metadata. The "memory" and "reserved" types are nicely
+ * wrapped with :c:type:`struct memblock`. This structure is statically
+ * initialized at build time. The region arrays are initially sized to
+ * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
+ * for "reserved". The region array for "physmem" is initially sized to
+ * %INIT_PHYSMEM_REGIONS.
* The memblock_allow_resize() enables automatic resizing of the region
* arrays during addition of new regions. This feature should be used
* with care so that memory allocated for the region array will not
@@ -87,8 +88,8 @@
* function frees all the memory to the buddy page allocator.
*
* Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
- * memblock data structures will be discarded after the system
- * initialization completes.
+ * memblock data structures (except "physmem") will be discarded after the
+ * system initialization completes.
*/
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -104,7 +105,7 @@ unsigned long long max_possible_pfn;
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
#endif
struct memblock memblock __initdata_memblock = {
@@ -118,17 +119,19 @@ struct memblock memblock __initdata_memblock = {
.reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
.reserved.name = "reserved",
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- .physmem.regions = memblock_physmem_init_regions,
- .physmem.cnt = 1, /* empty dummy entry */
- .physmem.max = INIT_PHYSMEM_REGIONS,
- .physmem.name = "physmem",
-#endif
-
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+struct memblock_type physmem = {
+ .regions = memblock_physmem_init_regions,
+ .cnt = 1, /* empty dummy entry */
+ .max = INIT_PHYSMEM_REGIONS,
+ .name = "physmem",
+};
+#endif
+
int memblock_debug __initdata_memblock;
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
@@ -838,7 +841,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
- return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0);
}
#endif
@@ -1019,12 +1022,10 @@ static bool should_skip_region(struct memblock_region *m, int nid, int flags)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid,
- enum memblock_flags flags,
- struct memblock_type *type_a,
- struct memblock_type *type_b,
- phys_addr_t *out_start,
- phys_addr_t *out_end, int *out_nid)
+void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
+ struct memblock_type *type_a,
+ struct memblock_type *type_b, phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
int idx_a = *idx & 0xffffffff;
int idx_b = *idx >> 32;
@@ -1924,7 +1925,7 @@ void __init_memblock __memblock_dump_all(void)
memblock_dump(&memblock.memory);
memblock_dump(&memblock.reserved);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- memblock_dump(&memblock.physmem);
+ memblock_dump(&physmem);
#endif
}
@@ -2064,8 +2065,8 @@ static int __init memblock_init_debugfs(void)
debugfs_create_file("reserved", 0444, root,
&memblock.reserved, &memblock_debug_fops);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- debugfs_create_file("physmem", 0444, root,
- &memblock.physmem, &memblock_debug_fops);
+ debugfs_create_file("physmem", 0444, root, &physmem,
+ &memblock_debug_fops);
#endif
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13f559af1ab6..d59fd9af6e63 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
-#define MEM_CGROUP_RECLAIM_RETRIES 5
-
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@@ -257,8 +255,100 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
}
#ifdef CONFIG_MEMCG_KMEM
+extern spinlock_t css_set_lock;
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+ struct mem_cgroup *memcg;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
+ unsigned long flags;
+
+ /*
+ * At this point all allocated objects are freed, and
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+ *
+ * The following sequence can lead to it:
+ * 1) CPU0: objcg == stock->cached_objcg
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+ * PAGE_SIZE bytes are charged
+ * 3) CPU1: a process from another memcg is allocating something,
+ * the stock if flushed,
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
+ * 5) CPU0: we do release this object,
+ * 92 bytes are added to stock->nr_bytes
+ * 6) CPU0: stock is flushed,
+ * 92 bytes are added to objcg->nr_charged_bytes
+ *
+ * In the result, nr_charged_bytes == PAGE_SIZE.
+ * This page will be uncharged in obj_cgroup_release().
+ */
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ memcg = obj_cgroup_memcg(objcg);
+ if (nr_pages)
+ __memcg_kmem_uncharge(memcg, nr_pages);
+ list_del(&objcg->list);
+ mem_cgroup_put(memcg);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+
+ percpu_ref_exit(ref);
+ kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+ struct obj_cgroup *objcg;
+ int ret;
+
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+ if (!objcg)
+ return NULL;
+
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(objcg);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&objcg->list);
+ return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+ struct obj_cgroup *objcg, *iter;
+
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+ spin_lock_irq(&css_set_lock);
+
+ /* Move active objcg to the parent's list */
+ xchg(&objcg->memcg, parent);
+ css_get(&parent->css);
+ list_add(&objcg->list, &parent->objcg_list);
+
+ /* Move already reparented objcgs to the parent's list */
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
+ css_get(&parent->css);
+ xchg(&iter->memcg, parent);
+ css_put(&memcg->css);
+ }
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+ spin_unlock_irq(&css_set_lock);
+
+ percpu_ref_kill(&objcg->refcnt);
+}
+
/*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -301,14 +391,12 @@ void memcg_put_cache_ids(void)
/*
* A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
#endif
static int memcg_shrinker_map_size;
@@ -477,10 +565,17 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- if (PageSlab(page) && !PageTail(page))
- memcg = memcg_from_slab_page(page);
- else
- memcg = READ_ONCE(page->mem_cgroup);
+ memcg = page->mem_cgroup;
+
+ /*
+ * The lowest bit set means that memcg isn't a valid
+ * memcg pointer, but a obj_cgroups pointer.
+ * In this case the page is shared and doesn't belong
+ * to any specific memory cgroup.
+ */
+ if ((unsigned long) memcg & 0x1UL)
+ memcg = NULL;
+
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -681,13 +776,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- long x;
+ long x, threshold = MEMCG_CHARGE_BATCH;
if (mem_cgroup_disabled())
return;
+ if (memcg_stat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
- if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ if (unlikely(abs(x) > threshold)) {
struct mem_cgroup *mi;
/*
@@ -713,29 +811,12 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
return mem_cgroup_nodeinfo(parent, nid);
}
-/**
- * __mod_lruvec_state - update lruvec memory statistics
- * @lruvec: the lruvec
- * @idx: the stat item
- * @val: delta to add to the counter, can be negative
- *
- * The lruvec is the intersection of the NUMA node and a cgroup. This
- * function updates the all three counters that are affected by a
- * change of state at this level: per-node, per-cgroup, per-lruvec.
- */
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
- int val)
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
{
- pg_data_t *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
- long x;
-
- /* Update node */
- __mod_node_page_state(pgdat, idx, val);
-
- if (mem_cgroup_disabled())
- return;
+ long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -746,8 +827,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update lruvec */
__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+ if (vmstat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
- if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ if (unlikely(abs(x) > threshold)) {
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pi;
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
@@ -757,6 +842,27 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates the all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ /* Update node */
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+ /* Update memcg and lruvec */
+ if (!mem_cgroup_disabled())
+ __mod_memcg_lruvec_state(lruvec, idx, val);
+}
+
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
pg_data_t *pgdat = page_pgdat(virt_to_page(p));
@@ -1004,7 +1110,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *iter;
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -1377,12 +1483,13 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
(u64)memcg_page_state(memcg, NR_FILE_PAGES) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
1024);
seq_buf_printf(&s, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
- PAGE_SIZE);
+ (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
+ seq_buf_printf(&s, "percpu %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
seq_buf_printf(&s, "sock %llu\n",
(u64)memcg_page_state(memcg, MEMCG_SOCK) *
PAGE_SIZE);
@@ -1412,11 +1519,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
PAGE_SIZE);
seq_buf_printf(&s, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
seq_buf_printf(&s, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
/* Accumulated memory events */
@@ -1425,12 +1530,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
memcg_events(memcg, PGMAJFAULT));
- seq_buf_printf(&s, "workingset_refault %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT));
- seq_buf_printf(&s, "workingset_activate %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_refault_anon %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
+ seq_buf_printf(&s, "workingset_refault_file %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
+ seq_buf_printf(&s, "workingset_activate_anon %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
+ seq_buf_printf(&s, "workingset_activate_file %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
+ seq_buf_printf(&s, "workingset_restore %lu\n",
+ memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
seq_buf_printf(&s, "workingset_restore %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE));
+ memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
@@ -1560,15 +1671,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
.gfp_mask = gfp_mask,
.order = order,
};
- bool ret;
+ bool ret = true;
if (mutex_lock_killable(&oom_lock))
return true;
+
+ if (mem_cgroup_margin(memcg) >= (1 << order))
+ goto unlock;
+
/*
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
ret = should_force_charge() || out_of_memory(&oc);
+
+unlock:
mutex_unlock(&oom_lock);
return ret;
}
@@ -2039,6 +2156,12 @@ EXPORT_SYMBOL(unlock_page_memcg);
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
+
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *cached_objcg;
+ unsigned int nr_bytes;
+#endif
+
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
@@ -2046,6 +2169,22 @@ struct memcg_stock_pcp {
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
+#ifdef CONFIG_MEMCG_KMEM
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg);
+
+#else
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+}
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ return false;
+}
+#endif
+
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
@@ -2086,13 +2225,17 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
page_counter_uncharge(&old->memsw, stock->nr_pages);
- css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2108,6 +2251,7 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
+ drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
@@ -2128,6 +2272,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
@@ -2166,6 +2311,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
+ if (obj_stock_flush_required(stock, root_memcg))
+ flush = true;
rcu_read_unlock();
if (flush &&
@@ -2228,18 +2375,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
return 0;
}
-static void reclaim_high(struct mem_cgroup *memcg,
- unsigned int nr_pages,
- gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ gfp_t gfp_mask)
{
+ unsigned long nr_reclaimed = 0;
+
do {
+ unsigned long pflags;
+
if (page_counter_read(&memcg->memory) <=
READ_ONCE(memcg->memory.high))
continue;
+
memcg_memory_event(memcg, MEMCG_HIGH);
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+
+ psi_memstall_enter(&pflags);
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+ gfp_mask, true);
+ psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
+
+ return nr_reclaimed;
}
static void high_work_func(struct work_struct *work)
@@ -2264,7 +2422,7 @@ static void high_work_func(struct work_struct *work)
*
* - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
* overage ratio to a delay.
- * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
* proposed penalty in order to reduce to a reasonable number of jiffies, and
* to produce a reasonable delay curve.
*
@@ -2395,16 +2553,32 @@ void mem_cgroup_handle_over_high(void)
{
unsigned long penalty_jiffies;
unsigned long pflags;
+ unsigned long nr_reclaimed;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *memcg;
+ bool in_retry = false;
if (likely(!nr_pages))
return;
memcg = get_mem_cgroup_from_mm(current->mm);
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
current->memcg_nr_pages_over_high = 0;
+retry_reclaim:
+ /*
+ * The allocating task should reclaim at least the batch size, but for
+ * subsequent retries we only want to do what's necessary to prevent oom
+ * or breaching resource isolation.
+ *
+ * This is distinct from memory.max or page allocator behaviour because
+ * memory.high is currently batched, whereas memory.max and the page
+ * allocator run every time an allocation is made.
+ */
+ nr_reclaimed = reclaim_high(memcg,
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+ GFP_KERNEL);
+
/*
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
@@ -2432,6 +2606,16 @@ void mem_cgroup_handle_over_high(void)
goto out;
/*
+ * If reclaim is making forward progress but we're still over
+ * memory.high, we want to encourage that rather than doing allocator
+ * throttling.
+ */
+ if (nr_reclaimed || nr_retries--) {
+ in_retry = true;
+ goto retry_reclaim;
+ }
+
+ /*
* If we exit early, we're guaranteed to die (since
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
* need to account for any ill-begotten jiffies to pay them off later.
@@ -2448,13 +2632,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
+ enum oom_status oom_status;
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- enum oom_status oom_status;
+ unsigned long pflags;
if (mem_cgroup_is_root(memcg))
return 0;
@@ -2514,8 +2699,10 @@ retry:
memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
+ psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
@@ -2567,7 +2754,7 @@ retry:
get_order(nr_pages * PAGE_SIZE));
switch (oom_status) {
case OOM_SUCCESS:
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
case OOM_FAILED:
goto force;
@@ -2586,12 +2773,10 @@ force:
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
return 0;
done_restock:
- css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
@@ -2649,8 +2834,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
-
- css_put_many(&memcg->css, nr_pages);
}
#endif
@@ -2669,6 +2852,26 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
}
#ifdef CONFIG_MEMCG_KMEM
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp)
+{
+ unsigned int objects = objs_per_slab_page(s, page);
+ void *vec;
+
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+ page_to_nid(page));
+ if (!vec)
+ return -ENOMEM;
+
+ if (cmpxchg(&page->obj_cgroups, NULL,
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ kfree(vec);
+ else
+ kmemleak_not_leak(vec);
+
+ return 0;
+}
+
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
*
@@ -2685,17 +2888,50 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
- * Slab pages don't have page->mem_cgroup set because corresponding
- * kmem caches can be reparented during the lifetime. That's why
- * memcg_from_slab_page() should be used instead.
+ * Slab objects are accounted individually, not per-page.
+ * Memcg membership data for each individual object is saved in
+ * the page->obj_cgroups.
*/
- if (PageSlab(page))
- return memcg_from_slab_page(page);
+ if (page_has_obj_cgroups(page)) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
+
+ off = obj_to_index(page->slab_cache, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ if (objcg)
+ return obj_cgroup_memcg(objcg);
+
+ return NULL;
+ }
/* All other pages use page->mem_cgroup */
return page->mem_cgroup;
}
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+ struct obj_cgroup *objcg = NULL;
+ struct mem_cgroup *memcg;
+
+ if (unlikely(!current->mm && !current->active_memcg))
+ return NULL;
+
+ rcu_read_lock();
+ if (unlikely(current->active_memcg))
+ memcg = rcu_dereference(current->active_memcg);
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ }
+ rcu_read_unlock();
+
+ return objcg;
+}
+
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2721,9 +2957,7 @@ static int memcg_alloc_cache_id(void)
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- err = memcg_update_all_caches(size);
- if (!err)
- err = memcg_update_all_list_lrus(size);
+ err = memcg_update_all_list_lrus(size);
if (!err)
memcg_nr_cache_ids = size;
@@ -2741,150 +2975,6 @@ static void memcg_free_cache_id(int id)
ida_simple_remove(&memcg_cache_ida, id);
}
-struct memcg_kmem_cache_create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
-static void memcg_kmem_cache_create_func(struct work_struct *w)
-{
- struct memcg_kmem_cache_create_work *cw =
- container_of(w, struct memcg_kmem_cache_create_work, work);
- struct mem_cgroup *memcg = cw->memcg;
- struct kmem_cache *cachep = cw->cachep;
-
- memcg_create_kmem_cache(memcg, cachep);
-
- css_put(&memcg->css);
- kfree(cw);
-}
-
-/*
- * Enqueue the creation of a per-memcg kmem_cache.
- */
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct memcg_kmem_cache_create_work *cw;
-
- if (!css_tryget_online(&memcg->css))
- return;
-
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
- if (!cw) {
- css_put(&memcg->css);
- return;
- }
-
- cw->memcg = memcg;
- cw->cachep = cachep;
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
-
- queue_work(memcg_kmem_cache_wq, &cw->work);
-}
-
-static inline bool memcg_kmem_bypass(void)
-{
- if (in_interrupt())
- return true;
-
- /* Allow remote memcg charging in kthread contexts. */
- if ((!current->mm || (current->flags & PF_KTHREAD)) &&
- !current->active_memcg)
- return true;
- return false;
-}
-
-/**
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
- * @cachep: the original global kmem cache
- *
- * Return the kmem_cache we're supposed to use for a slab allocation.
- * We try to use the current memcg's version of the cache.
- *
- * If the cache does not exist yet, if we are the first user of it, we
- * create it asynchronously in a workqueue and let the current allocation
- * go through with the original cache.
- *
- * This function takes a reference to the cache it returns to assure it
- * won't get destroyed while we are working with it. Once the caller is
- * done with it, memcg_kmem_put_cache() must be called to release the
- * reference.
- */
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
-{
- struct mem_cgroup *memcg;
- struct kmem_cache *memcg_cachep;
- struct memcg_cache_array *arr;
- int kmemcg_id;
-
- VM_BUG_ON(!is_root_cache(cachep));
-
- if (memcg_kmem_bypass())
- return cachep;
-
- rcu_read_lock();
-
- if (unlikely(current->active_memcg))
- memcg = current->active_memcg;
- else
- memcg = mem_cgroup_from_task(current);
-
- if (!memcg || memcg == root_mem_cgroup)
- goto out_unlock;
-
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
- if (kmemcg_id < 0)
- goto out_unlock;
-
- arr = rcu_dereference(cachep->memcg_params.memcg_caches);
-
- /*
- * Make sure we will access the up-to-date value. The code updating
- * memcg_caches issues a write barrier to match the data dependency
- * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
- */
- memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
-
- /*
- * If we are in a safe context (can wait, and not in interrupt
- * context), we could be be predictable and return right away.
- * This would guarantee that the allocation being performed
- * already belongs in the new cache.
- *
- * However, there are some clashes that can arrive from locking.
- * For instance, because we acquire the slab_mutex while doing
- * memcg_create_kmem_cache, this means no further allocation
- * could happen with the slab_mutex held. So it's better to
- * defer everything.
- *
- * If the memcg is dying or memcg_cache is about to be released,
- * don't bother creating new kmem_caches. Because memcg_cachep
- * is ZEROed as the fist step of kmem offlining, we don't need
- * percpu_ref_tryget_live() here. css_tryget_online() check in
- * memcg_schedule_kmem_cache_create() will prevent us from
- * creation of a new kmem_cache.
- */
- if (unlikely(!memcg_cachep))
- memcg_schedule_kmem_cache_create(memcg, cachep);
- else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
- cachep = memcg_cachep;
-out_unlock:
- rcu_read_unlock();
- return cachep;
-}
-
-/**
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
- * @cachep: the cache returned by memcg_kmem_get_cache
- */
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
-{
- if (!is_root_cache(cachep))
- percpu_ref_put(&cachep->memcg_params.refcnt);
-}
-
/**
* __memcg_kmem_charge: charge a number of kernel pages to a memcg
* @memcg: memory cgroup to charge
@@ -2958,6 +3048,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ return 0;
}
}
css_put(&memcg->css);
@@ -2980,13 +3071,146 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge(memcg, nr_pages);
page->mem_cgroup = NULL;
+ css_put(&memcg->css);
/* slab pages do not have PageKmemcg flag set */
if (PageKmemcg(page))
__ClearPageKmemcg(page);
+}
+
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ bool ret = false;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ stock->nr_bytes -= nr_bytes;
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+ struct obj_cgroup *old = stock->cached_objcg;
+
+ if (!old)
+ return;
+
+ if (stock->nr_bytes) {
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+ if (nr_pages) {
+ rcu_read_lock();
+ __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
+ rcu_read_unlock();
+ }
+
+ /*
+ * The leftover is flushed to the centralized per-memcg value.
+ * On the next attempt to refill obj stock it will be moved
+ * to a per-cpu stock (probably, on an other CPU), see
+ * refill_obj_stock().
+ *
+ * How often it's flushed is a trade-off between the memory
+ * limit enforcement accuracy and potential CPU contention,
+ * so it might be changed in the future.
+ */
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
+ stock->nr_bytes = 0;
+ }
+
+ obj_cgroup_put(old);
+ stock->cached_objcg = NULL;
+}
- css_put_many(&memcg->css, nr_pages);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+
+ if (stock->cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ return true;
+ }
+
+ return false;
+}
+
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->cached_objcg = objcg;
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ }
+ stock->nr_bytes += nr_bytes;
+
+ if (stock->nr_bytes > PAGE_SIZE)
+ drain_obj_stock(stock);
+
+ local_irq_restore(flags);
+}
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ struct mem_cgroup *memcg;
+ unsigned int nr_pages, nr_bytes;
+ int ret;
+
+ if (consume_obj_stock(objcg, size))
+ return 0;
+
+ /*
+ * In theory, memcg->nr_charged_bytes can have enough
+ * pre-charged bytes to satisfy the allocation. However,
+ * flushing memcg->nr_charged_bytes requires two atomic
+ * operations, and memcg->nr_charged_bytes can't be big,
+ * so it's better to ignore it and try grab some new pages.
+ * memcg->nr_charged_bytes will be flushed in
+ * refill_obj_stock(), called from this function or
+ * independently later.
+ */
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ css_get(&memcg->css);
+ rcu_read_unlock();
+
+ nr_pages = size >> PAGE_SHIFT;
+ nr_bytes = size & (PAGE_SIZE - 1);
+
+ if (nr_bytes)
+ nr_pages += 1;
+
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ if (!ret && nr_bytes)
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+
+ css_put(&memcg->css);
+ return ret;
}
+
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
+{
+ refill_obj_stock(objcg, size);
+}
+
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2997,13 +3221,16 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
+ struct mem_cgroup *memcg = head->mem_cgroup;
int i;
if (mem_cgroup_disabled())
return;
- for (i = 1; i < HPAGE_PMD_NR; i++)
- head[i].mem_cgroup = head->mem_cgroup;
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ css_get(&memcg->css);
+ head[i].mem_cgroup = memcg;
+ }
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3207,7 +3434,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
*/
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
@@ -3404,6 +3631,7 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
+ struct obj_cgroup *objcg;
int memcg_id;
if (cgroup_memory_nokmem)
@@ -3416,7 +3644,16 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
if (memcg_id < 0)
return memcg_id;
- static_branch_inc(&memcg_kmem_enabled_key);
+ objcg = obj_cgroup_alloc();
+ if (!objcg) {
+ memcg_free_cache_id(memcg_id);
+ return -ENOMEM;
+ }
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->objcg, objcg);
+
+ static_branch_enable(&memcg_kmem_enabled_key);
+
/*
* A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
@@ -3425,7 +3662,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
- INIT_LIST_HEAD(&memcg->kmem_caches);
return 0;
}
@@ -3438,22 +3674,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (memcg->kmem_state != KMEM_ONLINE)
return;
- /*
- * Clear the online state before clearing memcg_caches array
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
- * guarantees that no cache will be created for this cgroup
- * after we are done (see memcg_create_kmem_cache()).
- */
+
memcg->kmem_state = KMEM_ALLOCATED;
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
- /*
- * Deactivate and reparent kmem_caches.
- */
- memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_reparent_objcgs(memcg, parent);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -3486,11 +3714,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
/* css_alloc() failed, offlining didn't happen */
if (unlikely(memcg->kmem_state == KMEM_ONLINE))
memcg_offline_kmem(memcg);
-
- if (memcg->kmem_state == KMEM_ALLOCATED) {
- WARN_ON(!list_empty(&memcg->kmem_caches));
- static_branch_dec(&memcg_kmem_enabled_key);
- }
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -4800,9 +5023,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
- .seq_start = memcg_slab_start,
- .seq_next = memcg_slab_next,
- .seq_stop = memcg_slab_stop,
.seq_show = memcg_slab_show,
},
#endif
@@ -4917,13 +5137,18 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+ /* We charge the parent cgroup, never the current task */
+ WARN_ON_ONCE(!current->active_memcg);
+
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_local) {
kfree(pn);
return 1;
}
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_cpu) {
free_percpu(pn->lruvec_stat_local);
kfree(pn);
@@ -4997,11 +5222,16 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
}
- memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+ /* We charge the parent cgroup, never the current task */
+ WARN_ON_ONCE(!current->active_memcg);
+
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_local)
goto fail;
- memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_percpu)
goto fail;
@@ -5022,6 +5252,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
+ INIT_LIST_HEAD(&memcg->objcg_list);
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
@@ -5049,7 +5280,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
struct mem_cgroup *memcg;
long error = -ENOMEM;
+ memalloc_use_memcg(parent);
memcg = mem_cgroup_alloc();
+ memalloc_unuse_memcg();
if (IS_ERR(memcg))
return ERR_CAST(memcg);
@@ -5084,9 +5317,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* The following stuff does not apply to the root */
if (!parent) {
-#ifdef CONFIG_MEMCG_KMEM
- INIT_LIST_HEAD(&memcg->kmem_caches);
-#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -5448,7 +5678,10 @@ static int mem_cgroup_move_account(struct page *page,
*/
smp_mb();
- page->mem_cgroup = to; /* caller should have done css_get */
+ css_get(&to->css);
+ css_put(&from->css);
+
+ page->mem_cgroup = to;
__unlock_page_memcg(from);
@@ -5669,8 +5902,6 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
- css_put_many(&mc.to->css, mc.moved_swap);
-
mc.moved_swap = 0;
}
memcg_oom_recover(from);
@@ -6036,7 +6267,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long high;
int err;
@@ -6046,8 +6277,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
- page_counter_set_high(&memcg->memory, high);
-
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -6071,6 +6300,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
break;
}
+ page_counter_set_high(&memcg->memory, high);
+
+ memcg_wb_domain_size_changed(memcg);
+
return nbytes;
}
@@ -6084,7 +6317,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long max;
int err;
@@ -6391,40 +6624,42 @@ static unsigned long effective_protection(unsigned long usage,
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- * MEMCG_PROT_NONE: cgroup memory is not protected
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
- * an unprotected supply of reclaimable memory from other cgroups.
- * MEMCG_PROT_MIN: cgroup memory is protected
*/
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
unsigned long usage, parent_usage;
struct mem_cgroup *parent;
if (mem_cgroup_disabled())
- return MEMCG_PROT_NONE;
+ return;
if (!root)
root = root_mem_cgroup;
+
+ /*
+ * Effective values of the reclaim targets are ignored so they
+ * can be stale. Have a look at mem_cgroup_protection for more
+ * details.
+ * TODO: calculation should be more robust so that we do not need
+ * that special casing.
+ */
if (memcg == root)
- return MEMCG_PROT_NONE;
+ return;
usage = page_counter_read(&memcg->memory);
if (!usage)
- return MEMCG_PROT_NONE;
+ return;
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
if (!parent)
- return MEMCG_PROT_NONE;
+ return;
if (parent == root) {
memcg->memory.emin = READ_ONCE(memcg->memory.min);
memcg->memory.elow = READ_ONCE(memcg->memory.low);
- goto out;
+ return;
}
parent_usage = page_counter_read(&parent->memory);
@@ -6438,14 +6673,6 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
READ_ONCE(memcg->memory.low),
READ_ONCE(parent->memory.elow),
atomic_long_read(&parent->memory.children_low_usage)));
-
-out:
- if (usage <= memcg->memory.emin)
- return MEMCG_PROT_MIN;
- else if (usage <= memcg->memory.elow)
- return MEMCG_PROT_LOW;
- else
- return MEMCG_PROT_NONE;
}
/**
@@ -6498,6 +6725,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
if (ret)
goto out_put;
+ css_get(&memcg->css);
commit_charge(page, memcg);
local_irq_disable();
@@ -6552,9 +6780,6 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
-
- if (!mem_cgroup_is_root(ug->memcg))
- css_put_many(&ug->memcg->css, ug->nr_pages);
}
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
@@ -6592,6 +6817,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
ug->dummy_page = page;
page->mem_cgroup = NULL;
+ css_put(&ug->memcg->css);
}
static void uncharge_list(struct list_head *page_list)
@@ -6697,8 +6923,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
+ css_get(&memcg->css);
commit_charge(newpage, memcg);
local_irq_save(flags);
@@ -6821,17 +7047,6 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
-#ifdef CONFIG_MEMCG_KMEM
- /*
- * Kmem cache creation is mostly done with the slab_mutex held,
- * so use a workqueue with limited concurrency to avoid stalling
- * all worker threads in case lots of cgroups are created and
- * destroyed simultaneously.
- */
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
- BUG_ON(!memcg_kmem_cache_wq);
-#endif
-
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
@@ -6935,8 +7150,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
mem_cgroup_charge_statistics(memcg, page, -nr_entries);
memcg_check_events(memcg, page);
- if (!mem_cgroup_is_root(memcg))
- css_put_many(&memcg->css, nr_entries);
+ css_put(&memcg->css);
}
/**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 47b8ccb1fb9b..f1aa6433f404 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1648,9 +1648,12 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private)
{
- int nid = page_to_nid(p);
+ struct migration_target_control mtc = {
+ .nid = page_to_nid(p),
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
- return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
+ return alloc_migration_target(p, (unsigned long)&mtc);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 3ecad55103ad..228efaca75d3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -71,6 +71,8 @@
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
#include <trace/events/kmem.h>
@@ -437,7 +439,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
* of a chain of data-dependent loads, meaning most CPUs (alpha
* being the notable exception) will already guarantee loads are
* seen in-order. See the alpha page table accessors for the
- * smp_read_barrier_depends() barriers in page table walking code.
+ * smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
@@ -1098,7 +1100,7 @@ again:
}
entry = pte_to_swp_entry(ptent);
- if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ if (is_device_private_entry(entry)) {
struct page *page = device_private_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
@@ -1800,7 +1802,7 @@ out_unlock:
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vmf_insert_pfn(), except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
@@ -1936,7 +1938,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vmf_insert_mixed(), except that it allows drivers to
+ * This is exactly like vmf_insert_mixed(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* Typically this function should be used by drivers to set caching- and
@@ -2082,7 +2084,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
- * @addr: target user address to start at
+ * @addr: target page aligned user address to start at
* @pfn: page frame number of kernel physical memory address
* @size: size of mapping area
* @prot: page protection flags for this mapping
@@ -2101,6 +2103,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long remap_pfn = pfn;
int err;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
+ return -EINVAL;
+
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
@@ -2205,7 +2210,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
int err = 0;
- spinlock_t *uninitialized_var(ptl);
+ spinlock_t *ptl;
if (create) {
pte = (mm == &init_mm) ?
@@ -2712,7 +2717,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -3095,6 +3100,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
int locked;
int exclusive = 0;
vm_fault_t ret = 0;
+ void *shadow = NULL;
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
@@ -3146,13 +3152,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_page;
}
- /*
- * XXX: Move to lru_cache_add() when it
- * supports new vs putback
- */
- spin_lock_irq(&page_pgdat(page)->lru_lock);
- lru_note_cost_page(page);
- spin_unlock_irq(&page_pgdat(page)->lru_lock);
+ shadow = get_shadow_from_swap_cache(entry);
+ if (shadow)
+ workingset_refault(page, shadow);
lru_cache_add(page);
swap_readpage(page, true);
@@ -3263,10 +3265,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
- activate_page(page);
}
swap_free(entry);
@@ -3411,7 +3412,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3669,7 +3670,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -4357,6 +4358,67 @@ retry_pud:
return handle_pte_fault(&vmf);
}
+/**
+ * mm_account_fault - Do page fault accountings
+ *
+ * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting
+ * of perf event counters, but we'll still do the per-task accounting to
+ * the task who triggered this page fault.
+ * @address: the faulted address.
+ * @flags: the fault flags.
+ * @ret: the fault retcode.
+ *
+ * This will take care of most of the page fault accountings. Meanwhile, it
+ * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
+ * updates. However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * still be in per-arch page fault handlers at the entry of page fault.
+ */
+static inline void mm_account_fault(struct pt_regs *regs,
+ unsigned long address, unsigned int flags,
+ vm_fault_t ret)
+{
+ bool major;
+
+ /*
+ * We don't do accounting for some specific faults:
+ *
+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
+ * includes arch_vma_access_permitted() failing before reaching here.
+ * So this is not a "this many hardware page faults" counter. We
+ * should use the hw profiling for that.
+ *
+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
+ * once they're completed.
+ */
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+ return;
+
+ /*
+ * We define the fault as a major fault when the final successful fault
+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
+ * handle it immediately previously).
+ */
+ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
+
+ if (major)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+
+ /*
+ * If the fault is done for GUP, regs will be NULL. We only do the
+ * accounting for the per thread fault counters who triggered the
+ * fault, and we skip the perf event updates.
+ */
+ if (!regs)
+ return;
+
+ if (major)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ else
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -4364,7 +4426,7 @@ retry_pud:
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+ unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
@@ -4405,6 +4467,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
+ mm_account_fault(regs, address, flags, ret);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -4678,7 +4742,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
- ret = get_user_pages_remote(tsk, mm, addr, 1,
+ ret = get_user_pages_remote(mm, addr, 1,
gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index da374cd3d45b..c32ead89c911 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -350,6 +350,16 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
return err;
}
+#ifdef CONFIG_NUMA
+int __weak memory_add_physaddr_to_nid(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
@@ -831,21 +841,26 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
zone->zone_pgdat->node_present_pages += onlined_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ /*
+ * When exposing larger, physically contiguous memory areas to the
+ * buddy, shuffling in the buddy (when freeing onlined pages, putting
+ * them either to the head or the tail of the freelist) is only helpful
+ * for maintaining the shuffle, but not for creating the initial
+ * shuffle. Shuffle the whole zone to make sure the just onlined pages
+ * are properly distributed across the whole freelist.
+ */
shuffle_zone(zone);
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
+ zone_pcp_update(zone);
init_per_zone_wmark_min();
kswapd_run(nid);
kcompactd_run(nid);
- vm_total_pages = nr_free_pagecache_pages();
-
writeback_set_ratelimit();
memory_notify(MEM_ONLINE, &arg);
@@ -1261,19 +1276,23 @@ found:
static struct page *new_node_page(struct page *page, unsigned long private)
{
- int nid = page_to_nid(page);
nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nid = page_to_nid(page),
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* try to allocate from a different node but reuse this node if there
* are no other online nodes to be used (e.g. we are offlining a part
* of the only existing node)
*/
- node_clear(nid, nmask);
+ node_clear(mtc.nid, nmask);
if (nodes_empty(nmask))
- node_set(nid, nmask);
+ node_set(mtc.nid, nmask);
- return new_page_nodemask(page, nid, &nmask);
+ return alloc_migration_target(page, (unsigned long)&mtc);
}
static int
@@ -1595,7 +1614,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
kcompactd_stop(node);
}
- vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
@@ -1742,7 +1760,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
*/
rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
if (rc)
- goto done;
+ return rc;
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1766,9 +1784,8 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
try_offline_node(nid);
-done:
mem_hotplug_done();
- return rc;
+ return 0;
}
/**
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 381320671677..afaa09ff9f6c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -129,7 +129,7 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/**
* numa_map_to_online_node - Find closest online node
- * @nid: Node id to start the search
+ * @node: Node id to start the search
*
* Lookup the next closest node by distance if @nid is not online.
*/
@@ -1065,27 +1065,6 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
return 0;
}
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- else if (PageTransHuge(page)) {
- struct page *thp;
-
- thp = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- } else
- return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE, 0);
-}
-
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
@@ -1096,6 +1075,10 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct migration_target_control mtc = {
+ .nid = dest,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
nodes_clear(nmask);
node_set(source, nmask);
@@ -1110,8 +1093,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1234,7 +1217,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
- unsigned long uninitialized_var(address);
+ unsigned long address;
vma = find_vma(current->mm, start);
while (vma) {
@@ -1629,14 +1612,14 @@ static int kernel_get_mempolicy(int __user *policy,
unsigned long flags)
{
int err;
- int uninitialized_var(pval);
+ int pval;
nodemask_t nodes;
- addr = untagged_addr(addr);
-
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
+ addr = untagged_addr(addr);
+
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
@@ -1890,7 +1873,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
*/
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
diff --git a/mm/migrate.c b/mm/migrate.c
index 40cd7016ae6f..5053439be6ab 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1418,22 +1418,35 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
enum migrate_mode mode, int reason)
{
int retry = 1;
+ int thp_retry = 1;
int nr_failed = 0;
int nr_succeeded = 0;
+ int nr_thp_succeeded = 0;
+ int nr_thp_failed = 0;
+ int nr_thp_split = 0;
int pass = 0;
+ bool is_thp = false;
struct page *page;
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
- int rc;
+ int rc, nr_subpages;
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
- for(pass = 0; pass < 10 && retry; pass++) {
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
+ thp_retry = 0;
list_for_each_entry_safe(page, page2, from, lru) {
retry:
+ /*
+ * THP statistics is based on the source huge page.
+ * Capture required information that might get lost
+ * during migration.
+ */
+ is_thp = PageTransHuge(page);
+ nr_subpages = hpage_nr_pages(page);
cond_resched();
if (PageHuge(page))
@@ -1464,15 +1477,30 @@ retry:
unlock_page(page);
if (!rc) {
list_safe_reset_next(page, page2, lru);
+ nr_thp_split++;
goto retry;
}
}
+ if (is_thp) {
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ goto out;
+ }
nr_failed++;
goto out;
case -EAGAIN:
+ if (is_thp) {
+ thp_retry++;
+ break;
+ }
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ if (is_thp) {
+ nr_thp_succeeded++;
+ nr_succeeded += nr_subpages;
+ break;
+ }
nr_succeeded++;
break;
default:
@@ -1482,19 +1510,27 @@ retry:
* removed from migration page list and not
* retried in the next outer loop.
*/
+ if (is_thp) {
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ break;
+ }
nr_failed++;
break;
}
}
}
- nr_failed += retry;
+ nr_failed += retry + thp_retry;
+ nr_thp_failed += thp_retry;
rc = nr_failed;
out:
- if (nr_succeeded)
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- if (nr_failed)
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+ nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
@@ -1502,6 +1538,49 @@ out:
return rc;
}
+struct page *alloc_migration_target(struct page *page, unsigned long private)
+{
+ struct migration_target_control *mtc;
+ gfp_t gfp_mask;
+ unsigned int order = 0;
+ struct page *new_page = NULL;
+ int nid;
+ int zidx;
+
+ mtc = (struct migration_target_control *)private;
+ gfp_mask = mtc->gfp_mask;
+ nid = mtc->nid;
+ if (nid == NUMA_NO_NODE)
+ nid = page_to_nid(page);
+
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(compound_head(page));
+
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ }
+
+ if (PageTransHuge(page)) {
+ /*
+ * clear __GFP_RECLAIM to make the migration callback
+ * consistent with regular THP allocations.
+ */
+ gfp_mask &= ~__GFP_RECLAIM;
+ gfp_mask |= GFP_TRANSHUGE;
+ order = HPAGE_PMD_ORDER;
+ }
+ zidx = zone_idx(page_zone(page));
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
+ gfp_mask |= __GFP_HIGHMEM;
+
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+
+ if (new_page && PageTransHuge(new_page))
+ prep_transhuge_page(new_page);
+
+ return new_page;
+}
+
#ifdef CONFIG_NUMA
static int store_status(int __user *status, int start, int value, int nr)
@@ -1519,9 +1598,13 @@ static int do_move_pages_to_node(struct mm_struct *mm,
struct list_head *pagelist, int node)
{
int err;
+ struct migration_target_control mtc = {
+ .nid = node,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(pagelist);
return err;
@@ -2168,6 +2251,16 @@ static int migrate_vma_collect_hole(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma)) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ }
+ return 0;
+ }
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
@@ -2260,8 +2353,10 @@ again:
pte = *ptep;
if (pte_none(pte)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
goto next;
}
@@ -2276,7 +2371,9 @@ again:
goto next;
page = device_private_entry_to_page(entry);
- if (page->pgmap->owner != migrate->src_owner)
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ page->pgmap->owner != migrate->pgmap_owner)
goto next;
mpfn = migrate_pfn(page_to_pfn(page)) |
@@ -2284,7 +2381,7 @@ again:
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
- if (migrate->src_owner)
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
goto next;
pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
@@ -2379,8 +2476,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
{
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
- migrate->vma->vm_mm, migrate->start, migrate->end);
+ /*
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
+ * that the registered device driver can skip invalidating device
+ * private page mappings that won't be migrated.
+ */
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
+ migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
@@ -2611,7 +2714,7 @@ restore:
/**
* migrate_vma_setup() - prepare to migrate a range of memory
- * @args: contains the vma, start, and and pfns arrays for the migration
+ * @args: contains the vma, start, and pfns arrays for the migration
*
* Returns: negative errno on failures, 0 when 0 or more pages were migrated
* without an error.
@@ -2822,7 +2925,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
if (!is_zone_device_page(page))
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
get_page(page);
if (flush) {
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 435e5f794b3b..b06a30fbedff 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -13,6 +13,7 @@
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/sched.h>
+#include <linux/mman.h>
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -144,14 +145,23 @@ EXPORT_SYMBOL_GPL(mm_kobj);
#ifdef CONFIG_SMP
s32 vm_committed_as_batch = 32;
-static void __meminit mm_compute_batch(void)
+void mm_compute_batch(int overcommit_policy)
{
u64 memsized_batch;
s32 nr = num_present_cpus();
s32 batch = max_t(s32, nr*2, 32);
-
- /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
- memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
+ unsigned long ram_pages = totalram_pages();
+
+ /*
+ * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
+ * (total memory/#cpus), and lift it to 25% for other policies
+ * to easy the possible lock contention for percpu_counter
+ * vm_committed_as, while the max limit is INT_MAX
+ */
+ if (overcommit_policy == OVERCOMMIT_NEVER)
+ memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
+ else
+ memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
vm_committed_as_batch = max_t(s32, memsized_batch, batch);
}
@@ -162,7 +172,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
- mm_compute_batch();
+ mm_compute_batch(sysctl_overcommit_memory);
default:
break;
}
@@ -176,7 +186,7 @@ static struct notifier_block compute_batch_nb __meminitdata = {
static int __init mm_compute_batch_init(void)
{
- mm_compute_batch();
+ mm_compute_batch(sysctl_overcommit_memory);
register_hotmemory_notifier(&compute_batch_nb);
return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index 8c7ca737a19b..40248d84ad5f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1030,7 +1030,7 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
*
* We don't check here for the merged mmap wrapping around the end of pagecache
- * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
+ * indices (16TB on ia32) because do_mmap() does not permit mmap's which
* wrap, nor mmaps which cover the final page at index -1UL.
*/
static int
@@ -1365,11 +1365,11 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, vm_flags_t vm_flags,
- unsigned long pgoff, unsigned long *populate,
- struct list_head *uf)
+ unsigned long flags, unsigned long pgoff,
+ unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
+ vm_flags_t vm_flags;
int pkey = 0;
*populate = 0;
@@ -1431,7 +1431,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -1562,11 +1562,12 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
file = fget(fd);
if (!file)
return -EBADF;
- if (is_file_hugepages(file))
+ if (is_file_hugepages(file)) {
len = ALIGN(len, huge_page_size(hstate_file(file)));
- retval = -EINVAL;
- if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+ } else if (unlikely(flags & MAP_HUGETLB)) {
+ retval = -EINVAL;
goto out_fput;
+ }
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
struct hstate *hs;
@@ -1689,7 +1690,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
+ struct vm_area_struct *vma, *prev, *merge;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
@@ -1773,6 +1774,25 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (error)
goto unmap_and_free_vma;
+ /* If vm_flags changed after call_mmap(), we should try merge vma again
+ * as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
+ if (merge) {
+ fput(file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags and possible addr to pick up the change. We don't
+ * warn here if addr changed as the vma is not linked by vma_link().
+ */
+ addr = vma->vm_start;
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
@@ -1795,6 +1815,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
+unmap_writable:
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
@@ -2209,7 +2230,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
- * do_mmap_pgoff() will clear pgoff, so match alignment.
+ * do_mmap() will clear pgoff, so match alignment.
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
@@ -2982,7 +3003,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
file = get_file(vma->vm_file);
- ret = do_mmap_pgoff(vma->vm_file, start, size,
+ ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
fput(file);
out:
@@ -3171,6 +3192,7 @@ void exit_mmap(struct mm_struct *mm)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
+ cond_resched();
}
vm_unacct_memory(nr_accounted);
}
@@ -3201,7 +3223,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap_pgoff and in do_brk.
+ * Similarly in do_mmap and in do_brk.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 352bb9f3ecc0..4fc918163dd3 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -166,7 +166,7 @@ static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
/**
* mmu_interval_read_begin - Begin a read side critical section against a VA
* range
- * interval_sub: The interval subscription
+ * @interval_sub: The interval subscription
*
* mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a
* collision-retry scheme similar to seqcount for the VA range under
@@ -686,7 +686,7 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register);
/**
* mmu_notifier_register - Register a notifier on a mm
- * @mn: The notifier to attach
+ * @subscription: The notifier to attach
* @mm: The mm to attach the notifier to
*
* Must not hold mmap_lock nor any other VM related lock when calling
@@ -856,7 +856,7 @@ static void mmu_notifier_free_rcu(struct rcu_head *rcu)
/**
* mmu_notifier_put - Release the reference on the notifier
- * @mn: The notifier to act on
+ * @subscription: The notifier to act on
*
* This function must be paired with each mmu_notifier_get(), it releases the
* reference obtained by the get. If this is the last reference then process
@@ -965,7 +965,8 @@ static int __mmu_interval_notifier_insert(
* @interval_sub: Interval subscription to register
* @start: Starting virtual address to monitor
* @length: Length of the range to monitor
- * @mm : mm_struct to attach to
+ * @mm: mm_struct to attach to
+ * @ops: Interval notifier operations to be called on matching events
*
* This function subscribes the interval notifier for notifications from the
* mm. Upon return the ops related to mmu_interval_notifier will be called
diff --git a/mm/mremap.c b/mm/mremap.c
index 6b153dc05fe4..138abbae4f75 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -193,17 +193,12 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd)
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
struct mm_struct *mm = vma->vm_mm;
pmd_t pmd;
- if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
- || old_end - old_addr < PMD_SIZE)
- return false;
-
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have released it.
@@ -279,6 +274,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
extent = next - old_addr;
if (extent > old_end - old_addr)
extent = old_end - old_addr;
+ next = (new_addr + PMD_SIZE) & PMD_MASK;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
@@ -292,7 +290,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (need_rmap_locks)
take_rmap_locks(vma);
moved = move_huge_pmd(vma, old_addr, new_addr,
- old_end, old_pmd, new_pmd);
+ old_pmd, new_pmd);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved)
@@ -312,7 +310,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (need_rmap_locks)
take_rmap_locks(vma);
moved = move_normal_pmd(vma, old_addr, new_addr,
- old_end, old_pmd, new_pmd);
+ old_pmd, new_pmd);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved)
@@ -322,9 +320,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pte_alloc(new_vma->vm_mm, new_pmd))
break;
- next = (new_addr + PMD_SIZE) & PMD_MASK;
- if (extent > next - new_addr)
- extent = next - new_addr;
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
new_pmd, new_addr, need_rmap_locks);
}
diff --git a/mm/nommu.c b/mm/nommu.c
index f32a69095d50..75a327149af1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -5,7 +5,7 @@
* Replacement code for mm functions to support CPU's that don't
* have any form of memory management unit (thus no virtual memory).
*
- * See Documentation/nommu-mmap.txt
+ * See Documentation/mm/nommu-mmap.rst
*
* Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
@@ -1078,7 +1078,6 @@ unsigned long do_mmap(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
- vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
@@ -1086,6 +1085,7 @@ unsigned long do_mmap(struct file *file,
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *rb;
+ vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
@@ -1104,7 +1104,7 @@ unsigned long do_mmap(struct file *file,
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
- vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
+ vm_flags = determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
@@ -1762,8 +1762,8 @@ EXPORT_SYMBOL_GPL(access_process_vm);
* @newsize: The proposed filesize of the inode
*
* Check the shared mappings on an inode on behalf of a shrinking truncate to
- * make sure that that any outstanding VMAs aren't broken and then shrink the
- * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
+ * make sure that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend beyond so that do_mmap() doesn't
* automatically grant mappings that are too large.
*/
int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6e94962893ee..e90f25d6385d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -184,7 +184,7 @@ static bool is_dump_unreclaim_slabs(void)
global_node_page_state(NR_ISOLATED_FILE) +
global_node_page_state(NR_UNEVICTABLE);
- return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
+ return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
}
/**
@@ -196,17 +196,17 @@ static bool is_dump_unreclaim_slabs(void)
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, unsigned long totalpages)
{
long points;
long adj;
if (oom_unkillable_task(p))
- return 0;
+ return LONG_MIN;
p = find_lock_task_mm(p);
if (!p)
- return 0;
+ return LONG_MIN;
/*
* Do not even consider tasks which are explicitly marked oom
@@ -218,7 +218,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
in_vfork(p)) {
task_unlock(p);
- return 0;
+ return LONG_MIN;
}
/*
@@ -233,11 +233,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
adj *= totalpages / 1000;
points += adj;
- /*
- * Never return 0 for an eligible task regardless of the root bonus and
- * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
- */
- return points > 0 ? points : 1;
+ return points;
}
static const char * const oom_constraint_text[] = {
@@ -310,7 +306,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
- unsigned long points;
+ long points;
if (oom_unkillable_task(task))
goto next;
@@ -336,12 +332,12 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
* killed first if it triggers an oom, then select it.
*/
if (oom_task_origin(task)) {
- points = ULONG_MAX;
+ points = LONG_MAX;
goto select;
}
points = oom_badness(task, oc->totalpages);
- if (!points || points < oc->chosen_points)
+ if (points == LONG_MIN || points < oc->chosen_points)
goto next;
select:
@@ -365,6 +361,8 @@ abort:
*/
static void select_bad_process(struct oom_control *oc)
{
+ oc->chosen_points = LONG_MIN;
+
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
else {
@@ -863,6 +861,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
p = find_lock_task_mm(victim);
if (!p) {
+ pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
+ message, task_pid_nr(victim), victim->comm);
put_task_struct(victim);
return;
} else if (victim != p) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 28b3e7a67565..4e4ddd67b71e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2076,13 +2076,11 @@ static int page_writeback_cpu_online(unsigned int cpu)
* Called early on to tune the page writeback dirty limits.
*
* We used to scale dirty pages according to how total memory
- * related to pages that could be allocated for buffers (by
- * comparing nr_free_buffer_pages() to vm_total_pages.
+ * related to pages that could be allocated for buffers.
*
* However, that was when we used "dirty_ratio" to scale with
* all memory, and we don't do that any more. "dirty_ratio"
- * is now applied to total non-HIGHPAGE memory (by subtracting
- * totalhigh_pages from vm_total_pages), and as such we can't
+ * is now applied to total non-HIGHPAGE memory, and as such we can't
* get into the old insane situation any more where we had
* large amounts of dirty pages compared to a small amount of
* non-HIGHMEM memory.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e028b87ce294..8b7d0ecf30b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -459,25 +459,23 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
/**
* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+static __always_inline
+unsigned long __get_pfnblock_flags_mask(struct page *page,
unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
unsigned long *bitmap;
@@ -490,20 +488,18 @@ static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page
bitidx &= (BITS_PER_LONG-1);
word = bitmap[word_bitidx];
- bitidx += end_bitidx;
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+ return (word >> bitidx) & mask;
}
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+ return __get_pfnblock_flags_mask(page, pfn, mask);
}
static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
@@ -511,12 +507,10 @@ static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned
* @page: The page within the block of interest
* @flags: The flags to set
* @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
* @mask: mask of bits that the caller is interested in
*/
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
unsigned long *bitmap;
@@ -533,9 +527,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
- bitidx += end_bitidx;
- mask <<= (BITS_PER_LONG - bitidx - 1);
- flags <<= (BITS_PER_LONG - bitidx - 1);
+ mask <<= bitidx;
+ flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
@@ -552,8 +545,8 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
- set_pageblock_flags_group(page, (unsigned long)migratetype,
- PB_migrate, PB_migrate_end);
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
+ page_to_pfn(page), MIGRATETYPE_MASK);
}
#ifdef CONFIG_DEBUG_VM
@@ -813,11 +806,10 @@ static inline struct capture_control *task_capc(struct zone *zone)
{
struct capture_control *capc = current->capture_control;
- return capc &&
+ return unlikely(capc) &&
!(current->flags & PF_KTHREAD) &&
!capc->page &&
- capc->cc->zone == zone &&
- capc->cc->direct_compaction ? capc : NULL;
+ capc->cc->zone == zone ? capc : NULL;
}
static inline bool
@@ -961,7 +953,7 @@ static inline void __free_one_page(struct page *page,
int migratetype, bool report)
{
struct capture_control *capc = task_capc(zone);
- unsigned long uninitialized_var(buddy_pfn);
+ unsigned long buddy_pfn;
unsigned long combined_pfn;
unsigned int max_order;
struct page *buddy;
@@ -1164,8 +1156,11 @@ static void kernel_init_free_pages(struct page *page, int numpages)
{
int i;
+ /* s390's use of memset() could override KASAN redzones. */
+ kasan_disable_current();
for (i = 0; i < numpages; i++)
clear_highpage(page + i);
+ kasan_enable_current();
}
static __always_inline bool free_pages_prepare(struct page *page,
@@ -2273,7 +2268,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
-static int fallbacks[MIGRATE_TYPES][4] = {
+static int fallbacks[MIGRATE_TYPES][3] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
@@ -2790,7 +2785,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
* allocating from CMA when over half of the zone's free memory
* is in the CMA area.
*/
- if (migratetype == MIGRATE_MOVABLE &&
+ if (alloc_flags & ALLOC_CMA &&
zone_page_state(zone, NR_FREE_CMA_PAGES) >
zone_page_state(zone, NR_FREE_PAGES) / 2) {
page = __rmqueue_cma_fallback(zone, order);
@@ -2801,7 +2796,7 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
- if (migratetype == MIGRATE_MOVABLE)
+ if (alloc_flags & ALLOC_CMA)
page = __rmqueue_cma_fallback(zone, order);
if (!page && __rmqueue_fallback(zone, order, migratetype,
@@ -3487,6 +3482,29 @@ static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+static inline long __zone_watermark_unusable_free(struct zone *z,
+ unsigned int order, unsigned int alloc_flags)
+{
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+ long unusable_free = (1 << order) - 1;
+
+ /*
+ * If the caller does not have rights to ALLOC_HARDER then subtract
+ * the high-atomic reserves. This will over-estimate the size of the
+ * atomic reserve but it avoids a search.
+ */
+ if (likely(!alloc_harder))
+ unusable_free += z->nr_reserved_highatomic;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+ return unusable_free;
+}
+
/*
* Return true if free base pages are above 'mark'. For high-order checks it
* will return true of the order-0 watermark is reached and there is at least
@@ -3502,19 +3520,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
- free_pages -= (1 << order) - 1;
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
- /*
- * If the caller does not have rights to ALLOC_HARDER then subtract
- * the high-atomic reserves. This will over-estimate the size of the
- * atomic reserve but it avoids a search.
- */
- if (likely(!alloc_harder)) {
- free_pages -= z->nr_reserved_highatomic;
- } else {
+ if (unlikely(alloc_harder)) {
/*
* OOM victims can try even harder than normal ALLOC_HARDER
* users on the grounds that it's definitely going to be in
@@ -3527,13 +3538,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
min -= min / 4;
}
-
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
-
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
@@ -3580,30 +3584,42 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, gfp_t gfp_mask)
{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
- long cma_pages = 0;
+ long free_pages;
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
/*
* Fast check for order-0 only. If this fails then the reserves
- * need to be calculated. There is a corner case where the check
- * passes but only the high-order atomic reserve are free. If
- * the caller is !atomic then it'll uselessly search the free
- * list. That corner case is then slower but it is harmless.
+ * need to be calculated.
*/
- if (!order && (free_pages - cma_pages) >
- mark + z->lowmem_reserve[highest_zoneidx])
+ if (!order) {
+ long fast_free;
+
+ fast_free = free_pages;
+ fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
+ if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+ return true;
+ }
+
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+ free_pages))
return true;
+ /*
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * when checking the min watermark. The min watermark is the
+ * point where boosting is ignored so that kswapd is woken up
+ * when below the low watermark.
+ */
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
+ mark = z->_watermark[WMARK_MIN];
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
+ alloc_flags, free_pages);
+ }
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
- free_pages);
+ return false;
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
@@ -3671,6 +3687,20 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
return alloc_flags;
}
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
+ unsigned int alloc_flags)
+{
+#ifdef CONFIG_CMA
+ unsigned int pflags = current->flags;
+
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+
+#endif
+ return alloc_flags;
+}
+
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -3747,7 +3777,8 @@ retry:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
- ac->highest_zoneidx, alloc_flags)) {
+ ac->highest_zoneidx, alloc_flags,
+ gfp_mask)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -4251,7 +4282,7 @@ retry:
/*
* If an allocation failed after direct reclaim, it could be because
* pages are pinned on the per-cpu lists or in high alloc reserves.
- * Shrink them them and try again
+ * Shrink them and try again
*/
if (!page && !drained) {
unreserve_highatomic_pageblock(ac, false);
@@ -4316,10 +4347,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
-#ifdef CONFIG_CMA
- if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
-#endif
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+
return alloc_flags;
}
@@ -4620,7 +4649,7 @@ retry:
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
- alloc_flags = reserve_flags;
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
@@ -4697,7 +4726,7 @@ retry:
/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
- (alloc_flags == ALLOC_OOM ||
+ (alloc_flags & ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
@@ -4771,7 +4800,11 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
- if (!ac->nodemask)
+ /*
+ * When we are in the interrupt context, it is irrelevant
+ * to the current task context. It means that any node ok.
+ */
+ if (!in_interrupt() && !ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
@@ -4785,8 +4818,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
- *alloc_flags |= ALLOC_CMA;
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
return true;
}
@@ -5165,19 +5197,6 @@ unsigned long nr_free_buffer_pages(void)
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-/**
- * nr_free_pagecache_pages - count number of pages beyond high watermark
- *
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
- * high watermark within all zones.
- *
- * Return: number of pages beyond high watermark within all zones.
- */
-unsigned long nr_free_pagecache_pages(void)
-{
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
-}
-
static inline void show_node(struct zone *zone)
{
if (IS_ENABLED(CONFIG_NUMA))
@@ -5220,8 +5239,8 @@ long si_mem_available(void)
* items that are in use, and cannot be freed. Cap this estimate at the
* low watermark.
*/
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
@@ -5364,8 +5383,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
- global_node_page_state(NR_SLAB_RECLAIMABLE),
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_zone_page_state(NR_PAGETABLE),
@@ -5396,6 +5415,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -5417,6 +5440,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -5448,10 +5475,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " kernel_stack:%lukB"
-#ifdef CONFIG_SHADOW_CALL_STACK
- " shadow_call_stack:%lukB"
-#endif
" pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
@@ -5473,10 +5496,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- zone_page_state(zone, NR_KERNEL_STACK_KB),
-#ifdef CONFIG_SHADOW_CALL_STACK
- zone_page_state(zone, NR_KERNEL_SCS_KB),
-#endif
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
@@ -5891,13 +5910,16 @@ build_all_zonelists_init(void)
*/
void __ref build_all_zonelists(pg_data_t *pgdat)
{
+ unsigned long vm_total_pages;
+
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
__build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
- vm_total_pages = nr_free_pagecache_pages();
+ /* Get the number of free pages beyond high watermark in all zones. */
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
/*
* Disable grouping by mobility if the number of pages in the
* system is too low to allow the mechanism to work. It would be
@@ -6170,7 +6192,7 @@ static int zone_batchsize(struct zone *zone)
* locking.
*
* Any new users of pcp->batch and pcp->high should ensure they can cope with
- * those fields changing asynchronously (acording the the above rule).
+ * those fields changing asynchronously (acording to the above rule).
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
@@ -6325,22 +6347,6 @@ void __meminit init_currently_empty_zone(struct zone *zone,
}
/**
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
- *
- * If an architecture guarantees that all ranges registered contain no holes and may
- * be freed, this function may be used instead of calling memory_present() manually.
- */
-void __init sparse_memory_present_with_active_regions(int nid)
-{
- unsigned long start_pfn, end_pfn;
- int i, this_nid;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
- memory_present(this_nid, start_pfn, end_pfn);
-}
-
-/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
* @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -8197,7 +8203,7 @@ void *__init alloc_large_system_hash(const char *tablename,
* race condition. So you can't expect this function should be exact.
*
* Returns a page without holding a reference. If the caller wants to
- * dereference that page (e.g., dumping), it has to make sure that that it
+ * dereference that page (e.g., dumping), it has to make sure that it
* cannot get removed (e.g., via memory unplug) concurrently.
*
*/
@@ -8341,6 +8347,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
+ struct migration_target_control mtc = {
+ .nid = zone_to_nid(cc->zone),
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
migrate_prep();
@@ -8367,8 +8377,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
diff --git a/mm/page_counter.c b/mm/page_counter.c
index c56db2d5e159..b4663844c9b3 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -72,7 +72,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -116,7 +116,7 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -125,7 +125,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
diff --git a/mm/page_io.c b/mm/page_io.c
index e8726f3e3820..9e362567d454 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -277,6 +277,23 @@ static inline void count_swpout_vm_event(struct page *page)
count_vm_events(PSWPOUT, hpage_nr_pages(page));
}
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
+{
+ struct cgroup_subsys_state *css;
+
+ if (!page->mem_cgroup)
+ return;
+
+ rcu_read_lock();
+ css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+ bio_associate_blkg_from_css(bio, css);
+ rcu_read_unlock();
+}
+#else
+#define bio_associate_blkg_from_page(bio, page) do { } while (0)
+#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
@@ -424,7 +441,7 @@ int swap_readpage(struct page *page, bool synchronous)
break;
if (!blk_poll(disk->queue, qc, true))
- io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f6d07c5f0d34..242c03121d73 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -306,8 +306,3 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
return pfn < end_pfn ? -EBUSY : 0;
}
-
-struct page *alloc_migrate_target(struct page *page, unsigned long private)
-{
- return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
-}
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 0468ba500bd4..18b768ac7dca 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -6,6 +6,25 @@
#include <linux/percpu.h>
/*
+ * There are two chunk types: root and memcg-aware.
+ * Chunks of each type have separate slots list.
+ *
+ * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
+ * used to store memcg membership data of a percpu object. Obj_cgroups are
+ * ref-counted pointers to a memory cgroup with an ability to switch dynamically
+ * to the parent memory cgroup. This allows to reclaim a deleted memory cgroup
+ * without reclaiming of all outstanding objects, which hold a reference at it.
+ */
+enum pcpu_chunk_type {
+ PCPU_CHUNK_ROOT,
+#ifdef CONFIG_MEMCG_KMEM
+ PCPU_CHUNK_MEMCG,
+#endif
+ PCPU_NR_CHUNK_TYPES,
+ PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
+};
+
+/*
* pcpu_block_md is the metadata block struct.
* Each chunk's bitmap is split into a number of full blocks.
* All units are in terms of bits.
@@ -54,6 +73,9 @@ struct pcpu_chunk {
int end_offset; /* additional area required to
have the region end page
aligned */
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup **obj_cgroups; /* vector of object cgroups */
+#endif
int nr_pages; /* # of pages served by this chunk */
int nr_populated; /* # of populated pages */
@@ -63,7 +85,7 @@ struct pcpu_chunk {
extern spinlock_t pcpu_lock;
-extern struct list_head *pcpu_slot;
+extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_nr_empty_pop_pages;
@@ -106,6 +128,37 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
+#ifdef CONFIG_MEMCG_KMEM
+static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+{
+ if (chunk->obj_cgroups)
+ return PCPU_CHUNK_MEMCG;
+ return PCPU_CHUNK_ROOT;
+}
+
+static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
+{
+ return chunk_type == PCPU_CHUNK_MEMCG;
+}
+
+#else
+static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+{
+ return PCPU_CHUNK_ROOT;
+}
+
+static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
+{
+ return false;
+}
+#endif
+
+static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
+{
+ return &pcpu_chunk_lists[pcpu_nr_slots *
+ pcpu_is_memcg_chunk(chunk_type)];
+}
+
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 20d2b69a13b0..35c9941077ee 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -44,7 +44,8 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
/* nada */
}
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
@@ -52,7 +53,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
unsigned long flags;
int i;
- chunk = pcpu_alloc_chunk(gfp);
+ chunk = pcpu_alloc_chunk(type, gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 32558063c3f9..c8400a2adbc2 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -34,11 +34,15 @@ static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
+ enum pcpu_chunk_type type;
max_nr_alloc = 0;
- for (slot = 0; slot < pcpu_nr_slots; slot++)
- list_for_each_entry(chunk, &pcpu_slot[slot], list)
- max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
+ list)
+ max_nr_alloc = max(max_nr_alloc,
+ chunk->nr_alloc);
return max_nr_alloc;
}
@@ -129,6 +133,9 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("cur_min_alloc", cur_min_alloc);
P("cur_med_alloc", cur_med_alloc);
P("cur_max_alloc", cur_max_alloc);
+#ifdef CONFIG_MEMCG_KMEM
+ P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
+#endif
seq_putc(m, '\n');
}
@@ -137,6 +144,7 @@ static int percpu_stats_show(struct seq_file *m, void *v)
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
int *buffer;
+ enum pcpu_chunk_type type;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
@@ -202,18 +210,18 @@ alloc_buffer:
chunk_map_stats(m, pcpu_reserved_chunk, buffer);
}
- for (slot = 0; slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
- if (chunk == pcpu_first_chunk) {
- seq_puts(m, "Chunk: <- First Chunk\n");
- chunk_map_stats(m, chunk, buffer);
-
-
- } else {
- seq_puts(m, "Chunk:\n");
- chunk_map_stats(m, chunk, buffer);
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
+ for (slot = 0; slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
+ list) {
+ if (chunk == pcpu_first_chunk) {
+ seq_puts(m, "Chunk: <- First Chunk\n");
+ chunk_map_stats(m, chunk, buffer);
+ } else {
+ seq_puts(m, "Chunk:\n");
+ chunk_map_stats(m, chunk, buffer);
+ }
}
-
}
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index a2b395acef89..e46f7a6917f9 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -328,12 +328,13 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
pcpu_free_pages(chunk, pages, page_start, page_end);
}
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
- chunk = pcpu_alloc_chunk(gfp);
+ chunk = pcpu_alloc_chunk(type, gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index 696367b18222..f4709629e6de 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -37,9 +37,14 @@
* takes care of normal allocations.
*
* The allocator organizes chunks into lists according to free size and
- * tries to allocate from the fullest chunk first. Each chunk is managed
- * by a bitmap with metadata blocks. The allocation map is updated on
- * every allocation and free to reflect the current state while the boundary
+ * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT
+ * flag should be passed. All memcg-aware allocations are sharing one set
+ * of chunks and all unaccounted allocations and allocations performed
+ * by processes belonging to the root memory cgroup are using the second set.
+ *
+ * The allocator tries to allocate from the fullest chunk first. Each chunk
+ * is managed by a bitmap with metadata blocks. The allocation map is updated
+ * on every allocation and free to reflect the current state while the boundary
* map is only updated on allocation. Each metadata block contains
* information to help mitigate the need to iterate over large portions
* of the bitmap. The reverse mapping from page to chunk is stored in
@@ -81,6 +86,7 @@
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/memcontrol.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
@@ -160,7 +166,7 @@ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
-struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
+struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);
@@ -500,6 +506,9 @@ static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
bool move_front)
{
if (chunk != pcpu_reserved_chunk) {
+ struct list_head *pcpu_slot;
+
+ pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
if (move_front)
list_move(&chunk->list, &pcpu_slot[slot]);
else
@@ -1211,11 +1220,14 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
*
* This function determines the size of an allocation to free using
* the boundary bitmap and clears the allocation map.
+ *
+ * RETURNS:
+ * Number of freed bytes.
*/
-static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
+static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
- int bit_off, bits, end, oslot;
+ int bit_off, bits, end, oslot, freed;
lockdep_assert_held(&pcpu_lock);
pcpu_stats_area_dealloc(chunk);
@@ -1230,8 +1242,10 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
bits = end - bit_off;
bitmap_clear(chunk->alloc_map, bit_off, bits);
+ freed = bits * PCPU_MIN_ALLOC_SIZE;
+
/* update metadata */
- chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
+ chunk->free_bytes += freed;
/* update first free bit */
chunk_md->first_free = min(chunk_md->first_free, bit_off);
@@ -1239,6 +1253,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
pcpu_block_update_hint_free(chunk, bit_off, bits);
pcpu_chunk_relocate(chunk, oslot);
+
+ return freed;
}
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
@@ -1334,6 +1350,10 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
+#ifdef CONFIG_MEMCG_KMEM
+ /* first chunk isn't memcg-aware */
+ chunk->obj_cgroups = NULL;
+#endif
pcpu_init_md_blocks(chunk);
/* manage populated page bitmap */
@@ -1373,7 +1393,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
@@ -1401,6 +1421,16 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
if (!chunk->md_blocks)
goto md_blocks_fail;
+#ifdef CONFIG_MEMCG_KMEM
+ if (pcpu_is_memcg_chunk(type)) {
+ chunk->obj_cgroups =
+ pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
+ sizeof(struct obj_cgroup *), gfp);
+ if (!chunk->obj_cgroups)
+ goto objcg_fail;
+ }
+#endif
+
pcpu_init_md_blocks(chunk);
/* init metadata */
@@ -1408,6 +1438,10 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
return chunk;
+#ifdef CONFIG_MEMCG_KMEM
+objcg_fail:
+ pcpu_mem_free(chunk->md_blocks);
+#endif
md_blocks_fail:
pcpu_mem_free(chunk->bound_map);
bound_map_fail:
@@ -1422,6 +1456,9 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
+#ifdef CONFIG_MEMCG_KMEM
+ pcpu_mem_free(chunk->obj_cgroups);
+#endif
pcpu_mem_free(chunk->md_blocks);
pcpu_mem_free(chunk->bound_map);
pcpu_mem_free(chunk->alloc_map);
@@ -1498,7 +1535,8 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end);
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1540,6 +1578,87 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
+#ifdef CONFIG_MEMCG_KMEM
+static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+ struct obj_cgroup **objcgp)
+{
+ struct obj_cgroup *objcg;
+
+ if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) ||
+ memcg_kmem_bypass())
+ return PCPU_CHUNK_ROOT;
+
+ objcg = get_obj_cgroup_from_current();
+ if (!objcg)
+ return PCPU_CHUNK_ROOT;
+
+ if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
+ obj_cgroup_put(objcg);
+ return PCPU_FAIL_ALLOC;
+ }
+
+ *objcgp = objcg;
+ return PCPU_CHUNK_MEMCG;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+ if (!objcg)
+ return;
+
+ if (chunk) {
+ chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ size * num_possible_cpus());
+ rcu_read_unlock();
+ } else {
+ obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+ obj_cgroup_put(objcg);
+ }
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+ struct obj_cgroup *objcg;
+
+ if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
+ return;
+
+ objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
+ chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
+
+ obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ -(size * num_possible_cpus()));
+ rcu_read_unlock();
+
+ obj_cgroup_put(objcg);
+}
+
+#else /* CONFIG_MEMCG_KMEM */
+static enum pcpu_chunk_type
+pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
+{
+ return PCPU_CHUNK_ROOT;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
/**
* pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
@@ -1561,6 +1680,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t pcpu_gfp;
bool is_atomic;
bool do_warn;
+ enum pcpu_chunk_type type;
+ struct list_head *pcpu_slot;
+ struct obj_cgroup *objcg = NULL;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
const char *err;
@@ -1595,16 +1717,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
return NULL;
}
+ type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
+ if (unlikely(type == PCPU_FAIL_ALLOC))
+ return NULL;
+ pcpu_slot = pcpu_chunk_list(type);
+
if (!is_atomic) {
/*
* pcpu_balance_workfn() allocates memory under this mutex,
* and it may wait for memory reclaim. Allow current task
* to become OOM victim, in case of memory pressure.
*/
- if (gfp & __GFP_NOFAIL)
+ if (gfp & __GFP_NOFAIL) {
mutex_lock(&pcpu_alloc_mutex);
- else if (mutex_lock_killable(&pcpu_alloc_mutex))
+ } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
return NULL;
+ }
}
spin_lock_irqsave(&pcpu_lock, flags);
@@ -1659,7 +1788,7 @@ restart:
}
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk(pcpu_gfp);
+ chunk = pcpu_create_chunk(type, pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1716,6 +1845,8 @@ area_found:
trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
chunk->base_addr, off, ptr);
+ pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
+
return ptr;
fail_unlock:
@@ -1737,6 +1868,9 @@ fail:
} else {
mutex_unlock(&pcpu_alloc_mutex);
}
+
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
+
return NULL;
}
@@ -1796,8 +1930,8 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
}
/**
- * pcpu_balance_workfn - manage the amount of free chunks and populated pages
- * @work: unused
+ * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @type: chunk type
*
* Reclaim all fully free chunks except for the first one. This is also
* responsible for maintaining the pool of empty populated pages. However,
@@ -1806,11 +1940,12 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
* allocation causes the failure as it is possible that requests can be
* serviced from already backed regions.
*/
-static void pcpu_balance_workfn(struct work_struct *work)
+static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
{
/* gfp flags passed to underlying allocators */
const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
+ struct list_head *pcpu_slot = pcpu_chunk_list(type);
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
int slot, nr_to_pop, ret;
@@ -1908,7 +2043,7 @@ retry_pop:
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
- chunk = pcpu_create_chunk(gfp);
+ chunk = pcpu_create_chunk(type, gfp);
if (chunk) {
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
@@ -1921,6 +2056,20 @@ retry_pop:
}
/**
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @work: unused
+ *
+ * Call __pcpu_balance_workfn() for each chunk type.
+ */
+static void pcpu_balance_workfn(struct work_struct *work)
+{
+ enum pcpu_chunk_type type;
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ __pcpu_balance_workfn(type);
+}
+
+/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
@@ -1934,8 +2083,9 @@ void free_percpu(void __percpu *ptr)
void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
- int off;
+ int size, off;
bool need_balance = false;
+ struct list_head *pcpu_slot;
if (!ptr)
return;
@@ -1949,7 +2099,11 @@ void free_percpu(void __percpu *ptr)
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
- pcpu_free_area(chunk, off);
+ size = pcpu_free_area(chunk, off);
+
+ pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
+
+ pcpu_memcg_free_hook(chunk, off, size);
/* if there are more than one fully free chunks, wake up grim reaper */
if (chunk->free_bytes == pcpu_unit_size) {
@@ -2260,6 +2414,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int map_size;
unsigned long tmp_addr;
size_t alloc_size;
+ enum pcpu_chunk_type type;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -2377,13 +2532,18 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]),
- SMP_CACHE_BYTES);
- if (!pcpu_slot)
+ pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
+ sizeof(pcpu_chunk_lists[0]) *
+ PCPU_NR_CHUNK_TYPES,
+ SMP_CACHE_BYTES);
+ if (!pcpu_chunk_lists)
panic("%s: Failed to allocate %zu bytes\n", __func__,
- pcpu_nr_slots * sizeof(pcpu_slot[0]));
- for (i = 0; i < pcpu_nr_slots; i++)
- INIT_LIST_HEAD(&pcpu_slot[i]);
+ pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
+ PCPU_NR_CHUNK_TYPES);
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ for (i = 0; i < pcpu_nr_slots; i++)
+ INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
/*
* The end of the static region needs to be aligned with the
@@ -2513,7 +2673,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
- int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int upa, max_upa, best_upa; /* units_per_alloc */
int last_allocs, group, unit;
unsigned int cpu, tcpu;
struct pcpu_alloc_info *ai;
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
new file mode 100644
index 000000000000..1dcc865029a2
--- /dev/null
+++ b/mm/pgalloc-track.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGALLLC_TRACK_H
+#define _LINUX_PGALLLC_TRACK_H
+
+#if defined(CONFIG_MMU)
+static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(pgd_none(*pgd))) {
+ if (__p4d_alloc(mm, pgd, address))
+ return NULL;
+ *mod_mask |= PGTBL_PGD_MODIFIED;
+ }
+
+ return p4d_offset(pgd, address);
+}
+
+static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(p4d_none(*p4d))) {
+ if (__pud_alloc(mm, p4d, address))
+ return NULL;
+ *mod_mask |= PGTBL_P4D_MODIFIED;
+ }
+
+ return pud_offset(p4d, address);
+}
+
+static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{
+ if (unlikely(pud_none(*pud))) {
+ if (__pmd_alloc(mm, pud, address))
+ return NULL;
+ *mod_mask |= PGTBL_PUD_MODIFIED;
+ }
+
+ return pmd_offset(pud, address);
+}
+#endif /* CONFIG_MMU */
+
+#define pte_alloc_kernel_track(pmd, address, mask) \
+ ((unlikely(pmd_none(*(pmd))) && \
+ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
+ NULL: pte_offset_kernel(pmd, address))
+
+#endif /* _LINUX_PGALLLC_TRACK_H */
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index cc85ce81914a..29c052099aff 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -105,7 +105,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
* current/current->mm
*/
mmap_read_lock(mm);
- pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
+ pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
NULL, &locked);
if (locked)
diff --git a/mm/rmap.c b/mm/rmap.c
index 5fe2dedce1fc..6cce9ef06753 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1469,7 +1469,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* do this outside rmap routines.
*/
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
* page. There is no way of knowing exactly
diff --git a/mm/shmem.c b/mm/shmem.c
index b2abca3f7f33..271548ca20f3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -114,11 +114,13 @@ struct shmem_options {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ bool full_inums;
int huge;
int seen;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
};
#ifdef CONFIG_TMPFS
@@ -260,18 +262,76 @@ bool vma_is_shmem(struct vm_area_struct *vma)
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo->max_inodes) {
+ ino_t ino;
+
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
spin_lock(&sbinfo->stat_lock);
if (!sbinfo->free_inodes) {
spin_unlock(&sbinfo->stat_lock);
return -ENOSPC;
}
sbinfo->free_inodes--;
+ if (inop) {
+ ino = sbinfo->next_ino++;
+ if (unlikely(is_zero_ino(ino)))
+ ino = sbinfo->next_ino++;
+ if (unlikely(!sbinfo->full_inums &&
+ ino > UINT_MAX)) {
+ /*
+ * Emulate get_next_ino uint wraparound for
+ * compatibility
+ */
+ if (IS_ENABLED(CONFIG_64BIT))
+ pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+ __func__, MINOR(sb->s_dev));
+ sbinfo->next_ino = 1;
+ ino = sbinfo->next_ino++;
+ }
+ *inop = ino;
+ }
spin_unlock(&sbinfo->stat_lock);
+ } else if (inop) {
+ /*
+ * __shmem_file_setup, one of our callers, is lock-free: it
+ * doesn't hold stat_lock in shmem_reserve_inode since
+ * max_inodes is always 0, and is called from potentially
+ * unknown contexts. As such, use a per-cpu batched allocator
+ * which doesn't require the per-sb stat_lock unless we are at
+ * the batch boundary.
+ *
+ * We don't need to worry about inode{32,64} since SB_KERNMOUNT
+ * shmem mounts are not exposed to userspace, so we don't need
+ * to worry about things like glibc compatibility.
+ */
+ ino_t *next_ino;
+ next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+ ino = *next_ino;
+ if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+ spin_lock(&sbinfo->stat_lock);
+ ino = sbinfo->next_ino;
+ sbinfo->next_ino += SHMEM_INO_BATCH;
+ spin_unlock(&sbinfo->stat_lock);
+ if (unlikely(is_zero_ino(ino)))
+ ino++;
+ }
+ *inop = ino;
+ *next_ino = ++ino;
+ put_cpu();
}
+
return 0;
}
@@ -1374,7 +1434,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
list_add(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) {
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ NULL) == 0) {
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
info->swapped++;
@@ -1625,7 +1686,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* Swap in the page pointed to by *pagep.
* Caller has to make sure that *pagep contains a valid swapped page.
* Returns 0 and the page in pagep if success. On failure, returns the
- * the error code and NULL in *pagep.
+ * error code and NULL in *pagep.
*/
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
@@ -2222,13 +2283,14 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ ino_t ino;
- if (shmem_reserve_inode(sb))
+ if (shmem_reserve_inode(sb, &ino))
return NULL;
inode = new_inode(sb);
if (inode) {
- inode->i_ino = get_next_ino();
+ inode->i_ino = ino;
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -2932,7 +2994,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
* first link must skip that, to get the accounting right.
*/
if (inode->i_nlink) {
- ret = shmem_reserve_inode(inode->i_sb);
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
if (ret)
goto out;
}
@@ -3347,6 +3409,8 @@ enum shmem_param {
Opt_nr_inodes,
Opt_size,
Opt_uid,
+ Opt_inode32,
+ Opt_inode64,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3366,6 +3430,8 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("size", Opt_size),
fsparam_u32 ("uid", Opt_uid),
+ fsparam_flag ("inode32", Opt_inode32),
+ fsparam_flag ("inode64", Opt_inode64),
{}
};
@@ -3437,6 +3503,18 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
}
goto unsupported_parameter;
+ case Opt_inode32:
+ ctx->full_inums = false;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
+ case Opt_inode64:
+ if (sizeof(ino_t) < 8) {
+ return invalfc(fc,
+ "Cannot use inode64 with <64bit inums in kernel\n");
+ }
+ ctx->full_inums = true;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
}
return 0;
@@ -3528,8 +3606,16 @@ static int shmem_reconfigure(struct fs_context *fc)
}
}
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
+ sbinfo->next_ino > UINT_MAX) {
+ err = "Current inum too high to switch to 32-bit inums";
+ goto out;
+ }
+
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
+ if (ctx->seen & SHMEM_SEEN_INUMS)
+ sbinfo->full_inums = ctx->full_inums;
if (ctx->seen & SHMEM_SEEN_BLOCKS)
sbinfo->max_blocks = ctx->blocks;
if (ctx->seen & SHMEM_SEEN_INODES) {
@@ -3569,6 +3655,29 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
+
+ /*
+ * Showing inode{64,32} might be useful even if it's the system default,
+ * since then people don't have to resort to checking both here and
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
+ *
+ * We hide it when inode64 isn't the default and we are using 32-bit
+ * inodes, since that probably just means the feature isn't even under
+ * consideration.
+ *
+ * As such:
+ *
+ * +-----------------+-----------------+
+ * | TMPFS_INODE64=y | TMPFS_INODE64=n |
+ * +------------------+-----------------+-----------------+
+ * | full_inums=true | show | show |
+ * | full_inums=false | show | hide |
+ * +------------------+-----------------+-----------------+
+ *
+ */
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
if (sbinfo->huge)
@@ -3584,6 +3693,7 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
mpol_put(sbinfo->mpol);
kfree(sbinfo);
@@ -3616,6 +3726,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
ctx->blocks = shmem_default_max_blocks();
if (!(ctx->seen & SHMEM_SEEN_INODES))
ctx->inodes = shmem_default_max_inodes();
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
} else {
sb->s_flags |= SB_NOUSER;
}
@@ -3626,8 +3738,14 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
sbinfo->max_blocks = ctx->blocks;
sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ if (sb->s_flags & SB_KERNMOUNT) {
+ sbinfo->ino_batch = alloc_percpu(ino_t);
+ if (!sbinfo->ino_batch)
+ goto failed;
+ }
sbinfo->uid = ctx->uid;
sbinfo->gid = ctx->gid;
+ sbinfo->full_inums = ctx->full_inums;
sbinfo->mode = ctx->mode;
sbinfo->huge = ctx->huge;
sbinfo->mpol = ctx->mpol;
@@ -4128,7 +4246,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
/**
* shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ * @vma: the vma to be mmapped is prepared by do_mmap
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 44406d9977c7..9b5cd4b004b0 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -10,33 +10,11 @@
#include "shuffle.h"
DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
-static unsigned long shuffle_state __ro_after_init;
-
-/*
- * Depending on the architecture, module parameter parsing may run
- * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
- * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
- * attempts to turn on the implementation, but aborts if it finds
- * SHUFFLE_FORCE_DISABLE already set.
- */
-__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
-{
- if (ctl == SHUFFLE_FORCE_DISABLE)
- set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
-
- if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
- if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
- static_branch_disable(&page_alloc_shuffle_key);
- } else if (ctl == SHUFFLE_ENABLE
- && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
- static_branch_enable(&page_alloc_shuffle_key);
-}
static bool shuffle_param;
static int shuffle_show(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
- ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N');
}
static __meminit int shuffle_store(const char *val,
@@ -47,9 +25,7 @@ static __meminit int shuffle_store(const char *val,
if (rc < 0)
return rc;
if (shuffle_param)
- page_alloc_shuffle(SHUFFLE_ENABLE);
- else
- page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+ static_branch_enable(&page_alloc_shuffle_key);
return 0;
}
module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
@@ -58,25 +34,25 @@ module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
* For two pages to be swapped in the shuffle, they must be free (on a
* 'free_area' lru), have the same order, and have the same migratetype.
*/
-static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+static struct page * __meminit shuffle_valid_page(struct zone *zone,
+ unsigned long pfn, int order)
{
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
/*
* Given we're dealing with randomly selected pfns in a zone we
* need to ask questions like...
*/
- /* ...is the pfn even in the memmap? */
- if (!pfn_valid_within(pfn))
+ /* ... is the page managed by the buddy? */
+ if (!page)
return NULL;
- /* ...is the pfn in a present section or a hole? */
- if (!pfn_in_present_section(pfn))
+ /* ... is the page assigned to the same zone? */
+ if (page_zone(page) != zone)
return NULL;
/* ...is the page free and currently on a free_area list? */
- page = pfn_to_page(pfn);
if (!PageBuddy(page))
return NULL;
@@ -123,7 +99,7 @@ void __meminit __shuffle_zone(struct zone *z)
* page_j randomly selected in the span @zone_start_pfn to
* @spanned_pages.
*/
- page_i = shuffle_valid_page(i, order);
+ page_i = shuffle_valid_page(z, i, order);
if (!page_i)
continue;
@@ -137,7 +113,7 @@ void __meminit __shuffle_zone(struct zone *z)
j = z->zone_start_pfn +
ALIGN_DOWN(get_random_long() % z->spanned_pages,
order_pages);
- page_j = shuffle_valid_page(j, order);
+ page_j = shuffle_valid_page(z, j, order);
if (page_j && page_j != page_i)
break;
}
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 4d79f03b6658..71b784f0b7c3 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -4,23 +4,10 @@
#define _MM_SHUFFLE_H
#include <linux/jump_label.h>
-/*
- * SHUFFLE_ENABLE is called from the command line enabling path, or by
- * platform-firmware enabling that indicates the presence of a
- * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
- * the command line path and overrides any previous or future
- * SHUFFLE_ENABLE.
- */
-enum mm_shuffle_ctl {
- SHUFFLE_ENABLE,
- SHUFFLE_FORCE_DISABLE,
-};
-
#define SHUFFLE_ORDER (MAX_ORDER-1)
#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
-extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
extern void __shuffle_free_memory(pg_data_t *pgdat);
extern bool shuffle_pick_tail(void);
static inline void shuffle_free_memory(pg_data_t *pgdat)
@@ -58,10 +45,6 @@ static inline void shuffle_zone(struct zone *z)
{
}
-static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
-{
-}
-
static inline bool is_shuffle_order(int order)
{
return false;
diff --git a/mm/slab.c b/mm/slab.c
index 9350062ffc1a..3160dff6fd76 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -588,6 +588,16 @@ static int transfer_objects(struct array_cache *to,
return nr;
}
+/* &alien->lock must be held by alien callers. */
+static __always_inline void __free_one(struct array_cache *ac, void *objp)
+{
+ /* Avoid trivial double-free. */
+ if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+ WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp))
+ return;
+ ac->entry[ac->avail++] = objp;
+}
+
#ifndef CONFIG_NUMA
#define drain_alien_cache(cachep, alien) do { } while (0)
@@ -767,7 +777,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, ac, page_node, &list);
}
- ac->entry[ac->avail++] = objp;
+ __free_one(ac, objp);
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
@@ -1050,7 +1060,7 @@ int slab_prepare_cpu(unsigned int cpu)
* offline.
*
* Even if all the cpus of a node are down, we don't free the
- * kmem_list3 of any cache. This to avoid a race between cpu_down, and
+ * kmem_cache_node of any cache. This to avoid a race between cpu_down, and
* a kmalloc allocation from another cpu for memory from the node of
* the cpu going down. The list3 structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
@@ -1239,7 +1249,6 @@ void __init kmem_cache_init(void)
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
- memcg_link_cache(kmem_cache, NULL);
slab_state = PARTIAL;
/*
@@ -1370,11 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- if (charge_slab_page(page, flags, cachep->gfporder, cachep)) {
- __free_pages(page, cachep->gfporder);
- return NULL;
- }
-
+ account_slab_page(page, cachep->gfporder, cachep);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
@@ -1398,7 +1403,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- uncharge_slab_page(page, order, cachep);
+ unaccount_slab_page(page, order, cachep);
__free_pages(page, order);
}
@@ -2243,17 +2248,6 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
return (ret ? 1 : 0);
}
-#ifdef CONFIG_MEMCG
-void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
-{
- __kmem_cache_shrink(cachep);
-}
-
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
-}
-#endif
-
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
return __kmem_cache_shrink(cachep);
@@ -2579,13 +2573,9 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
- flags &= ~GFP_SLAB_BUG_MASK;
- pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
- invalid_mask, &invalid_mask, flags, &flags);
- dump_stack();
- }
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
@@ -3222,9 +3212,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
unsigned long save_flags;
void *ptr;
int slab_node = numa_mem_id();
+ struct obj_cgroup *objcg = NULL;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, flags);
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
@@ -3260,7 +3251,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
memset(ptr, 0, cachep->object_size);
- slab_post_alloc_hook(cachep, flags, 1, &ptr);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
return ptr;
}
@@ -3301,9 +3292,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
+ struct obj_cgroup *objcg = NULL;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, flags);
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
@@ -3317,7 +3309,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
memset(objp, 0, cachep->object_size);
- slab_post_alloc_hook(cachep, flags, 1, &objp);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
return objp;
}
@@ -3426,6 +3418,11 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
if (kasan_slab_free(cachep, objp, _RET_IP_))
return;
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(objp, cachep->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
___cache_free(cachep, objp, caller);
}
@@ -3439,6 +3436,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
+ memcg_slab_free_hook(cachep, virt_to_head_page(objp), objp);
/*
* Skip calling cache_free_alien() when the platform is not numa.
@@ -3466,7 +3464,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
}
}
- ac->entry[ac->avail++] = objp;
+ __free_one(ac, objp);
}
/**
@@ -3504,8 +3502,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
size_t i;
+ struct obj_cgroup *objcg = NULL;
- s = slab_pre_alloc_hook(s, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (!s)
return 0;
@@ -3528,13 +3527,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
for (i = 0; i < size; i++)
memset(p[i], 0, s->object_size);
- slab_post_alloc_hook(s, flags, size, p);
+ slab_post_alloc_hook(s, objcg, flags, size, p);
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3796,8 +3795,8 @@ fail:
}
/* Always called with the slab_mutex held */
-static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared, gfp_t gfp)
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared, gfp_t gfp)
{
struct array_cache __percpu *cpu_cache, *prev;
int cpu;
@@ -3842,29 +3841,6 @@ setup_node:
return setup_kmem_cache_nodes(cachep, gfp);
}
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared, gfp_t gfp)
-{
- int ret;
- struct kmem_cache *c;
-
- ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
-
- if (slab_state < FULL)
- return ret;
-
- if ((ret < 0) || !is_root_cache(cachep))
- return ret;
-
- lockdep_assert_held(&slab_mutex);
- for_each_memcg_cache(c, cachep) {
- /* return value determined by the root cache only */
- __do_tune_cpucache(c, limit, batchcount, shared, gfp);
- }
-
- return ret;
-}
-
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
@@ -3877,13 +3853,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (err)
goto end;
- if (!is_root_cache(cachep)) {
- struct kmem_cache *root = memcg_root_cache(cachep);
- limit = root->limit;
- shared = root->shared;
- batchcount = root->batchcount;
- }
-
if (limit && shared && batchcount)
goto skip_setup;
/*
diff --git a/mm/slab.h b/mm/slab.h
index 74f7e09a7cfd..6cc323f1313a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,69 +30,6 @@ struct kmem_cache {
struct list_head list; /* List of all slab caches on the system */
};
-#else /* !CONFIG_SLOB */
-
-struct memcg_cache_array {
- struct rcu_head rcu;
- struct kmem_cache *entries[0];
-};
-
-/*
- * This is the main placeholder for memcg-related information in kmem caches.
- * Both the root cache and the child caches will have it. For the root cache,
- * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system. To allow the
- * array to be accessed without taking any locks, on relocation we free the old
- * version only after a grace period.
- *
- * Root and child caches hold different metadata.
- *
- * @root_cache: Common to root and child caches. NULL for root, pointer to
- * the root cache for children.
- *
- * The following fields are specific to root caches.
- *
- * @memcg_caches: kmemcg ID indexed table of child caches. This table is
- * used to index child cachces during allocation and cleared
- * early during shutdown.
- *
- * @root_caches_node: List node for slab_root_caches list.
- *
- * @children: List of all child caches. While the child caches are also
- * reachable through @memcg_caches, a child cache remains on
- * this list until it is actually destroyed.
- *
- * The following fields are specific to child caches.
- *
- * @memcg: Pointer to the memcg this cache belongs to.
- *
- * @children_node: List node for @root_cache->children list.
- *
- * @kmem_caches_node: List node for @memcg->kmem_caches list.
- */
-struct memcg_cache_params {
- struct kmem_cache *root_cache;
- union {
- struct {
- struct memcg_cache_array __rcu *memcg_caches;
- struct list_head __root_caches_node;
- struct list_head children;
- bool dying;
- };
- struct {
- struct mem_cgroup *memcg;
- struct list_head children_node;
- struct list_head kmem_caches_node;
- struct percpu_ref refcnt;
-
- void (*work_fn)(struct kmem_cache *);
- union {
- struct rcu_head rcu_head;
- struct work_struct work;
- };
- };
- };
-};
#endif /* CONFIG_SLOB */
#ifdef CONFIG_SLAB
@@ -109,6 +46,7 @@ struct memcg_cache_params {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
+#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -152,6 +90,7 @@ void create_kmalloc_caches(slab_flags_t);
struct kmem_cache *kmalloc_slab(size_t, gfp_t);
#endif
+gfp_t kmalloc_fix_flags(gfp_t flags);
/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
@@ -234,10 +173,7 @@ bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
-void __kmemcg_cache_deactivate(struct kmem_cache *s);
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
-void kmem_cache_shrink_all(struct kmem_cache *s);
struct seq_file;
struct file;
@@ -272,199 +208,208 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
static inline int cache_vmstat_idx(struct kmem_cache *s)
{
return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE;
+ NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
}
-#ifdef CONFIG_MEMCG_KMEM
-
-/* List of all root caches. */
-extern struct list_head slab_root_caches;
-#define root_caches_node memcg_params.__root_caches_node
+#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG_ON
+DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
+#else
+DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
+#endif
+extern void print_tracking(struct kmem_cache *s, void *object);
+#else
+static inline void print_tracking(struct kmem_cache *s, void *object)
+{
+}
+#endif
/*
- * Iterate over all memcg caches of the given root cache. The caller must hold
- * slab_mutex.
+ * Returns true if any of the specified slub_debug flags is enabled for the
+ * cache. Use only for flags parsed by setup_slub_debug() as it also enables
+ * the static key.
*/
-#define for_each_memcg_cache(iter, root) \
- list_for_each_entry(iter, &(root)->memcg_params.children, \
- memcg_params.children_node)
-
-static inline bool is_root_cache(struct kmem_cache *s)
+static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
{
- return !s->memcg_params.root_cache;
+#ifdef CONFIG_SLUB_DEBUG
+ VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
+ if (static_branch_unlikely(&slub_debug_enabled))
+ return s->flags & flags;
+#endif
+ return false;
}
-static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
+#ifdef CONFIG_MEMCG_KMEM
+static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
{
- return p == s || p == s->memcg_params.root_cache;
+ /*
+ * page->mem_cgroup and page->obj_cgroups are sharing the same
+ * space. To distinguish between them in case we don't know for sure
+ * that the page is a slab page (e.g. page_cgroup_ino()), let's
+ * always set the lowest bit of obj_cgroups.
+ */
+ return (struct obj_cgroup **)
+ ((unsigned long)page->obj_cgroups & ~0x1UL);
}
-/*
- * We use suffixes to the name in memcg because we can't have caches
- * created in the system with the same name. But when we print them
- * locally, better refer to them with the base name
- */
-static inline const char *cache_name(struct kmem_cache *s)
+static inline bool page_has_obj_cgroups(struct page *page)
{
- if (!is_root_cache(s))
- s = s->memcg_params.root_cache;
- return s->name;
+ return ((unsigned long)page->obj_cgroups & 0x1UL);
}
-static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp);
+
+static inline void memcg_free_page_obj_cgroups(struct page *page)
{
- if (is_root_cache(s))
- return s;
- return s->memcg_params.root_cache;
+ kfree(page_obj_cgroups(page));
+ page->obj_cgroups = NULL;
}
-/*
- * Expects a pointer to a slab page. Please note, that PageSlab() check
- * isn't sufficient, as it returns true also for tail compound slab pages,
- * which do not have slab_cache pointer set.
- * So this function assumes that the page can pass PageSlab() && !PageTail()
- * check.
- *
- * The kmem_cache can be reparented asynchronously. The caller must ensure
- * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
- */
-static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+static inline size_t obj_full_size(struct kmem_cache *s)
{
- struct kmem_cache *s;
-
- s = READ_ONCE(page->slab_cache);
- if (s && !is_root_cache(s))
- return READ_ONCE(s->memcg_params.memcg);
-
- return NULL;
+ /*
+ * For each accounted object there is an extra space which is used
+ * to store obj_cgroup membership. Charge it too.
+ */
+ return s->size + sizeof(struct obj_cgroup *);
}
-/*
- * Charge the slab page belonging to the non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline int memcg_charge_slab(struct page *page,
- gfp_t gfp, int order,
- struct kmem_cache *s)
+static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ size_t objects,
+ gfp_t flags)
{
- int nr_pages = 1 << order;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
- int ret;
-
- rcu_read_lock();
- memcg = READ_ONCE(s->memcg_params.memcg);
- while (memcg && !css_tryget_online(&memcg->css))
- memcg = parent_mem_cgroup(memcg);
- rcu_read_unlock();
+ struct obj_cgroup *objcg;
- if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- nr_pages);
- percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
- return 0;
- }
+ if (memcg_kmem_bypass())
+ return NULL;
- ret = memcg_kmem_charge(memcg, gfp, nr_pages);
- if (ret)
- goto out;
+ objcg = get_obj_cgroup_from_current();
+ if (!objcg)
+ return NULL;
- lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages);
+ if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
+ obj_cgroup_put(objcg);
+ return NULL;
+ }
- /* transer try_charge() page references to kmem_cache */
- percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
- css_put_many(&memcg->css, nr_pages);
-out:
- css_put(&memcg->css);
- return ret;
+ return objcg;
}
-/*
- * Uncharge a slab page belonging to a non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline void memcg_uncharge_slab(struct page *page, int order,
- struct kmem_cache *s)
+static inline void mod_objcg_state(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ int idx, int nr)
{
- int nr_pages = 1 << order;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
- memcg = READ_ONCE(s->memcg_params.memcg);
- if (likely(!mem_cgroup_is_root(memcg))) {
- lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages);
- memcg_kmem_uncharge(memcg, nr_pages);
- } else {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -nr_pages);
- }
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
rcu_read_unlock();
+}
+
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
+{
+ struct page *page;
+ unsigned long off;
+ size_t i;
+
+ if (!objcg)
+ return;
- percpu_ref_put_many(&s->memcg_params.refcnt, nr_pages);
+ flags &= ~__GFP_ACCOUNT;
+ for (i = 0; i < size; i++) {
+ if (likely(p[i])) {
+ page = virt_to_head_page(p[i]);
+
+ if (!page_has_obj_cgroups(page) &&
+ memcg_alloc_page_obj_cgroups(page, s, flags)) {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ continue;
+ }
+
+ off = obj_to_index(s, page, p[i]);
+ obj_cgroup_get(objcg);
+ page_obj_cgroups(page)[off] = objcg;
+ mod_objcg_state(objcg, page_pgdat(page),
+ cache_vmstat_idx(s), obj_full_size(s));
+ } else {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ }
+ }
+ obj_cgroup_put(objcg);
}
-extern void slab_init_memcg_params(struct kmem_cache *);
-extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
+ void *p)
+{
+ struct obj_cgroup *objcg;
+ unsigned int off;
-#else /* CONFIG_MEMCG_KMEM */
+ if (!memcg_kmem_enabled())
+ return;
-/* If !memcg, all caches are root. */
-#define slab_root_caches slab_caches
-#define root_caches_node list
+ if (!page_has_obj_cgroups(page))
+ return;
-#define for_each_memcg_cache(iter, root) \
- for ((void)(iter), (void)(root); 0; )
+ off = obj_to_index(s, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ page_obj_cgroups(page)[off] = NULL;
-static inline bool is_root_cache(struct kmem_cache *s)
-{
- return true;
-}
+ if (!objcg)
+ return;
-static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
-{
- return s == p;
-}
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ -obj_full_size(s));
-static inline const char *cache_name(struct kmem_cache *s)
-{
- return s->name;
+ obj_cgroup_put(objcg);
}
-static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+#else /* CONFIG_MEMCG_KMEM */
+static inline bool page_has_obj_cgroups(struct page *page)
{
- return s;
+ return false;
}
-static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
return NULL;
}
-static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
- struct kmem_cache *s)
+static inline int memcg_alloc_page_obj_cgroups(struct page *page,
+ struct kmem_cache *s, gfp_t gfp)
{
return 0;
}
-static inline void memcg_uncharge_slab(struct page *page, int order,
- struct kmem_cache *s)
+static inline void memcg_free_page_obj_cgroups(struct page *page)
{
}
-static inline void slab_init_memcg_params(struct kmem_cache *s)
+static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ size_t objects,
+ gfp_t flags)
{
+ return NULL;
}
-static inline void memcg_link_cache(struct kmem_cache *s,
- struct mem_cgroup *memcg)
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
{
}
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
+ void *p)
+{
+}
#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *virt_to_cache(const void *obj)
@@ -478,51 +423,36 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
return page->slab_cache;
}
-static __always_inline int charge_slab_page(struct page *page,
- gfp_t gfp, int order,
- struct kmem_cache *s)
+static __always_inline void account_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
{
- if (is_root_cache(s)) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- 1 << order);
- return 0;
- }
-
- return memcg_charge_slab(page, gfp, order, s);
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ PAGE_SIZE << order);
}
-static __always_inline void uncharge_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+static __always_inline void unaccount_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
{
- if (is_root_cache(s)) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -(1 << order));
- return;
- }
+ if (memcg_kmem_enabled())
+ memcg_free_page_obj_cgroups(page);
- memcg_uncharge_slab(page, order, s);
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ -(PAGE_SIZE << order));
}
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
struct kmem_cache *cachep;
- /*
- * When kmemcg is not being used, both assignments should return the
- * same value. but we don't want to pay the assignment price in that
- * case. If it is not compiled in, the compiler should be smart enough
- * to not do even the assignment. In that case, slab_equal_or_root
- * will also be a constant.
- */
- if (!memcg_kmem_enabled() &&
- !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
- !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
+ if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+ !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
return s;
cachep = virt_to_cache(x);
- WARN_ONCE(cachep && !slab_equal_or_root(cachep, s),
+ if (WARN(cachep && cachep != s,
"%s: Wrong slab cache. %s but object is from %s\n",
- __func__, s->name, cachep->name);
+ __func__, s->name, cachep->name))
+ print_tracking(cachep, x);
return cachep;
}
@@ -557,7 +487,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
}
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- gfp_t flags)
+ struct obj_cgroup **objcgp,
+ size_t size, gfp_t flags)
{
flags &= gfp_allowed_mask;
@@ -571,13 +502,14 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
if (memcg_kmem_enabled() &&
((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
- return memcg_kmem_get_cache(s);
+ *objcgp = memcg_slab_pre_alloc_hook(s, size, flags);
return s;
}
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p)
+static inline void slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size, void **p)
{
size_t i;
@@ -590,7 +522,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
}
if (memcg_kmem_enabled())
- memcg_kmem_put_cache(s);
+ memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
}
#ifndef CONFIG_SLOB
@@ -645,9 +577,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
void *slab_start(struct seq_file *m, loff_t *pos);
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
-void *memcg_slab_start(struct seq_file *m, loff_t *pos);
-void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
-void memcg_slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index fe8b68482670..f9ccd5dc13f3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,6 +26,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>
+#include "internal.h"
+
#include "slab.h"
enum slab_state slab_state;
@@ -128,152 +130,6 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
return i;
}
-#ifdef CONFIG_MEMCG_KMEM
-
-LIST_HEAD(slab_root_caches);
-static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
-
-static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
-
-void slab_init_memcg_params(struct kmem_cache *s)
-{
- s->memcg_params.root_cache = NULL;
- RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
- INIT_LIST_HEAD(&s->memcg_params.children);
- s->memcg_params.dying = false;
-}
-
-static int init_memcg_params(struct kmem_cache *s,
- struct kmem_cache *root_cache)
-{
- struct memcg_cache_array *arr;
-
- if (root_cache) {
- int ret = percpu_ref_init(&s->memcg_params.refcnt,
- kmemcg_cache_shutdown,
- 0, GFP_KERNEL);
- if (ret)
- return ret;
-
- s->memcg_params.root_cache = root_cache;
- INIT_LIST_HEAD(&s->memcg_params.children_node);
- INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
- return 0;
- }
-
- slab_init_memcg_params(s);
-
- if (!memcg_nr_cache_ids)
- return 0;
-
- arr = kvzalloc(sizeof(struct memcg_cache_array) +
- memcg_nr_cache_ids * sizeof(void *),
- GFP_KERNEL);
- if (!arr)
- return -ENOMEM;
-
- RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
- return 0;
-}
-
-static void destroy_memcg_params(struct kmem_cache *s)
-{
- if (is_root_cache(s)) {
- kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
- } else {
- mem_cgroup_put(s->memcg_params.memcg);
- WRITE_ONCE(s->memcg_params.memcg, NULL);
- percpu_ref_exit(&s->memcg_params.refcnt);
- }
-}
-
-static void free_memcg_params(struct rcu_head *rcu)
-{
- struct memcg_cache_array *old;
-
- old = container_of(rcu, struct memcg_cache_array, rcu);
- kvfree(old);
-}
-
-static int update_memcg_params(struct kmem_cache *s, int new_array_size)
-{
- struct memcg_cache_array *old, *new;
-
- new = kvzalloc(sizeof(struct memcg_cache_array) +
- new_array_size * sizeof(void *), GFP_KERNEL);
- if (!new)
- return -ENOMEM;
-
- old = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- if (old)
- memcpy(new->entries, old->entries,
- memcg_nr_cache_ids * sizeof(void *));
-
- rcu_assign_pointer(s->memcg_params.memcg_caches, new);
- if (old)
- call_rcu(&old->rcu, free_memcg_params);
- return 0;
-}
-
-int memcg_update_all_caches(int num_memcgs)
-{
- struct kmem_cache *s;
- int ret = 0;
-
- mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- ret = update_memcg_params(s, num_memcgs);
- /*
- * Instead of freeing the memory, we'll just leave the caches
- * up to this point in an updated state.
- */
- if (ret)
- break;
- }
- mutex_unlock(&slab_mutex);
- return ret;
-}
-
-void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
-{
- if (is_root_cache(s)) {
- list_add(&s->root_caches_node, &slab_root_caches);
- } else {
- css_get(&memcg->css);
- s->memcg_params.memcg = memcg;
- list_add(&s->memcg_params.children_node,
- &s->memcg_params.root_cache->memcg_params.children);
- list_add(&s->memcg_params.kmem_caches_node,
- &s->memcg_params.memcg->kmem_caches);
- }
-}
-
-static void memcg_unlink_cache(struct kmem_cache *s)
-{
- if (is_root_cache(s)) {
- list_del(&s->root_caches_node);
- } else {
- list_del(&s->memcg_params.children_node);
- list_del(&s->memcg_params.kmem_caches_node);
- }
-}
-#else
-static inline int init_memcg_params(struct kmem_cache *s,
- struct kmem_cache *root_cache)
-{
- return 0;
-}
-
-static inline void destroy_memcg_params(struct kmem_cache *s)
-{
-}
-
-static inline void memcg_unlink_cache(struct kmem_cache *s)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
/*
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
@@ -311,9 +167,6 @@ int slab_unmergeable(struct kmem_cache *s)
if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
return 1;
- if (!is_root_cache(s))
- return 1;
-
if (s->ctor)
return 1;
@@ -326,14 +179,6 @@ int slab_unmergeable(struct kmem_cache *s)
if (s->refcount < 0)
return 1;
-#ifdef CONFIG_MEMCG_KMEM
- /*
- * Skip the dying kmem_cache.
- */
- if (s->memcg_params.dying)
- return 1;
-#endif
-
return 0;
}
@@ -356,7 +201,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
if (flags & SLAB_NEVER_MERGE)
return NULL;
- list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
+ list_for_each_entry_reverse(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;
@@ -388,7 +233,7 @@ static struct kmem_cache *create_cache(const char *name,
unsigned int object_size, unsigned int align,
slab_flags_t flags, unsigned int useroffset,
unsigned int usersize, void (*ctor)(void *),
- struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+ struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
@@ -408,24 +253,18 @@ static struct kmem_cache *create_cache(const char *name,
s->useroffset = useroffset;
s->usersize = usersize;
- err = init_memcg_params(s, root_cache);
- if (err)
- goto out_free_cache;
-
err = __kmem_cache_create(s, flags);
if (err)
goto out_free_cache;
s->refcount = 1;
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, memcg);
out:
if (err)
return ERR_PTR(err);
return s;
out_free_cache:
- destroy_memcg_params(s);
kmem_cache_free(kmem_cache, s);
goto out;
}
@@ -471,7 +310,6 @@ kmem_cache_create_usercopy(const char *name,
get_online_cpus();
get_online_mems();
- memcg_get_cache_ids();
mutex_lock(&slab_mutex);
@@ -512,7 +350,7 @@ kmem_cache_create_usercopy(const char *name,
s = create_cache(cache_name, size,
calculate_alignment(flags, align, size),
- flags, useroffset, usersize, ctor, NULL, NULL);
+ flags, useroffset, usersize, ctor, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
@@ -521,7 +359,6 @@ kmem_cache_create_usercopy(const char *name,
out_unlock:
mutex_unlock(&slab_mutex);
- memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
@@ -582,7 +419,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
/*
* On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
* @slab_caches_to_rcu_destroy list. The slab pages are freed
- * through RCU and and the associated kmem_cache are dereferenced
+ * through RCU and the associated kmem_cache are dereferenced
* while freeing the pages, so the kmem_caches should be freed only
* after the pending RCU operations are finished. As rcu_barrier()
* is a pretty slow operation, we batch all pending destructions
@@ -614,7 +451,6 @@ static int shutdown_cache(struct kmem_cache *s)
if (__kmem_cache_shutdown(s) != 0)
return -EBUSY;
- memcg_unlink_cache(s);
list_del(&s->list);
if (s->flags & SLAB_TYPESAFE_BY_RCU) {
@@ -635,311 +471,9 @@ static int shutdown_cache(struct kmem_cache *s)
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
-/*
- * memcg_create_kmem_cache - Create a cache for a memory cgroup.
- * @memcg: The memory cgroup the new cache is for.
- * @root_cache: The parent of the new cache.
- *
- * This function attempts to create a kmem cache that will serve allocation
- * requests going from @memcg to @root_cache. The new cache inherits properties
- * from its parent.
- */
-void memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache)
-{
- static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- struct cgroup_subsys_state *css = &memcg->css;
- struct memcg_cache_array *arr;
- struct kmem_cache *s = NULL;
- char *cache_name;
- int idx;
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
-
- /*
- * The memory cgroup could have been offlined while the cache
- * creation work was pending.
- */
- if (memcg->kmem_state != KMEM_ONLINE)
- goto out_unlock;
-
- idx = memcg_cache_id(memcg);
- arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
-
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can try to
- * create the same cache, but only one of them may succeed.
- */
- if (arr->entries[idx])
- goto out_unlock;
-
- cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
- cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
- css->serial_nr, memcg_name_buf);
- if (!cache_name)
- goto out_unlock;
-
- s = create_cache(cache_name, root_cache->object_size,
- root_cache->align,
- root_cache->flags & CACHE_CREATE_MASK,
- root_cache->useroffset, root_cache->usersize,
- root_cache->ctor, memcg, root_cache);
- /*
- * If we could not create a memcg cache, do not complain, because
- * that's not critical at all as we can always proceed with the root
- * cache.
- */
- if (IS_ERR(s)) {
- kfree(cache_name);
- goto out_unlock;
- }
-
- /*
- * Since readers won't lock (see memcg_kmem_get_cache()), we need a
- * barrier here to ensure nobody will see the kmem_cache partially
- * initialized.
- */
- smp_wmb();
- arr->entries[idx] = s;
-
-out_unlock:
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static void kmemcg_workfn(struct work_struct *work)
-{
- struct kmem_cache *s = container_of(work, struct kmem_cache,
- memcg_params.work);
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
- s->memcg_params.work_fn(s);
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static void kmemcg_rcufn(struct rcu_head *head)
-{
- struct kmem_cache *s = container_of(head, struct kmem_cache,
- memcg_params.rcu_head);
-
- /*
- * We need to grab blocking locks. Bounce to ->work. The
- * work item shares the space with the RCU head and can't be
- * initialized earlier.
- */
- INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
- queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
-}
-
-static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
-{
- WARN_ON(shutdown_cache(s));
-}
-
-static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
-{
- struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
- memcg_params.refcnt);
- unsigned long flags;
-
- spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
- if (s->memcg_params.root_cache->memcg_params.dying)
- goto unlock;
-
- s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
- INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
- queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
-
-unlock:
- spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
-}
-
-static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
- __kmemcg_cache_deactivate_after_rcu(s);
- percpu_ref_kill(&s->memcg_params.refcnt);
-}
-
-static void kmemcg_cache_deactivate(struct kmem_cache *s)
-{
- if (WARN_ON_ONCE(is_root_cache(s)))
- return;
-
- __kmemcg_cache_deactivate(s);
- s->flags |= SLAB_DEACTIVATED;
-
- /*
- * memcg_kmem_wq_lock is used to synchronize memcg_params.dying
- * flag and make sure that no new kmem_cache deactivation tasks
- * are queued (see flush_memcg_workqueue() ).
- */
- spin_lock_irq(&memcg_kmem_wq_lock);
- if (s->memcg_params.root_cache->memcg_params.dying)
- goto unlock;
-
- s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
- call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
-unlock:
- spin_unlock_irq(&memcg_kmem_wq_lock);
-}
-
-void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
-{
- int idx;
- struct memcg_cache_array *arr;
- struct kmem_cache *s, *c;
- unsigned int nr_reparented;
-
- idx = memcg_cache_id(memcg);
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- c = arr->entries[idx];
- if (!c)
- continue;
-
- kmemcg_cache_deactivate(c);
- arr->entries[idx] = NULL;
- }
- nr_reparented = 0;
- list_for_each_entry(s, &memcg->kmem_caches,
- memcg_params.kmem_caches_node) {
- WRITE_ONCE(s->memcg_params.memcg, parent);
- css_put(&memcg->css);
- nr_reparented++;
- }
- if (nr_reparented) {
- list_splice_init(&memcg->kmem_caches,
- &parent->kmem_caches);
- css_get_many(&parent->css, nr_reparented);
- }
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static int shutdown_memcg_caches(struct kmem_cache *s)
-{
- struct memcg_cache_array *arr;
- struct kmem_cache *c, *c2;
- LIST_HEAD(busy);
- int i;
-
- BUG_ON(!is_root_cache(s));
-
- /*
- * First, shutdown active caches, i.e. caches that belong to online
- * memory cgroups.
- */
- arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- for_each_memcg_cache_index(i) {
- c = arr->entries[i];
- if (!c)
- continue;
- if (shutdown_cache(c))
- /*
- * The cache still has objects. Move it to a temporary
- * list so as not to try to destroy it for a second
- * time while iterating over inactive caches below.
- */
- list_move(&c->memcg_params.children_node, &busy);
- else
- /*
- * The cache is empty and will be destroyed soon. Clear
- * the pointer to it in the memcg_caches array so that
- * it will never be accessed even if the root cache
- * stays alive.
- */
- arr->entries[i] = NULL;
- }
-
- /*
- * Second, shutdown all caches left from memory cgroups that are now
- * offline.
- */
- list_for_each_entry_safe(c, c2, &s->memcg_params.children,
- memcg_params.children_node)
- shutdown_cache(c);
-
- list_splice(&busy, &s->memcg_params.children);
-
- /*
- * A cache being destroyed must be empty. In particular, this means
- * that all per memcg caches attached to it must be empty too.
- */
- if (!list_empty(&s->memcg_params.children))
- return -EBUSY;
- return 0;
-}
-
-static void memcg_set_kmem_cache_dying(struct kmem_cache *s)
-{
- spin_lock_irq(&memcg_kmem_wq_lock);
- s->memcg_params.dying = true;
- spin_unlock_irq(&memcg_kmem_wq_lock);
-}
-
-static void flush_memcg_workqueue(struct kmem_cache *s)
-{
- /*
- * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
- * sure all registered rcu callbacks have been invoked.
- */
- rcu_barrier();
-
- /*
- * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
- * deactivates the memcg kmem_caches through workqueue. Make sure all
- * previous workitems on workqueue are processed.
- */
- if (likely(memcg_kmem_cache_wq))
- flush_workqueue(memcg_kmem_cache_wq);
-
- /*
- * If we're racing with children kmem_cache deactivation, it might
- * take another rcu grace period to complete their destruction.
- * At this moment the corresponding percpu_ref_kill() call should be
- * done, but it might take another rcu grace period to complete
- * switching to the atomic mode.
- * Please, note that we check without grabbing the slab_mutex. It's safe
- * because at this moment the children list can't grow.
- */
- if (!list_empty(&s->memcg_params.children))
- rcu_barrier();
-}
-#else
-static inline int shutdown_memcg_caches(struct kmem_cache *s)
-{
- return 0;
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
void slab_kmem_cache_release(struct kmem_cache *s)
{
__kmem_cache_release(s);
- destroy_memcg_params(s);
kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
}
@@ -960,26 +494,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
-#ifdef CONFIG_MEMCG_KMEM
- memcg_set_kmem_cache_dying(s);
-
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-
- flush_memcg_workqueue(s);
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
-#endif
-
- err = shutdown_memcg_caches(s);
- if (!err)
- err = shutdown_cache(s);
-
+ err = shutdown_cache(s);
if (err) {
pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
s->name);
@@ -1016,43 +531,6 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-/**
- * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
- * @s: The cache pointer
- */
-void kmem_cache_shrink_all(struct kmem_cache *s)
-{
- struct kmem_cache *c;
-
- if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
- kmem_cache_shrink(s);
- return;
- }
-
- get_online_cpus();
- get_online_mems();
- kasan_cache_shrink(s);
- __kmem_cache_shrink(s);
-
- /*
- * We have to take the slab_mutex to protect from the memcg list
- * modification.
- */
- mutex_lock(&slab_mutex);
- for_each_memcg_cache(c, s) {
- /*
- * Don't need to shrink deactivated memcg caches.
- */
- if (s->flags & SLAB_DEACTIVATED)
- continue;
- kasan_cache_shrink(c);
- __kmem_cache_shrink(c);
- }
- mutex_unlock(&slab_mutex);
- put_online_mems();
- put_online_cpus();
-}
-
bool slab_is_available(void)
{
return slab_state >= UP;
@@ -1081,8 +559,6 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
s->useroffset = useroffset;
s->usersize = usersize;
- slab_init_memcg_params(s);
-
err = __kmem_cache_create(s, flags);
if (err)
@@ -1103,7 +579,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
create_boot_cache(s, name, size, flags, useroffset, usersize);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, NULL);
s->refcount = 1;
return s;
}
@@ -1332,6 +807,18 @@ void __init create_kmalloc_caches(slab_flags_t flags)
}
#endif /* !CONFIG_SLOB */
+gfp_t kmalloc_fix_flags(gfp_t flags)
+{
+ gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+ flags &= ~GFP_SLAB_BUG_MASK;
+ pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
+ dump_stack();
+
+ return flags;
+}
+
/*
* To avoid unnecessary overhead, we pass through large allocation requests
* directly to the page allocator. We use __GFP_COMP, because we will need to
@@ -1342,12 +829,15 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
void *ret = NULL;
struct page *page;
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
if (likely(page)) {
ret = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
@@ -1444,12 +934,12 @@ static void print_slabinfo_header(struct seq_file *m)
void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
- return seq_list_start(&slab_root_caches, *pos);
+ return seq_list_start(&slab_caches, *pos);
}
void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &slab_root_caches, pos);
+ return seq_list_next(p, &slab_caches, pos);
}
void slab_stop(struct seq_file *m, void *p)
@@ -1457,27 +947,6 @@ void slab_stop(struct seq_file *m, void *p)
mutex_unlock(&slab_mutex);
}
-static void
-memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
-{
- struct kmem_cache *c;
- struct slabinfo sinfo;
-
- if (!is_root_cache(s))
- return;
-
- for_each_memcg_cache(c, s) {
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(c, &sinfo);
-
- info->active_slabs += sinfo.active_slabs;
- info->num_slabs += sinfo.num_slabs;
- info->shared_avail += sinfo.shared_avail;
- info->active_objs += sinfo.active_objs;
- info->num_objs += sinfo.num_objs;
- }
-}
-
static void cache_show(struct kmem_cache *s, struct seq_file *m)
{
struct slabinfo sinfo;
@@ -1485,10 +954,8 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(s, &sinfo);
- memcg_accumulate_slabinfo(s, &sinfo);
-
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
- cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
+ s->name, sinfo.active_objs, sinfo.num_objs, s->size,
sinfo.objects_per_slab, (1 << sinfo.cache_order));
seq_printf(m, " : tunables %4u %4u %4u",
@@ -1501,9 +968,9 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
static int slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node);
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
- if (p == slab_root_caches.next)
+ if (p == slab_caches.next)
print_slabinfo_header(m);
cache_show(s, m);
return 0;
@@ -1530,13 +997,13 @@ void dump_unreclaimable_slab(void)
pr_info("Name Used Total\n");
list_for_each_entry_safe(s, s2, &slab_caches, list) {
- if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT))
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
continue;
get_slabinfo(s, &sinfo);
if (sinfo.num_objs > 0)
- pr_info("%-17s %10luKB %10luKB\n", cache_name(s),
+ pr_info("%-17s %10luKB %10luKB\n", s->name,
(sinfo.active_objs * s->size) / 1024,
(sinfo.num_objs * s->size) / 1024);
}
@@ -1544,35 +1011,12 @@ void dump_unreclaimable_slab(void)
}
#if defined(CONFIG_MEMCG_KMEM)
-void *memcg_slab_start(struct seq_file *m, loff_t *pos)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- mutex_lock(&slab_mutex);
- return seq_list_start(&memcg->kmem_caches, *pos);
-}
-
-void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- return seq_list_next(p, &memcg->kmem_caches, pos);
-}
-
-void memcg_slab_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&slab_mutex);
-}
-
int memcg_slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache,
- memcg_params.kmem_caches_node);
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- if (p == memcg->kmem_caches.next)
- print_slabinfo_header(m);
- cache_show(s, m);
+ /*
+ * Deprecated.
+ * Please, take a look at tools/cgroup/slabinfo.py .
+ */
return 0;
}
#endif
@@ -1618,73 +1062,15 @@ static int __init slab_proc_init(void)
}
module_init(slab_proc_init);
-#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM)
-/*
- * Display information about kmem caches that have child memcg caches.
- */
-static int memcg_slabinfo_show(struct seq_file *m, void *unused)
-{
- struct kmem_cache *s, *c;
- struct slabinfo sinfo;
-
- mutex_lock(&slab_mutex);
- seq_puts(m, "# <name> <css_id[:dead|deact]> <active_objs> <num_objs>");
- seq_puts(m, " <active_slabs> <num_slabs>\n");
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- /*
- * Skip kmem caches that don't have any memcg children.
- */
- if (list_empty(&s->memcg_params.children))
- continue;
-
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(s, &sinfo);
- seq_printf(m, "%-17s root %6lu %6lu %6lu %6lu\n",
- cache_name(s), sinfo.active_objs, sinfo.num_objs,
- sinfo.active_slabs, sinfo.num_slabs);
-
- for_each_memcg_cache(c, s) {
- struct cgroup_subsys_state *css;
- char *status = "";
-
- css = &c->memcg_params.memcg->css;
- if (!(css->flags & CSS_ONLINE))
- status = ":dead";
- else if (c->flags & SLAB_DEACTIVATED)
- status = ":deact";
-
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(c, &sinfo);
- seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n",
- cache_name(c), css->id, status,
- sinfo.active_objs, sinfo.num_objs,
- sinfo.active_slabs, sinfo.num_slabs);
- }
- }
- mutex_unlock(&slab_mutex);
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo);
-
-static int __init memcg_slabinfo_init(void)
-{
- debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO,
- NULL, NULL, &memcg_slabinfo_fops);
- return 0;
-}
-
-late_initcall(memcg_slabinfo_init);
-#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
static __always_inline void *__do_krealloc(const void *p, size_t new_size,
gfp_t flags)
{
void *ret;
- size_t ks = 0;
+ size_t ks;
- if (p)
- ks = ksize(p);
+ ks = ksize(p);
if (ks >= new_size) {
p = kasan_krealloc((void *)p, new_size, flags);
@@ -1729,28 +1115,27 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
EXPORT_SYMBOL(krealloc);
/**
- * kzfree - like kfree but zero memory
+ * kfree_sensitive - Clear sensitive information in memory before freeing
* @p: object to free memory of
*
* The memory of the object @p points to is zeroed before freed.
- * If @p is %NULL, kzfree() does nothing.
+ * If @p is %NULL, kfree_sensitive() does nothing.
*
* Note: this function zeroes the whole allocated buffer which can be a good
* deal bigger than the requested buffer size passed to kmalloc(). So be
* careful when using this function in performance sensitive code.
*/
-void kzfree(const void *p)
+void kfree_sensitive(const void *p)
{
size_t ks;
void *mem = (void *)p;
- if (unlikely(ZERO_OR_NULL_PTR(mem)))
- return;
ks = ksize(mem);
- memzero_explicit(mem, ks);
+ if (ks)
+ memzero_explicit(mem, ks);
kfree(mem);
}
-EXPORT_SYMBOL(kzfree);
+EXPORT_SYMBOL(kfree_sensitive);
/**
* ksize - get the actual amount of memory allocated for a given object
@@ -1770,8 +1155,6 @@ size_t ksize(const void *objp)
{
size_t size;
- if (WARN_ON_ONCE(!objp))
- return 0;
/*
* We need to check that the pointed to object is valid, and only then
* unpoison the shadow memory below. We use __kasan_check_read(), to
@@ -1785,7 +1168,7 @@ size_t ksize(const void *objp)
* We want to perform the check before __ksize(), to avoid potentially
* crashing in __ksize() due to accessing invalid metadata.
*/
- if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
return 0;
size = __ksize(objp);
diff --git a/mm/slob.c b/mm/slob.c
index ac2aecfbc7a8..7cc9805c8091 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -202,8 +202,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
if (!page)
return NULL;
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
return page_address(page);
}
@@ -214,8 +214,8 @@ static void slob_free_pages(void *b, int order)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(sp, order);
}
@@ -552,8 +552,8 @@ void kfree(const void *block)
slob_free(m, *m + align);
} else {
unsigned int order = compound_order(sp);
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(sp, order);
}
diff --git a/mm/slub.c b/mm/slub.c
index ef303070d175..68c02b2eecd9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -114,18 +114,22 @@
* the fast path and disables lockless freelists.
*/
-static inline int kmem_cache_debug(struct kmem_cache *s)
-{
#ifdef CONFIG_SLUB_DEBUG
- return unlikely(s->flags & SLAB_DEBUG_FLAGS);
+#ifdef CONFIG_SLUB_DEBUG_ON
+DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
- return 0;
+DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
+#endif
+
+static inline bool kmem_cache_debug(struct kmem_cache *s)
+{
+ return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
void *fixup_red_left(struct kmem_cache *s, void *p)
{
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
p += s->red_left_pad;
return p;
@@ -214,14 +218,10 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
-static void memcg_propagate_slab_attrs(struct kmem_cache *s);
-static void sysfs_slab_remove(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
-static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -313,12 +313,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
__p < (__addr) + (__objects) * (__s)->size; \
__p += (__s)->size)
-/* Determine object index from a given position */
-static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
-{
- return (kasan_reset_tag(p) - addr) / s->size;
-}
-
static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
return ((unsigned int)PAGE_SIZE << order) / size;
@@ -461,7 +455,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
bitmap_zero(object_map, page->objects);
for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(slab_index(p, s, addr), object_map);
+ set_bit(__obj_to_index(s, addr, p), object_map);
return object_map;
}
@@ -469,8 +463,6 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
static void put_map(unsigned long *map) __releases(&object_map_lock)
{
VM_BUG_ON(map != object_map);
- lockdep_assert_held(&object_map_lock);
-
spin_unlock(&object_map_lock);
}
@@ -499,7 +491,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
static slab_flags_t slub_debug;
#endif
-static char *slub_debug_slabs;
+static char *slub_debug_string;
static int disable_higher_order_debug;
/*
@@ -634,7 +626,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time)
#endif
}
-static void print_tracking(struct kmem_cache *s, void *object)
+void print_tracking(struct kmem_cache *s, void *object)
{
unsigned long pr_time = jiffies;
if (!(s->flags & SLAB_STORE_USER))
@@ -1112,7 +1104,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
static void setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
{
- if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
+ if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
return;
init_object(s, object, SLUB_RED_INACTIVE);
@@ -1122,7 +1114,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
static
void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
{
- if (!(s->flags & SLAB_POISON))
+ if (!kmem_cache_debug_flags(s, SLAB_POISON))
return;
metadata_access_enable();
@@ -1218,7 +1210,7 @@ static noinline int free_debug_processing(
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
void *object = head;
int cnt = 0;
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
int ret = 0;
spin_lock_irqsave(&n->list_lock, flags);
@@ -1262,69 +1254,135 @@ out:
return ret;
}
-static int __init setup_slub_debug(char *str)
+/*
+ * Parse a block of slub_debug options. Blocks are delimited by ';'
+ *
+ * @str: start of block
+ * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
+ * @slabs: return start of list of slabs, or NULL when there's no list
+ * @init: assume this is initial parsing and not per-kmem-create parsing
+ *
+ * returns the start of next block if there's any, or NULL
+ */
+static char *
+parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
{
- slub_debug = DEBUG_DEFAULT_FLAGS;
- if (*str++ != '=' || !*str)
- /*
- * No options specified. Switch on full debugging.
- */
- goto out;
+ bool higher_order_disable = false;
- if (*str == ',')
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (*str == ',') {
/*
* No options but restriction on slabs. This means full
* debugging for slabs matching a pattern.
*/
+ *flags = DEBUG_DEFAULT_FLAGS;
goto check_slabs;
+ }
+ *flags = 0;
- slub_debug = 0;
- if (*str == '-')
- /*
- * Switch off all debugging measures.
- */
- goto out;
-
- /*
- * Determine which debug features should be switched on
- */
- for (; *str && *str != ','; str++) {
+ /* Determine which debug features should be switched on */
+ for (; *str && *str != ',' && *str != ';'; str++) {
switch (tolower(*str)) {
+ case '-':
+ *flags = 0;
+ break;
case 'f':
- slub_debug |= SLAB_CONSISTENCY_CHECKS;
+ *flags |= SLAB_CONSISTENCY_CHECKS;
break;
case 'z':
- slub_debug |= SLAB_RED_ZONE;
+ *flags |= SLAB_RED_ZONE;
break;
case 'p':
- slub_debug |= SLAB_POISON;
+ *flags |= SLAB_POISON;
break;
case 'u':
- slub_debug |= SLAB_STORE_USER;
+ *flags |= SLAB_STORE_USER;
break;
case 't':
- slub_debug |= SLAB_TRACE;
+ *flags |= SLAB_TRACE;
break;
case 'a':
- slub_debug |= SLAB_FAILSLAB;
+ *flags |= SLAB_FAILSLAB;
break;
case 'o':
/*
* Avoid enabling debugging on caches if its minimum
* order would increase as a result.
*/
- disable_higher_order_debug = 1;
+ higher_order_disable = true;
break;
default:
- pr_err("slub_debug option '%c' unknown. skipped\n",
- *str);
+ if (init)
+ pr_err("slub_debug option '%c' unknown. skipped\n", *str);
}
}
-
check_slabs:
if (*str == ',')
- slub_debug_slabs = str + 1;
+ *slabs = ++str;
+ else
+ *slabs = NULL;
+
+ /* Skip over the slab list */
+ while (*str && *str != ';')
+ str++;
+
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (init && higher_order_disable)
+ disable_higher_order_debug = 1;
+
+ if (*str)
+ return str;
+ else
+ return NULL;
+}
+
+static int __init setup_slub_debug(char *str)
+{
+ slab_flags_t flags;
+ char *saved_str;
+ char *slab_list;
+ bool global_slub_debug_changed = false;
+ bool slab_list_specified = false;
+
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str)
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+
+ saved_str = str;
+ while (str) {
+ str = parse_slub_debug_flags(str, &flags, &slab_list, true);
+
+ if (!slab_list) {
+ slub_debug = flags;
+ global_slub_debug_changed = true;
+ } else {
+ slab_list_specified = true;
+ }
+ }
+
+ /*
+ * For backwards compatibility, a single list of flags with list of
+ * slabs means debugging is only enabled for those slabs, so the global
+ * slub_debug should be 0. We can extended that to multiple lists as
+ * long as there is no option specifying flags without a slab list.
+ */
+ if (slab_list_specified) {
+ if (!global_slub_debug_changed)
+ slub_debug = 0;
+ slub_debug_string = saved_str;
+ }
out:
+ if (slub_debug != 0 || slub_debug_string)
+ static_branch_enable(&slub_debug_enabled);
if ((static_branch_unlikely(&init_on_alloc) ||
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
@@ -1352,36 +1410,47 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
{
char *iter;
size_t len;
+ char *next_block;
+ slab_flags_t block_flags;
/* If slub_debug = 0, it folds into the if conditional. */
- if (!slub_debug_slabs)
+ if (!slub_debug_string)
return flags | slub_debug;
len = strlen(name);
- iter = slub_debug_slabs;
- while (*iter) {
- char *end, *glob;
- size_t cmplen;
-
- end = strchrnul(iter, ',');
+ next_block = slub_debug_string;
+ /* Go through all blocks of debug options, see if any matches our slab's name */
+ while (next_block) {
+ next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
+ if (!iter)
+ continue;
+ /* Found a block that has a slab list, search it */
+ while (*iter) {
+ char *end, *glob;
+ size_t cmplen;
+
+ end = strchrnul(iter, ',');
+ if (next_block && next_block < end)
+ end = next_block - 1;
+
+ glob = strnchr(iter, end - iter, '*');
+ if (glob)
+ cmplen = glob - iter;
+ else
+ cmplen = max_t(size_t, len, (end - iter));
- glob = strnchr(iter, end - iter, '*');
- if (glob)
- cmplen = glob - iter;
- else
- cmplen = max_t(size_t, len, (end - iter));
+ if (!strncmp(name, iter, cmplen)) {
+ flags |= block_flags;
+ return flags;
+ }
- if (!strncmp(name, iter, cmplen)) {
- flags |= slub_debug;
- break;
+ if (!*end || *end == ';')
+ break;
+ iter = end + 1;
}
-
- if (!*end)
- break;
- iter = end + 1;
}
- return flags;
+ return slub_debug;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
@@ -1470,6 +1539,11 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(x, s->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
/* KASAN might put x into memory quarantine, delaying its reuse */
return kasan_slab_free(s, x, _RET_IP_);
}
@@ -1546,10 +1620,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
else
page = __alloc_pages_node(node, flags, order);
- if (page && charge_slab_page(page, flags, order, s)) {
- __free_pages(page, order);
- page = NULL;
- }
+ if (page)
+ account_slab_page(page, order, s);
return page;
}
@@ -1745,13 +1817,8 @@ out:
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
- flags &= ~GFP_SLAB_BUG_MASK;
- pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
- invalid_mask, &invalid_mask, flags, &flags);
- dump_stack();
- }
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -1762,7 +1829,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
void *p;
slab_pad_check(s, page);
@@ -1777,7 +1844,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- uncharge_slab_page(page, order, s);
+ unaccount_slab_page(page, order, s);
__free_pages(page, order);
}
@@ -2744,8 +2811,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct kmem_cache_cpu *c;
struct page *page;
unsigned long tid;
+ struct obj_cgroup *objcg = NULL;
- s = slab_pre_alloc_hook(s, gfpflags);
+ s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
return NULL;
redo:
@@ -2821,7 +2889,7 @@ redo:
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
- slab_post_alloc_hook(s, gfpflags, 1, &object);
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
return object;
}
@@ -2901,7 +2969,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
stat(s, FREE_SLOWPATH);
@@ -3026,6 +3094,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
+
+ memcg_slab_free_hook(s, page, head);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3205,9 +3275,10 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
{
struct kmem_cache_cpu *c;
int i;
+ struct obj_cgroup *objcg = NULL;
/* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (unlikely(!s))
return false;
/*
@@ -3261,11 +3332,11 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}
/* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, flags, size, p);
+ slab_post_alloc_hook(s, objcg, flags, size, p);
return i;
error:
local_irq_enable();
- slab_post_alloc_hook(s, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3675,6 +3746,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
size = ALIGN(size, s->align);
s->size = size;
+ s->reciprocal_size = reciprocal_value(size);
if (forced_order >= 0)
order = forced_order;
else
@@ -3779,7 +3851,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
- if (!test_bit(slab_index(p, s, addr), map)) {
+ if (!test_bit(__obj_to_index(s, addr, p), map)) {
pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
@@ -3842,7 +3914,6 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
- sysfs_slab_remove(s);
return 0;
}
@@ -3912,8 +3983,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
page = alloc_pages_node(node, flags, order);
if (page) {
ptr = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
return kmalloc_large_node_hook(ptr, size, flags);
@@ -3980,7 +4051,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
offset = (ptr - page_address(page)) % s->size;
/* Adjust for redzone and reject if within the redzone. */
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
if (offset < s->red_left_pad)
usercopy_abort("SLUB object in left red zone",
s->name, to_user, offset, n);
@@ -4044,8 +4115,8 @@ void kfree(const void *x)
BUG_ON(!PageCompound(page));
kfree_hook(object);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(page, order);
return;
}
@@ -4126,36 +4197,6 @@ int __kmem_cache_shrink(struct kmem_cache *s)
return ret;
}
-#ifdef CONFIG_MEMCG
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
- /*
- * Called with all the locks held after a sched RCU grace period.
- * Even if @s becomes empty after shrinking, we can't know that @s
- * doesn't have allocations already in-flight and thus can't
- * destroy @s until the associated memcg is released.
- *
- * However, let's remove the sysfs files for empty caches here.
- * Each cache has a lot of interface files which aren't
- * particularly useful for empty draining caches; otherwise, we can
- * easily end up with millions of unnecessary sysfs files on
- * systems which have a lot of memory and transient cgroups.
- */
- if (!__kmem_cache_shrink(s))
- sysfs_slab_remove(s);
-}
-
-void __kmemcg_cache_deactivate(struct kmem_cache *s)
-{
- /*
- * Disable empty slabs caching. Used to avoid pinning offline
- * memory cgroups by kmem pages that can be freed.
- */
- slub_set_cpu_partial(s, 0);
- s->min_partial = 0;
-}
-#endif /* CONFIG_MEMCG */
-
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
@@ -4310,9 +4351,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
p->slab_cache = s;
#endif
}
- slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, NULL);
return s;
}
@@ -4367,7 +4406,7 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
- struct kmem_cache *s, *c;
+ struct kmem_cache *s;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
@@ -4380,11 +4419,6 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
s->object_size = max(s->object_size, size);
s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache(c, s) {
- c->object_size = s->object_size;
- c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
- }
-
if (sysfs_slab_alias(s, name)) {
s->refcount--;
s = NULL;
@@ -4406,7 +4440,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
if (slab_state <= UP)
return 0;
- memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
if (err)
__kmem_cache_release(s);
@@ -4495,7 +4528,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page)
/* Now we know that a valid freelist exists */
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
- u8 val = test_bit(slab_index(p, s, addr), map) ?
+ u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
if (!check_object(s, page, p, val))
@@ -4686,7 +4719,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
map = get_map(s, page);
for_each_object(p, s, addr, page->objects)
- if (!test_bit(slab_index(p, s, addr), map))
+ if (!test_bit(__obj_to_index(s, addr, p), map))
add_location(t, s, get_track(s, p, alloc));
put_map(map);
}
@@ -4970,20 +5003,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
return x + sprintf(buf + x, "\n");
}
-#ifdef CONFIG_SLUB_DEBUG
-static int any_slab_objects(struct kmem_cache *s)
-{
- int node;
- struct kmem_cache_node *n;
-
- for_each_kmem_cache_node(s, node, n)
- if (atomic_long_read(&n->total_objects))
- return 1;
-
- return 0;
-}
-#endif
-
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)
@@ -5025,28 +5044,11 @@ static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objs_per_slab);
-static ssize_t order_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- unsigned int order;
- int err;
-
- err = kstrtouint(buf, 10, &order);
- if (err)
- return err;
-
- if (order > slub_max_order || order < slub_min_order)
- return -EINVAL;
-
- calculate_sizes(s, order);
- return length;
-}
-
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%u\n", oo_order(s->oo));
}
-SLAB_ATTR(order);
+SLAB_ATTR_RO(order);
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
@@ -5168,16 +5170,7 @@ static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
-
-static ssize_t reclaim_account_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_RECLAIM_ACCOUNT;
- if (buf[0] == '1')
- s->flags |= SLAB_RECLAIM_ACCOUNT;
- return length;
-}
-SLAB_ATTR(reclaim_account);
+SLAB_ATTR_RO(reclaim_account);
static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
@@ -5222,104 +5215,34 @@ static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
-
-static ssize_t sanity_checks_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_CONSISTENCY_CHECKS;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_CONSISTENCY_CHECKS;
- }
- return length;
-}
-SLAB_ATTR(sanity_checks);
+SLAB_ATTR_RO(sanity_checks);
static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
-
-static ssize_t trace_store(struct kmem_cache *s, const char *buf,
- size_t length)
-{
- /*
- * Tracing a merged cache is going to give confusing results
- * as well as cause other issues like converting a mergeable
- * cache into an umergeable one.
- */
- if (s->refcount > 1)
- return -EINVAL;
-
- s->flags &= ~SLAB_TRACE;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_TRACE;
- }
- return length;
-}
-SLAB_ATTR(trace);
+SLAB_ATTR_RO(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}
-static ssize_t red_zone_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_RED_ZONE;
- if (buf[0] == '1') {
- s->flags |= SLAB_RED_ZONE;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(red_zone);
+SLAB_ATTR_RO(red_zone);
static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
}
-static ssize_t poison_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_POISON;
- if (buf[0] == '1') {
- s->flags |= SLAB_POISON;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(poison);
+SLAB_ATTR_RO(poison);
static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}
-static ssize_t store_user_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_STORE_USER;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_STORE_USER;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(store_user);
+SLAB_ATTR_RO(store_user);
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
@@ -5362,19 +5285,7 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
-
-static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
- size_t length)
-{
- if (s->refcount > 1)
- return -EINVAL;
-
- s->flags &= ~SLAB_FAILSLAB;
- if (buf[0] == '1')
- s->flags |= SLAB_FAILSLAB;
- return length;
-}
-SLAB_ATTR(failslab);
+SLAB_ATTR_RO(failslab);
#endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
@@ -5386,7 +5297,7 @@ static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
if (buf[0] == '1')
- kmem_cache_shrink_all(s);
+ kmem_cache_shrink(s);
else
return -EINVAL;
return length;
@@ -5610,98 +5521,9 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
-#ifdef CONFIG_MEMCG
- if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- struct kmem_cache *c;
-
- mutex_lock(&slab_mutex);
- if (s->max_attr_size < len)
- s->max_attr_size = len;
-
- /*
- * This is a best effort propagation, so this function's return
- * value will be determined by the parent cache only. This is
- * basically because not all attributes will have a well
- * defined semantics for rollbacks - most of the actions will
- * have permanent effects.
- *
- * Returning the error value of any of the children that fail
- * is not 100 % defined, in the sense that users seeing the
- * error code won't be able to know anything about the state of
- * the cache.
- *
- * Only returning the error code for the parent cache at least
- * has well defined semantics. The cache being written to
- * directly either failed or succeeded, in which case we loop
- * through the descendants with best-effort propagation.
- */
- for_each_memcg_cache(c, s)
- attribute->store(c, buf, len);
- mutex_unlock(&slab_mutex);
- }
-#endif
return err;
}
-static void memcg_propagate_slab_attrs(struct kmem_cache *s)
-{
-#ifdef CONFIG_MEMCG
- int i;
- char *buffer = NULL;
- struct kmem_cache *root_cache;
-
- if (is_root_cache(s))
- return;
-
- root_cache = s->memcg_params.root_cache;
-
- /*
- * This mean this cache had no attribute written. Therefore, no point
- * in copying default values around
- */
- if (!root_cache->max_attr_size)
- return;
-
- for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
- char mbuf[64];
- char *buf;
- struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
- ssize_t len;
-
- if (!attr || !attr->store || !attr->show)
- continue;
-
- /*
- * It is really bad that we have to allocate here, so we will
- * do it only as a fallback. If we actually allocate, though,
- * we can just use the allocated buffer until the end.
- *
- * Most of the slub attributes will tend to be very small in
- * size, but sysfs allows buffers up to a page, so they can
- * theoretically happen.
- */
- if (buffer)
- buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
- !IS_ENABLED(CONFIG_SLUB_STATS))
- buf = mbuf;
- else {
- buffer = (char *) get_zeroed_page(GFP_KERNEL);
- if (WARN_ON(!buffer))
- continue;
- buf = buffer;
- }
-
- len = attr->show(root_cache, buf);
- if (len > 0)
- attr->store(s, buf, len);
- }
-
- if (buffer)
- free_page((unsigned long)buffer);
-#endif /* CONFIG_MEMCG */
-}
-
static void kmem_cache_release(struct kobject *k)
{
slab_kmem_cache_release(to_slab(k));
@@ -5721,10 +5543,6 @@ static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG
- if (!is_root_cache(s))
- return s->memcg_params.root_cache->memcg_kset;
-#endif
return slab_kset;
}
@@ -5767,27 +5585,6 @@ static char *create_unique_id(struct kmem_cache *s)
return name;
}
-static void sysfs_slab_remove_workfn(struct work_struct *work)
-{
- struct kmem_cache *s =
- container_of(work, struct kmem_cache, kobj_remove_work);
-
- if (!s->kobj.state_in_sysfs)
- /*
- * For a memcg cache, this may be called during
- * deactivation and again on shutdown. Remove only once.
- * A cache is never shut down before deactivation is
- * complete, so no need to worry about synchronization.
- */
- goto out;
-
-#ifdef CONFIG_MEMCG
- kset_unregister(s->memcg_kset);
-#endif
-out:
- kobject_put(&s->kobj);
-}
-
static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
@@ -5795,8 +5592,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
- INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
-
if (!kset) {
kobject_init(&s->kobj, &slab_ktype);
return 0;
@@ -5833,16 +5628,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
if (err)
goto out_del_kobj;
-#ifdef CONFIG_MEMCG
- if (is_root_cache(s) && memcg_sysfs_enabled) {
- s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
- if (!s->memcg_kset) {
- err = -ENOMEM;
- goto out_del_kobj;
- }
- }
-#endif
-
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
@@ -5856,19 +5641,6 @@ out_del_kobj:
goto out;
}
-static void sysfs_slab_remove(struct kmem_cache *s)
-{
- if (slab_state < FULL)
- /*
- * Sysfs has not been setup yet so no need to remove the
- * cache from sysfs.
- */
- return;
-
- kobject_get(&s->kobj);
- schedule_work(&s->kobj_remove_work);
-}
-
void sysfs_slab_unlink(struct kmem_cache *s)
{
if (slab_state >= FULL)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 0db7738d76e9..16183d85a7d5 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -69,11 +69,19 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
__pa(MAX_DMA_ADDRESS));
}
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap);
+
/* need to make sure size is all the same during early stage */
-void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
+ struct vmem_altmap *altmap)
{
- void *ptr = sparse_buffer_alloc(size);
+ void *ptr;
+
+ if (altmap)
+ return altmap_alloc_block_buf(size, altmap);
+ ptr = sparse_buffer_alloc(size);
if (!ptr)
ptr = vmemmap_alloc_block(size, node);
return ptr;
@@ -94,15 +102,8 @@ static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
return 0;
}
-/**
- * altmap_alloc_block_buf - allocate pages from the device page map
- * @altmap: device page map
- * @size: size (in bytes) of the allocation
- *
- * Allocations are aligned to the size of the request.
- */
-void * __meminit altmap_alloc_block_buf(unsigned long size,
- struct vmem_altmap *altmap)
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap)
{
unsigned long pfn, nr_pfns, nr_align;
@@ -139,12 +140,15 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
start, end - 1);
}
-pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
+pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+ struct vmem_altmap *altmap)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
+ void *p;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -212,8 +216,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
return pgd;
}
-int __meminit vmemmap_populate_basepages(unsigned long start,
- unsigned long end, int node)
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
{
unsigned long addr = start;
pgd_t *pgd;
@@ -235,7 +239,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
pmd = vmemmap_pmd_populate(pud, addr, node);
if (!pmd)
return -ENOMEM;
- pte = vmemmap_pte_populate(pmd, addr, node);
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap);
if (!pte)
return -ENOMEM;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -247,20 +251,12 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- unsigned long start;
- unsigned long end;
-
- /*
- * The minimum granularity of memmap extensions is
- * PAGES_PER_SUBSECTION as allocations are tracked in the
- * 'subsection_map' bitmap of the section.
- */
- end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
- pfn &= PAGE_SUBSECTION_MASK;
- nr_pages = end - pfn;
-
- start = (unsigned long) pfn_to_page(pfn);
- end = start + nr_pages * sizeof(struct page);
+ unsigned long start = (unsigned long) pfn_to_page(pfn);
+ unsigned long end = start + nr_pages * sizeof(struct page);
+
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
+ !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
+ return NULL;
if (vmemmap_populate(start, end, nid, altmap))
return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index b2b9a3e34696..fcc3d176f1ea 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -16,7 +16,6 @@
#include "internal.h"
#include <asm/dma.h>
-#include <asm/pgalloc.h>
/*
* Permanent SPARSEMEM data:
@@ -250,7 +249,7 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
#endif
/* Record a memory area against a node. */
-void __init memory_present(int nid, unsigned long start, unsigned long end)
+static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
@@ -286,11 +285,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
}
/*
- * Mark all memblocks as present using memory_present(). This is a
- * convenience function that is useful for a number of arches
- * to mark all of the systems memory as present during initialization.
+ * Mark all memblocks as present using memory_present().
+ * This is a convenience function that is useful to mark all of the systems
+ * memory as present during initialization.
*/
-void __init memblocks_present(void)
+static void __init memblocks_present(void)
{
struct memblock_region *reg;
@@ -575,9 +574,13 @@ failed:
*/
void __init sparse_init(void)
{
- unsigned long pnum_begin = first_present_section_nr();
- int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
- unsigned long pnum_end, map_count = 1;
+ unsigned long pnum_end, pnum_begin, map_count = 1;
+ int nid_begin;
+
+ memblocks_present();
+
+ pnum_begin = first_present_section_nr();
+ nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
set_pageblock_order();
@@ -825,10 +828,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
}
- if (section_is_early && memmap)
- free_map_bootmem(memmap);
- else
+ /*
+ * The memmap of early sections is always fully populated. See
+ * section_activate() and pfn_valid() .
+ */
+ if (!section_is_early)
depopulate_section_memmap(pfn, nr_pages, altmap);
+ else if (memmap)
+ free_map_bootmem(memmap);
if (empty)
ms->section_mem_map = (unsigned long)NULL;
diff --git a/mm/swap.c b/mm/swap.c
index a82efc33411f..9285e60c7d6e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -476,23 +476,24 @@ void lru_cache_add(struct page *page)
EXPORT_SYMBOL(lru_cache_add);
/**
- * lru_cache_add_active_or_unevictable
+ * lru_cache_add_inactive_or_unevictable
* @page: the page to be added to LRU
* @vma: vma in which page is mapped for determining reclaimability
*
- * Place @page on the active or unevictable LRU list, depending on its
+ * Place @page on the inactive or unevictable LRU list, depending on its
* evictability. Note that if the page is not evictable, it goes
* directly back onto it's zone's unevictable list, it does NOT use a
* per cpu pagevec.
*/
-void lru_cache_add_active_or_unevictable(struct page *page,
+void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
+ bool unevictable;
+
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
- SetPageActive(page);
- else if (!TestSetPageMlocked(page)) {
+ unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
+ if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
/*
* We use the irq-unsafe __mod_zone_page_stat because this
* counter is not modified from interrupt context, and the pte
@@ -830,8 +831,8 @@ void release_pages(struct page **pages, int nr)
LIST_HEAD(pages_to_free);
struct pglist_data *locked_pgdat = NULL;
struct lruvec *lruvec;
- unsigned long uninitialized_var(flags);
- unsigned int uninitialized_var(lock_batch);
+ unsigned long flags;
+ unsigned int lock_batch;
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0975adc72253..3e6453573a89 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -46,8 +46,7 @@ static void __drain_swap_slots_cache(unsigned int type);
static void deactivate_swap_slots_cache(void);
static void reactivate_swap_slots_cache(void);
-#define use_swap_slot_cache (swap_slot_cache_active && \
- swap_slot_cache_enabled && swap_slot_cache_initialized)
+#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
#define SLOTS_CACHE 0x1
#define SLOTS_CACHE_RET 0x2
@@ -94,7 +93,7 @@ static bool check_cache_active(void)
{
long pages;
- if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
+ if (!swap_slot_cache_enabled)
return false;
pages = get_nr_swap_pages();
@@ -136,9 +135,16 @@ static int alloc_swap_slot_cache(unsigned int cpu)
mutex_lock(&swap_slots_cache_mutex);
cache = &per_cpu(swp_slots, cpu);
- if (cache->slots || cache->slots_ret)
+ if (cache->slots || cache->slots_ret) {
/* cache already allocated */
- goto out;
+ mutex_unlock(&swap_slots_cache_mutex);
+
+ kvfree(slots);
+ kvfree(slots_ret);
+
+ return 0;
+ }
+
if (!cache->lock_initialized) {
mutex_init(&cache->alloc_lock);
spin_lock_init(&cache->free_lock);
@@ -155,15 +161,8 @@ static int alloc_swap_slot_cache(unsigned int cpu)
*/
mb();
cache->slots = slots;
- slots = NULL;
cache->slots_ret = slots_ret;
- slots_ret = NULL;
-out:
mutex_unlock(&swap_slots_cache_mutex);
- if (slots)
- kvfree(slots);
- if (slots_ret)
- kvfree(slots_ret);
return 0;
}
@@ -240,21 +239,19 @@ static int free_slot_cache(unsigned int cpu)
int enable_swap_slots_cache(void)
{
- int ret = 0;
-
mutex_lock(&swap_slots_cache_enable_mutex);
- if (swap_slot_cache_initialized) {
- __reenable_swap_slots_cache();
- goto out_unlock;
- }
+ if (!swap_slot_cache_initialized) {
+ int ret;
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
- alloc_swap_slot_cache, free_slot_cache);
- if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
- "without swap slots cache.\n", __func__))
- goto out_unlock;
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
+ alloc_swap_slot_cache, free_slot_cache);
+ if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+ "without swap slots cache.\n", __func__))
+ goto out_unlock;
+
+ swap_slot_cache_initialized = true;
+ }
- swap_slot_cache_initialized = true;
__reenable_swap_slots_cache();
out_unlock:
mutex_unlock(&swap_slots_cache_enable_mutex);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 05889e8e3c97..b73aabdfd35a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -106,16 +106,32 @@ void show_swap_cache_info(void)
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
+void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+ struct address_space *address_space = swap_address_space(entry);
+ pgoff_t idx = swp_offset(entry);
+ struct page *page;
+
+ page = find_get_entry(address_space, idx);
+ if (xa_is_value(page))
+ return page;
+ if (page)
+ put_page(page);
+ return NULL;
+}
+
/*
* add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
+int add_to_swap_cache(struct page *page, swp_entry_t entry,
+ gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
unsigned long i, nr = hpage_nr_pages(page);
+ void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -125,16 +141,25 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
SetPageSwapCache(page);
do {
+ unsigned long nr_shadows = 0;
+
xas_lock_irq(&xas);
xas_create_range(&xas);
if (xas_error(&xas))
goto unlock;
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ old = xas_load(&xas);
+ if (xa_is_value(old)) {
+ nr_shadows++;
+ if (shadowp)
+ *shadowp = old;
+ }
set_page_private(page + i, entry.val + i);
xas_store(&xas, page);
xas_next(&xas);
}
+ address_space->nrexceptional -= nr_shadows;
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
ADD_CACHE_INFO(add_total, nr);
@@ -154,7 +179,8 @@ unlock:
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
+void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry, void *shadow)
{
struct address_space *address_space = swap_address_space(entry);
int i, nr = hpage_nr_pages(page);
@@ -166,12 +192,14 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageWriteback(page), page);
for (i = 0; i < nr; i++) {
- void *entry = xas_store(&xas, NULL);
+ void *entry = xas_store(&xas, shadow);
VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
ClearPageSwapCache(page);
+ if (shadow)
+ address_space->nrexceptional += nr;
address_space->nrpages -= nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
ADD_CACHE_INFO(del_total, nr);
@@ -208,7 +236,7 @@ int add_to_swap(struct page *page)
* Add it to the swap cache.
*/
err = add_to_swap_cache(page, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -246,13 +274,44 @@ void delete_from_swap_cache(struct page *page)
struct address_space *address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page, entry);
+ __delete_from_swap_cache(page, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
put_swap_page(page, entry);
page_ref_sub(page, hpage_nr_pages(page));
}
+void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end)
+{
+ unsigned long curr = begin;
+ void *old;
+
+ for (;;) {
+ unsigned long nr_shadows = 0;
+ swp_entry_t entry = swp_entry(type, curr);
+ struct address_space *address_space = swap_address_space(entry);
+ XA_STATE(xas, &address_space->i_pages, curr);
+
+ xa_lock_irq(&address_space->i_pages);
+ xas_for_each(&xas, old, end) {
+ if (!xa_is_value(old))
+ continue;
+ xas_store(&xas, NULL);
+ nr_shadows++;
+ }
+ address_space->nrexceptional -= nr_shadows;
+ xa_unlock_irq(&address_space->i_pages);
+
+ /* search the next swapcache until we meet end */
+ curr >>= SWAP_ADDRESS_SPACE_SHIFT;
+ curr++;
+ curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+ if (curr > end)
+ break;
+ }
+}
+
/*
* If we are the only user, then try to free up the swap cache.
*
@@ -361,6 +420,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
{
struct swap_info_struct *si;
struct page *page;
+ void *shadow = NULL;
*new_page_allocated = false;
@@ -429,7 +489,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
__SetPageSwapBacked(page);
/* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) {
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
put_swap_page(page, entry);
goto fail_unlock;
}
@@ -439,10 +499,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
goto fail_unlock;
}
- /* XXX: Move to lru_cache_add() when it supports new vs putback */
- spin_lock_irq(&page_pgdat(page)->lru_lock);
- lru_note_cost_page(page);
- spin_unlock_irq(&page_pgdat(page)->lru_lock);
+ if (shadow)
+ workingset_refault(page, shadow);
/* Caller will initiate read into locked page */
SetPageWorkingset(page);
@@ -725,7 +783,7 @@ static void swap_ra_info(struct vm_fault *vmf,
/**
* swap_vma_readahead - swap in pages in hope we need them soon
- * @entry: swap entry of this memory
+ * @fentry: swap entry of this memory
* @gfp_mask: memory allocation flags
* @vmf: fault information
*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 987276c557d1..e653eea1eb88 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -696,6 +696,7 @@ static void add_to_avail_list(struct swap_info_struct *p)
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
unsigned int nr_entries)
{
+ unsigned long begin = offset;
unsigned long end = offset + nr_entries - 1;
void (*swap_slot_free_notify)(struct block_device *, unsigned long);
@@ -721,6 +722,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify(si->bdev, offset);
offset++;
}
+ clear_shadow_from_swap_cache(si->type, begin, end);
}
static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
@@ -1915,7 +1917,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
page_add_anon_rmap(page, vma, addr, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
}
swap_free(entry);
/*
@@ -2929,7 +2931,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
- if (blk_queue_is_zoned(p->bdev->bd_queue))
+ if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 660717a1ea5c..b3de3c4eefba 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -43,7 +43,7 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
/*
* Reject: object partially overlaps the stack (passing the
- * the check above means at least one end is within the stack,
+ * check above means at least one end is within the stack,
* so if this check fails, the other end is outside the stack).
*/
if (obj < stack || stackend < obj + len)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index b80419320c7d..9a3d451402d7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -123,7 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
inc_mm_counter(dst_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_active_or_unevictable(page, dst_vma);
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
diff --git a/mm/util.c b/mm/util.c
index c63c8e47be57..5ef378a2a038 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -503,8 +503,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
- &populate, &uf);
+ ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+ &uf);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
@@ -746,6 +746,47 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
return ret;
}
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+ percpu_counter_sync(&vm_committed_as);
+}
+
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int new_policy;
+ int ret;
+
+ /*
+ * The deviation of sync_overcommit_as could be big with loose policy
+ * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
+ * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
+ * with the strict "NEVER", and to avoid possible race condtion (even
+ * though user usually won't too frequently do the switching to policy
+ * OVERCOMMIT_NEVER), the switch is done in the following order:
+ * 1. changing the batch
+ * 2. sync percpu count on each CPU
+ * 3. switch the policy
+ */
+ if (write) {
+ t = *table;
+ t.data = &new_policy;
+ ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ mm_compute_batch(new_policy);
+ if (new_policy == OVERCOMMIT_NEVER)
+ schedule_on_each_cpu(sync_overcommit_as);
+ sysctl_overcommit_memory = new_policy;
+ } else {
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ }
+
+ return ret;
+}
+
int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
@@ -787,10 +828,15 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
* balancing memory across competing virtual machines that are hosted.
* Several metrics drive this policy engine including the guest reported
* memory commitment.
+ *
+ * The time cost of this is very low for small platforms, and for big
+ * platform like a 2S/36C/72T Skylake server, in worst case where
+ * vm_committed_as's spinlock is under severe contention, the time cost
+ * could be about 30~40 microseconds.
*/
unsigned long vm_memory_committed(void)
{
- return percpu_counter_read_positive(&vm_committed_as);
+ return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5a2b55c8dd9a..b482d240f9a2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -7,6 +7,7 @@
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
* Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
*/
#include <linux/vmalloc.h>
@@ -25,7 +26,7 @@
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
@@ -41,6 +42,7 @@
#include <asm/shmparam.h>
#include "internal.h"
+#include "pgalloc-track.h"
bool is_vmalloc_addr(const void *x)
{
@@ -173,7 +175,6 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
- start = addr;
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
@@ -511,6 +512,10 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
/*
* This function returns back addresses of parent node
* and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case all further
+ * steps regarding inserting of conflicting overlap range
+ * have to be declined and actually considered as a bug.
*/
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
@@ -549,8 +554,12 @@ find_va_links(struct vmap_area *va,
else if (va->va_end > tmp_va->va_start &&
va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
- else
- BUG();
+ else {
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+ return NULL;
+ }
} while (*link);
*parent = &tmp_va->rb_node;
@@ -632,43 +641,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
-augment_tree_propagate_check(struct rb_node *n)
+augment_tree_propagate_check(void)
{
struct vmap_area *va;
- struct rb_node *node;
- unsigned long size;
- bool found = false;
-
- if (n == NULL)
- return;
+ unsigned long computed_size;
- va = rb_entry(n, struct vmap_area, rb_node);
- size = va->subtree_max_size;
- node = n;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
-
- if (get_subtree_max_size(node->rb_left) == size) {
- node = node->rb_left;
- } else {
- if (va_size(va) == size) {
- found = true;
- break;
- }
-
- node = node->rb_right;
- }
- }
-
- if (!found) {
- va = rb_entry(n, struct vmap_area, rb_node);
- pr_emerg("tree is corrupted: %lu, %lu\n",
- va_size(va), va->subtree_max_size);
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ computed_size = compute_subtree_max_size(va);
+ if (computed_size != va->subtree_max_size)
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
}
-
- augment_tree_propagate_check(n->rb_left);
- augment_tree_propagate_check(n->rb_right);
}
#endif
@@ -702,28 +685,15 @@ augment_tree_propagate_check(struct rb_node *n)
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
- struct rb_node *node = &va->rb_node;
- unsigned long new_va_sub_max_size;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
- new_va_sub_max_size = compute_subtree_max_size(va);
-
- /*
- * If the newly calculated maximum available size of the
- * subtree is equal to the current one, then it means that
- * the tree is propagated correctly. So we have to stop at
- * this point to save cycles.
- */
- if (va->subtree_max_size == new_va_sub_max_size)
- break;
-
- va->subtree_max_size = new_va_sub_max_size;
- node = rb_parent(&va->rb_node);
- }
+ /*
+ * Populate the tree from bottom towards the root until
+ * the calculated maximum available size of checked node
+ * is equal to its current one.
+ */
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
#if DEBUG_AUGMENT_PROPAGATE_CHECK
- augment_tree_propagate_check(free_vmap_area_root.rb_node);
+ augment_tree_propagate_check();
#endif
}
@@ -735,7 +705,8 @@ insert_vmap_area(struct vmap_area *va,
struct rb_node *parent;
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
+ if (link)
+ link_va(va, root, parent, link, head);
}
static void
@@ -751,8 +722,10 @@ insert_vmap_area_augment(struct vmap_area *va,
else
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
+ if (link) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
}
/*
@@ -760,6 +733,11 @@ insert_vmap_area_augment(struct vmap_area *va,
* and next free blocks. If coalesce is not done a new
* free area is inserted. If VA has been merged, it is
* freed.
+ *
+ * Please note, it can return NULL in case of overlap
+ * ranges, followed by WARN() report. Despite it is a
+ * buggy behaviour, a system can be alive and keep
+ * ongoing.
*/
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
@@ -776,6 +754,8 @@ merge_or_add_vmap_area(struct vmap_area *va,
* inserted, unless it is merged with its sibling/siblings.
*/
link = find_va_links(va, root, NULL, &parent);
+ if (!link)
+ return NULL;
/*
* Get next node of VA to check if merging can be done.
@@ -796,9 +776,6 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (sibling->va_start == va->va_end) {
sibling->va_start = va->va_start;
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -818,14 +795,18 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (next->prev != head) {
sibling = list_entry(next->prev, struct vmap_area, list);
if (sibling->va_end == va->va_start) {
- sibling->va_end = va->va_end;
-
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
+ /*
+ * If both neighbors are coalesced, it is important
+ * to unlink the "next" node first, followed by merging
+ * with "previous" one. Otherwise the tree might not be
+ * fully populated if a sibling's augmented value is
+ * "normalized" because of rotation operations.
+ */
if (merged)
unlink_va(va, root);
+ sibling->va_end = va->va_end;
+
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -836,11 +817,13 @@ merge_or_add_vmap_area(struct vmap_area *va,
}
insert:
- if (!merged) {
+ if (!merged)
link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
- }
+ /*
+ * Last step is to check and update the tree.
+ */
+ augment_tree_propagate_from(va);
return va;
}
@@ -1381,6 +1364,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
va = merge_or_add_vmap_area(va, &free_vmap_area_root,
&free_vmap_area_list);
+ if (!va)
+ continue;
+
if (is_vmalloc_or_module_addr((void *)orig_start))
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -1513,12 +1499,11 @@ struct vmap_block {
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
* in the free path. Could get rid of this if we change the API to return a
* "cookie" from alloc, to be passed to free. But no big deal yet.
*/
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static DEFINE_XARRAY(vmap_blocks);
/*
* We should probably have a fallback mechanism to allocate virtual memory
@@ -1575,13 +1560,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
- err = radix_tree_preload(gfp_mask);
- if (unlikely(err)) {
- kfree(vb);
- free_vmap_area(va);
- return ERR_PTR(err);
- }
-
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -1594,11 +1572,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
- spin_lock(&vmap_block_tree_lock);
- err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
- spin_unlock(&vmap_block_tree_lock);
- BUG_ON(err);
- radix_tree_preload_end();
+ err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+ if (err) {
+ kfree(vb);
+ free_vmap_area(va);
+ return ERR_PTR(err);
+ }
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
@@ -1612,12 +1591,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
- unsigned long vb_idx;
- vb_idx = addr_to_vb_idx(vb->va->va_start);
- spin_lock(&vmap_block_tree_lock);
- tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
- spin_unlock(&vmap_block_tree_lock);
+ tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
free_vmap_area_noflush(vb->va);
@@ -1723,7 +1698,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
- unsigned long vb_idx;
unsigned int order;
struct vmap_block *vb;
@@ -1733,14 +1707,8 @@ static void vb_free(unsigned long addr, unsigned long size)
flush_cache_vunmap(addr, addr + size);
order = get_order(size);
-
offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
-
- vb_idx = addr_to_vb_idx(addr);
- rcu_read_lock();
- vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
- rcu_read_unlock();
- BUG_ON(!vb);
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
unmap_kernel_range_noflush(addr, size);
@@ -3383,8 +3351,9 @@ recovery:
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
}
@@ -3432,8 +3401,9 @@ err_free_shadow:
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
kfree(vms[area]);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 749d239c62b2..738115ed75e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -170,11 +170,6 @@ struct scan_control {
* From 0 .. 200. Higher means more swappy.
*/
int vm_swappiness = 60;
-/*
- * The total number of pages which are beyond the high watermark within all
- * zones.
- */
-unsigned long vm_total_pages;
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
@@ -859,6 +854,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
{
unsigned long flags;
int refcount;
+ void *shadow = NULL;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
@@ -901,13 +897,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page, swap);
+ if (reclaimed && !mapping_exiting(mapping))
+ shadow = workingset_eviction(page, target_memcg);
+ __delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
- workingset_eviction(page, target_memcg);
} else {
void (*freepage)(struct page *);
- void *shadow = NULL;
freepage = mapping->a_ops->freepage;
/*
@@ -915,7 +911,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* order to detect refaults, thus thrashing, later on.
*
* But don't store shadows in an address space that is
- * already exiting. This is not just an optizimation,
+ * already exiting. This is not just an optimization,
* inode reclaim needs to empty out the radix tree or
* the nodes are lost. Don't plant shadows behind its
* back.
@@ -1003,8 +999,6 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
if (referenced_ptes) {
- if (PageSwapBacked(page))
- return PAGEREF_ACTIVATE;
/*
* All mapped pages start out with page table
* references from the instantiating fault, so we need
@@ -1027,7 +1021,7 @@ static enum page_references page_check_references(struct page *page,
/*
* Activate file-backed executable pages after first usage.
*/
- if (vm_flags & VM_EXEC)
+ if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
@@ -2035,7 +2029,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- __count_vm_events(PGREFILL, nr_scanned);
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2331,7 +2326,8 @@ out:
unsigned long protection;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- protection = mem_cgroup_protection(memcg,
+ protection = mem_cgroup_protection(sc->target_mem_cgroup,
+ memcg,
sc->memcg_low_reclaim);
if (protection) {
@@ -2619,14 +2615,15 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
unsigned long reclaimed;
unsigned long scanned;
- switch (mem_cgroup_protected(target_memcg, memcg)) {
- case MEMCG_PROT_MIN:
+ mem_cgroup_calculate_protection(target_memcg, memcg);
+
+ if (mem_cgroup_below_min(memcg)) {
/*
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
continue;
- case MEMCG_PROT_LOW:
+ } else if (mem_cgroup_below_low(memcg)) {
/*
* Soft protection.
* Respect the protection only as long as
@@ -2638,16 +2635,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
- break;
- case MEMCG_PROT_NONE:
- /*
- * All protection thresholds breached. We may
- * still choose to vary the scan pressure
- * applied based on by how much the cgroup in
- * question has exceeded its protection
- * thresholds (see get_scan_count).
- */
- break;
}
reclaimed = sc->nr_reclaimed;
@@ -2697,7 +2684,10 @@ again:
if (!sc->force_deactivate) {
unsigned long refaults;
- if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
sc->may_deactivate |= DEACTIVATE_ANON;
else
sc->may_deactivate &= ~DEACTIVATE_ANON;
@@ -2708,8 +2698,8 @@ again:
* rid of any stale active pages quickly.
*/
refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE);
- if (refaults != target_lruvec->refaults ||
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
sc->may_deactivate |= DEACTIVATE_FILE;
else
@@ -2808,7 +2798,7 @@ again:
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
- * If kswapd scans pages marked marked for immediate
+ * If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
* faster than they are written so also forcibly stall.
@@ -2986,8 +2976,10 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
unsigned long refaults;
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
- target_lruvec->refaults = refaults;
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
+ target_lruvec->refaults[0] = refaults;
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
+ target_lruvec->refaults[1] = refaults;
}
/*
@@ -3318,7 +3310,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
bool may_swap)
{
unsigned long nr_reclaimed;
- unsigned long pflags;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -3339,17 +3330,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
set_task_reclaim_state(current, &sc.reclaim_state);
-
trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
-
- psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
memalloc_noreclaim_restore(noreclaim_flag);
- psi_memstall_leave(&pflags);
-
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
set_task_reclaim_state(current, NULL);
@@ -3387,7 +3373,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
/*
* Check for watermark boosts top-down as the higher zones
* are more likely to be boosted. Both watermarks and boosts
- * should not be checked at the time time as reclaim would
+ * should not be checked at the same time as reclaim would
* start prematurely when there is no boosting and a lower
* zone is balanced.
*/
@@ -4222,7 +4208,8 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
* unmapped file backed pages.
*/
if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
- node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
+ pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3fb23a21f6dd..727a26d1ec1d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -341,6 +341,11 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long x;
long t;
+ if (vmstat_item_in_bytes(item)) {
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -398,6 +403,8 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -442,6 +449,8 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -541,6 +550,11 @@ static inline void mod_node_state(struct pglist_data *pgdat,
s8 __percpu *p = pcp->vm_node_stat_diff + item;
long o, n, t, z;
+ if (vmstat_item_in_bytes(item)) {
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
do {
z = 0; /* overflow to node counters */
@@ -989,8 +1003,8 @@ unsigned long sum_zone_numa_state(int node,
/*
* Determine the per node value of a stat item.
*/
-unsigned long node_page_state(struct pglist_data *pgdat,
- enum node_stat_item item)
+unsigned long node_page_state_pages(struct pglist_data *pgdat,
+ enum node_stat_item item)
{
long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
@@ -999,6 +1013,14 @@ unsigned long node_page_state(struct pglist_data *pgdat,
#endif
return x;
}
+
+unsigned long node_page_state(struct pglist_data *pgdat,
+ enum node_stat_item item)
+{
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+ return node_page_state_pages(pgdat, item);
+}
#endif
#ifdef CONFIG_COMPACTION
@@ -1074,6 +1096,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
+/*
+ * Calculates external fragmentation within a zone wrt the given order.
+ * It is defined as the percentage of pages found in blocks of size
+ * less than 1 << order. It returns values in range [0, 100].
+ */
+unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ if (info.free_pages == 0)
+ return 0;
+
+ return div_u64((info.free_pages -
+ (info.free_blocks_suitable << order)) * 100,
+ info.free_pages);
+}
+
/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
@@ -1118,10 +1158,6 @@ const char * const vmstat_text[] = {
"nr_zone_write_pending",
"nr_mlock",
"nr_page_table_pages",
- "nr_kernel_stack",
-#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
- "nr_shadow_call_stack",
-#endif
"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
@@ -1149,9 +1185,12 @@ const char * const vmstat_text[] = {
"nr_isolated_anon",
"nr_isolated_file",
"workingset_nodes",
- "workingset_refault",
- "workingset_activate",
- "workingset_restore",
+ "workingset_refault_anon",
+ "workingset_refault_file",
+ "workingset_activate_anon",
+ "workingset_activate_file",
+ "workingset_restore_anon",
+ "workingset_restore_file",
"workingset_nodereclaim",
"nr_anon_pages",
"nr_mapped",
@@ -1172,6 +1211,10 @@ const char * const vmstat_text[] = {
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",
+ "nr_kernel_stack",
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ "nr_shadow_call_stack",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1234,6 +1277,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
"pgmigrate_fail",
+ "thp_migration_success",
+ "thp_migration_fail",
+ "thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
"compact_migrate_scanned",
@@ -1577,7 +1623,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
seq_printf(m, "\n %-12s %lu", node_stat_name(i),
- node_page_state(pgdat, i));
+ node_page_state_pages(pgdat, i));
}
}
seq_printf(m,
@@ -1698,7 +1744,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
#endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- v[i] = global_node_page_state(i);
+ v[i] = global_node_page_state_pages(i);
v += NR_VM_NODE_STAT_ITEMS;
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
diff --git a/mm/workingset.c b/mm/workingset.c
index 50b7937bab32..8cbe4e3cbe5c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -6,6 +6,7 @@
*/
#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
@@ -280,6 +281,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
*/
void workingset_refault(struct page *page, void *shadow)
{
+ bool file = page_is_file_lru(page);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
@@ -346,27 +348,34 @@ void workingset_refault(struct page *page, void *shadow)
memcg = page_memcg(page);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
/*
* Compare the distance to the existing workingset size. We
* don't activate pages that couldn't stay resident even if
- * all the memory was available to the page cache. Whether
- * cache can compete with anon or not depends on having swap.
+ * all the memory was available to the workingset. Whether
+ * workingset competition needs to consider anon or not depends
+ * on having swap.
*/
workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
- if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ if (!file) {
workingset_size += lruvec_page_state(eviction_lruvec,
- NR_INACTIVE_ANON);
+ NR_INACTIVE_FILE);
+ }
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
workingset_size += lruvec_page_state(eviction_lruvec,
NR_ACTIVE_ANON);
+ if (file) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_ANON);
+ }
}
if (refault_distance > workingset_size)
goto out;
SetPageActive(page);
workingset_age_nonresident(lruvec, hpage_nr_pages(page));
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
/* Page was active prior to eviction */
if (workingset) {
@@ -375,7 +384,7 @@ void workingset_refault(struct page *page, void *shadow)
spin_lock_irq(&page_pgdat(page)->lru_lock);
lru_note_cost_page(page);
spin_unlock_irq(&page_pgdat(page)->lru_lock);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
}
out:
rcu_read_unlock();
@@ -486,8 +495,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
NR_LRU_BASE + i);
- pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
- pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
} else
#endif
pages = node_present_pages(sc->nid);
diff --git a/mm/zpool.c b/mm/zpool.c
index 863669212070..3744a2d1a624 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -239,15 +239,15 @@ const char *zpool_get_type(struct zpool *zpool)
}
/**
- * zpool_malloc_support_movable() - Check if the zpool support
- * allocate movable memory
+ * zpool_malloc_support_movable() - Check if the zpool supports
+ * allocating movable memory
* @zpool: The zpool to check
*
- * This returns if the zpool support allocate movable memory.
+ * This returns if the zpool supports allocating movable memory.
*
* Implementations must guarantee this to be thread-safe.
*
- * Returns: true if if the zpool support allocate movable memory, false if not
+ * Returns: true if the zpool supports allocating movable memory, false if not
*/
bool zpool_malloc_support_movable(struct zpool *zpool)
{
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 952a01e45c6a..c36fdff9a371 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -79,7 +79,7 @@
/*
* Object location (<PFN>, <obj_idx>) is encoded as
- * as single (unsigned long) handle value.
+ * a single (unsigned long) handle value.
*
* Note that object index <obj_idx> starts from 0.
*