aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig40
-rw-r--r--mm/Makefile8
-rw-r--r--mm/backing-dev.c6
-rw-r--r--mm/cma_debug.c5
-rw-r--r--mm/compaction.c31
-rw-r--r--mm/damon/Kconfig3
-rw-r--r--mm/damon/core-test.h29
-rw-r--r--mm/damon/core.c308
-rw-r--r--mm/damon/dbgfs.c78
-rw-r--r--mm/damon/lru_sort.c380
-rw-r--r--mm/damon/modules-common.h46
-rw-r--r--mm/damon/ops-common.c50
-rw-r--r--mm/damon/ops-common.h2
-rw-r--r--mm/damon/paddr.c43
-rw-r--r--mm/damon/reclaim.c289
-rw-r--r--mm/damon/sysfs.c145
-rw-r--r--mm/damon/vaddr-test.h36
-rw-r--r--mm/damon/vaddr.c106
-rw-r--r--mm/debug.c14
-rw-r--r--mm/filemap.c131
-rw-r--r--mm/folio-compat.c6
-rw-r--r--mm/frontswap.c3
-rw-r--r--mm/gup.c296
-rw-r--r--mm/highmem.c43
-rw-r--r--mm/hmm.c2
-rw-r--r--mm/huge_memory.c225
-rw-r--r--mm/hugetlb.c910
-rw-r--r--mm/hugetlb_cgroup.c27
-rw-r--r--mm/hugetlb_vmemmap.c11
-rw-r--r--mm/hwpoison-inject.c4
-rw-r--r--mm/init-mm.c4
-rw-r--r--mm/internal.h48
-rw-r--r--mm/kasan/Makefile8
-rw-r--r--mm/kasan/common.c177
-rw-r--r--mm/kasan/generic.c154
-rw-r--r--mm/kasan/hw_tags.c39
-rw-r--r--mm/kasan/kasan.h171
-rw-r--r--mm/kasan/kasan_test.c1457
-rw-r--r--mm/kasan/kasan_test_module.c141
-rw-r--r--mm/kasan/report.c138
-rw-r--r--mm/kasan/report_generic.c46
-rw-r--r--mm/kasan/report_tags.c123
-rw-r--r--mm/kasan/sw_tags.c5
-rw-r--r--mm/kasan/tags.c143
-rw-r--r--mm/kfence/core.c24
-rw-r--r--mm/kfence/report.c1
-rw-r--r--mm/khugepaged.c1182
-rw-r--r--mm/kmemleak.c82
-rw-r--r--mm/kmsan/Makefile28
-rw-r--r--mm/kmsan/core.c450
-rw-r--r--mm/kmsan/hooks.c384
-rw-r--r--mm/kmsan/init.c235
-rw-r--r--mm/kmsan/instrumentation.c308
-rw-r--r--mm/kmsan/kmsan.h211
-rw-r--r--mm/kmsan/kmsan_test.c581
-rw-r--r--mm/kmsan/report.c219
-rw-r--r--mm/kmsan/shadow.c299
-rw-r--r--mm/ksm.c375
-rw-r--r--mm/madvise.c79
-rw-r--r--mm/memblock.c6
-rw-r--r--mm/memcontrol.c305
-rw-r--r--mm/memory-failure.c154
-rw-r--r--mm/memory-tiers.c732
-rw-r--r--mm/memory.c308
-rw-r--r--mm/memory_hotplug.c11
-rw-r--r--mm/mempolicy.c65
-rw-r--r--mm/memremap.c33
-rw-r--r--mm/migrate.c692
-rw-r--r--mm/migrate_device.c274
-rw-r--r--mm/mlock.c37
-rw-r--r--mm/mm_init.c6
-rw-r--r--mm/mm_slot.h55
-rw-r--r--mm/mmap.c2264
-rw-r--r--mm/mmu_gather.c10
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/mprotect.c19
-rw-r--r--mm/mremap.c41
-rw-r--r--mm/msync.c2
-rw-r--r--mm/nommu.c260
-rw-r--r--mm/oom_kill.c11
-rw-r--r--mm/page-writeback.c1
-rw-r--r--mm/page_alloc.c298
-rw-r--r--mm/page_counter.c15
-rw-r--r--mm/page_ext.c117
-rw-r--r--mm/page_io.c43
-rw-r--r--mm/page_isolation.c42
-rw-r--r--mm/page_owner.c103
-rw-r--r--mm/page_table_check.c14
-rw-r--r--mm/page_vma_mapped.c6
-rw-r--r--mm/pagewalk.c12
-rw-r--r--mm/readahead.c22
-rw-r--r--mm/rmap.c184
-rw-r--r--mm/rodata_test.c8
-rw-r--r--mm/secretmem.c10
-rw-r--r--mm/shmem.c454
-rw-r--r--mm/shuffle.c21
-rw-r--r--mm/slab.c344
-rw-r--r--mm/slab.h11
-rw-r--r--mm/slab_common.c256
-rw-r--r--mm/slob.c45
-rw-r--r--mm/slub.c924
-rw-r--r--mm/swap.c73
-rw-r--r--mm/swap.h18
-rw-r--r--mm/swap_cgroup.c6
-rw-r--r--mm/swap_slots.c2
-rw-r--r--mm/swap_state.c113
-rw-r--r--mm/swapfile.c180
-rw-r--r--mm/truncate.c2
-rw-r--r--mm/userfaultfd.c55
-rw-r--r--mm/util.c40
-rw-r--r--mm/vmacache.c117
-rw-r--r--mm/vmalloc.c35
-rw-r--r--mm/vmscan.c3340
-rw-r--r--mm/vmstat.c46
-rw-r--r--mm/workingset.c110
-rw-r--r--mm/zsmalloc.c19
-rw-r--r--mm/zswap.c2
117 files changed, 15884 insertions, 6880 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0331f1461f81..57e1d8c5b505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -23,7 +23,7 @@ menuconfig SWAP
in your computer. If unsure say Y.
config ZSWAP
- bool "Compressed cache for swap pages (EXPERIMENTAL)"
+ bool "Compressed cache for swap pages"
depends on SWAP
select FRONTSWAP
select CRYPTO
@@ -36,12 +36,6 @@ config ZSWAP
in the case where decompressing from RAM is faster than swap device
reads, can also improve workload performance.
- This is marked experimental because it is a new feature (as of
- v3.11) that interacts heavily with memory reclaim. While these
- interactions don't cause any known issues on simple memory setups,
- they have not be fully explored on the large set of potential
- configurations and workloads that exist.
-
config ZSWAP_DEFAULT_ON
bool "Enable the compressed cache for swap pages by default"
depends on ZSWAP
@@ -579,6 +573,12 @@ config COMPACTION
it and then we would be really interested to hear about that at
linux-mm@kvack.org.
+config COMPACT_UNEVICTABLE_DEFAULT
+ int
+ depends on COMPACTION
+ default 0 if PREEMPT_RT
+ default 1
+
#
# support for free page reporting
config PAGE_REPORTING
@@ -1124,6 +1124,32 @@ config PTE_MARKER_UFFD_WP
purposes. It is required to enable userfaultfd write protection on
file-backed memory types like shmem and hugetlbfs.
+# multi-gen LRU {
+config LRU_GEN
+ bool "Multi-Gen LRU"
+ depends on MMU
+ # make sure folio->flags has enough spare bits
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
+ help
+ A high performance LRU implementation to overcommit memory. See
+ Documentation/admin-guide/mm/multigen_lru.rst for details.
+
+config LRU_GEN_ENABLED
+ bool "Enable by default"
+ depends on LRU_GEN
+ help
+ This option enables the multi-gen LRU by default.
+
+config LRU_GEN_STATS
+ bool "Full stats for debugging"
+ depends on LRU_GEN
+ help
+ Do not enable this option unless you plan to look at historical stats
+ from evicted generations for debugging purpose.
+
+ This option has a per-memcg and per-node memory overhead.
+# }
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 9a564f836403..8e105e5b3e29 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o vmacache.o \
+ compaction.o \
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
@@ -89,14 +89,18 @@ obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_KFENCE) += kfence/
+obj-$(CONFIG_KMSAN) += kmsan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
+obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
-obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
+ifdef CONFIG_SWAP
+obj-$(CONFIG_MEMCG) += swap_cgroup.o
+endif
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_GUP_TEST) += gup_test.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index de65cb1e5f76..c30419a5e119 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -776,8 +776,6 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
int bdi_init(struct backing_dev_info *bdi)
{
- int ret;
-
bdi->dev = NULL;
kref_init(&bdi->refcnt);
@@ -788,9 +786,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
- ret = cgwb_bdi_init(bdi);
-
- return ret;
+ return cgwb_bdi_init(bdi);
}
struct backing_dev_info *bdi_alloc(int node_id)
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index c3ffe253e055..602fff89b15f 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -163,11 +163,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
- char name[CMA_MAX_NAME];
- scnprintf(name, sizeof(name), "cma-%s", cma->name);
-
- tmp = debugfs_create_dir(name, root_dentry);
+ tmp = debugfs_create_dir(cma->name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
diff --git a/mm/compaction.c b/mm/compaction.c
index 640fa76228dd..c51f7f545afe 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -52,8 +52,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order))
-#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
-#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
/*
* Page order with-respect-to which proactive compaction
@@ -404,7 +402,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page,
if (cc->ignore_skip_hint)
return false;
- if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ if (!pageblock_aligned(pfn))
return false;
skip = get_pageblock_skip(page);
@@ -886,7 +884,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* COMPACT_CLUSTER_MAX at a time so the second call must
* not falsely conclude that the block should be skipped.
*/
- if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+ if (!valid_page && pageblock_aligned(low_pfn)) {
if (!isolation_suitable(cc, page)) {
low_pfn = end_pfn;
page = NULL;
@@ -1727,11 +1725,7 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
-#ifdef CONFIG_PREEMPT_RT
-int sysctl_compact_unevictable_allowed __read_mostly = 0;
-#else
-int sysctl_compact_unevictable_allowed __read_mostly = 1;
-#endif
+int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNEVICTABLE_DEFAULT;
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
@@ -1853,7 +1847,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
- set_pageblock_skip(freepage);
break;
}
}
@@ -1939,7 +1932,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
* before making it "skip" so other compaction instances do
* not scan the same block.
*/
- if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ if (pageblock_aligned(low_pfn) &&
!fast_find_block && !isolation_suitable(cc, page))
continue;
@@ -1981,9 +1974,21 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
+/*
+ * Determine whether kswapd is (or recently was!) running on this node.
+ *
+ * pgdat_kswapd_lock() pins pgdat->kswapd, so a concurrent kswapd_stop() can't
+ * zero it.
+ */
static bool kswapd_is_running(pg_data_t *pgdat)
{
- return pgdat->kswapd && task_is_running(pgdat->kswapd);
+ bool running;
+
+ pgdat_kswapd_lock(pgdat);
+ running = pgdat->kswapd && task_is_running(pgdat->kswapd);
+ pgdat_kswapd_unlock(pgdat);
+
+ return running;
}
/*
@@ -2113,7 +2118,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
* migration source is unmovable/reclaimable but it's not worth
* special casing.
*/
- if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ if (!pageblock_aligned(cc->migrate_pfn))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 66265e3a9c65..7821fcb3f258 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -68,6 +68,9 @@ config DAMON_DBGFS
If unsure, say N.
+ This will be removed after >5.15.y LTS kernel is released, so users
+ should move to the sysfs interface (DAMON_SYSFS).
+
config DAMON_DBGFS_KUNIT_TEST
bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS
depends on DAMON_DBGFS && KUNIT=y
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index 573669566f84..3db9b7368756 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -126,7 +126,7 @@ static void damon_test_split_at(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 100);
damon_add_region(r, t);
- damon_split_region_at(c, t, r, 25);
+ damon_split_region_at(t, r, 25);
KUNIT_EXPECT_EQ(test, r->ar.start, 0ul);
KUNIT_EXPECT_EQ(test, r->ar.end, 25ul);
@@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test)
t = damon_new_target();
r = damon_new_region(0, 22);
damon_add_region(r, t);
- damon_split_regions_of(c, t, 2);
+ damon_split_regions_of(t, 2);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target();
r = damon_new_region(0, 220);
damon_add_region(r, t);
- damon_split_regions_of(c, t, 4);
+ damon_split_regions_of(t, 4);
KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
damon_destroy_ctx(c);
@@ -267,6 +267,28 @@ static void damon_test_ops_registration(struct kunit *test)
KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
}
+static void damon_test_set_regions(struct kunit *test)
+{
+ struct damon_target *t = damon_new_target();
+ struct damon_region *r1 = damon_new_region(4, 16);
+ struct damon_region *r2 = damon_new_region(24, 32);
+ struct damon_addr_range range = {.start = 8, .end = 28};
+ unsigned long expects[] = {8, 16, 16, 24, 24, 28};
+ int expect_idx = 0;
+ struct damon_region *r;
+
+ damon_add_region(r1, t);
+ damon_add_region(r2, t);
+ damon_set_regions(t, &range, 1);
+
+ KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3);
+ damon_for_each_region(r, t) {
+ KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]);
+ KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]);
+ }
+ damon_destroy_target(t);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_target),
KUNIT_CASE(damon_test_regions),
@@ -276,6 +298,7 @@ static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_test_merge_regions_of),
KUNIT_CASE(damon_test_split_regions_of),
KUNIT_CASE(damon_test_ops_registration),
+ KUNIT_CASE(damon_test_set_regions),
{},
};
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 7d25dc582fe3..36d098d06c55 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -29,6 +29,8 @@ static bool running_exclusive_ctxs;
static DEFINE_MUTEX(damon_ops_lock);
static struct damon_operations damon_registered_ops[NR_DAMON_OPS];
+static struct kmem_cache *damon_region_cache __ro_after_init;
+
/* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */
static bool __damon_is_registered_ops(enum damon_ops_id id)
{
@@ -119,7 +121,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
{
struct damon_region *region;
- region = kmalloc(sizeof(*region), GFP_KERNEL);
+ region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL);
if (!region)
return NULL;
@@ -148,7 +150,7 @@ static void damon_del_region(struct damon_region *r, struct damon_target *t)
static void damon_free_region(struct damon_region *r)
{
- kfree(r);
+ kmem_cache_free(damon_region_cache, r);
}
void damon_destroy_region(struct damon_region *r, struct damon_target *t)
@@ -169,6 +171,30 @@ static bool damon_intersect(struct damon_region *r,
}
/*
+ * Fill holes in regions with new regions.
+ */
+static int damon_fill_regions_holes(struct damon_region *first,
+ struct damon_region *last, struct damon_target *t)
+{
+ struct damon_region *r = first;
+
+ damon_for_each_region_from(r, t) {
+ struct damon_region *next, *newr;
+
+ if (r == last)
+ break;
+ next = damon_next_region(r);
+ if (r->ar.end != next->ar.start) {
+ newr = damon_new_region(r->ar.end, next->ar.start);
+ if (!newr)
+ return -ENOMEM;
+ damon_insert_region(newr, r, next, t);
+ }
+ }
+ return 0;
+}
+
+/*
* damon_set_regions() - Set regions of a target for given address ranges.
* @t: the given target.
* @ranges: array of new monitoring target ranges.
@@ -184,6 +210,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
{
struct damon_region *r, *next;
unsigned int i;
+ int err;
/* Remove regions which are not in the new ranges */
damon_for_each_region_safe(r, next, t) {
@@ -195,6 +222,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
damon_destroy_region(r, t);
}
+ r = damon_first_region(t);
/* Add new regions or resize existing regions to fit in the ranges */
for (i = 0; i < nr_ranges; i++) {
struct damon_region *first = NULL, *last, *newr;
@@ -202,7 +230,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
range = &ranges[i];
/* Get the first/last regions intersecting with the range */
- damon_for_each_region(r, t) {
+ damon_for_each_region_from(r, t) {
if (damon_intersect(r, range)) {
if (!first)
first = r;
@@ -225,52 +253,46 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
first->ar.start = ALIGN_DOWN(range->start,
DAMON_MIN_REGION);
last->ar.end = ALIGN(range->end, DAMON_MIN_REGION);
+
+ /* fill possible holes in the range */
+ err = damon_fill_regions_holes(first, last, t);
+ if (err)
+ return err;
}
}
return 0;
}
-struct damos *damon_new_scheme(
- unsigned long min_sz_region, unsigned long max_sz_region,
- unsigned int min_nr_accesses, unsigned int max_nr_accesses,
- unsigned int min_age_region, unsigned int max_age_region,
- enum damos_action action, struct damos_quota *quota,
- struct damos_watermarks *wmarks)
+/* initialize private fields of damos_quota and return the pointer */
+static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota)
+{
+ quota->total_charged_sz = 0;
+ quota->total_charged_ns = 0;
+ quota->esz = 0;
+ quota->charged_sz = 0;
+ quota->charged_from = 0;
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ return quota;
+}
+
+struct damos *damon_new_scheme(struct damos_access_pattern *pattern,
+ enum damos_action action, struct damos_quota *quota,
+ struct damos_watermarks *wmarks)
{
struct damos *scheme;
scheme = kmalloc(sizeof(*scheme), GFP_KERNEL);
if (!scheme)
return NULL;
- scheme->min_sz_region = min_sz_region;
- scheme->max_sz_region = max_sz_region;
- scheme->min_nr_accesses = min_nr_accesses;
- scheme->max_nr_accesses = max_nr_accesses;
- scheme->min_age_region = min_age_region;
- scheme->max_age_region = max_age_region;
+ scheme->pattern = *pattern;
scheme->action = action;
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
- scheme->quota.ms = quota->ms;
- scheme->quota.sz = quota->sz;
- scheme->quota.reset_interval = quota->reset_interval;
- scheme->quota.weight_sz = quota->weight_sz;
- scheme->quota.weight_nr_accesses = quota->weight_nr_accesses;
- scheme->quota.weight_age = quota->weight_age;
- scheme->quota.total_charged_sz = 0;
- scheme->quota.total_charged_ns = 0;
- scheme->quota.esz = 0;
- scheme->quota.charged_sz = 0;
- scheme->quota.charged_from = 0;
- scheme->quota.charge_target_from = NULL;
- scheme->quota.charge_addr_from = 0;
-
- scheme->wmarks.metric = wmarks->metric;
- scheme->wmarks.interval = wmarks->interval;
- scheme->wmarks.high = wmarks->high;
- scheme->wmarks.mid = wmarks->mid;
- scheme->wmarks.low = wmarks->low;
+ scheme->quota = *(damos_quota_init_priv(quota));
+
+ scheme->wmarks = *wmarks;
scheme->wmarks.activated = true;
return scheme;
@@ -313,6 +335,7 @@ struct damon_target *damon_new_target(void)
t->pid = NULL;
t->nr_regions = 0;
INIT_LIST_HEAD(&t->regions_list);
+ INIT_LIST_HEAD(&t->list);
return t;
}
@@ -360,17 +383,17 @@ struct damon_ctx *damon_new_ctx(void)
if (!ctx)
return NULL;
- ctx->sample_interval = 5 * 1000;
- ctx->aggr_interval = 100 * 1000;
- ctx->ops_update_interval = 60 * 1000 * 1000;
+ ctx->attrs.sample_interval = 5 * 1000;
+ ctx->attrs.aggr_interval = 100 * 1000;
+ ctx->attrs.ops_update_interval = 60 * 1000 * 1000;
ktime_get_coarse_ts64(&ctx->last_aggregation);
ctx->last_ops_update = ctx->last_aggregation;
mutex_init(&ctx->kdamond_lock);
- ctx->min_nr_regions = 10;
- ctx->max_nr_regions = 1000;
+ ctx->attrs.min_nr_regions = 10;
+ ctx->attrs.max_nr_regions = 1000;
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
@@ -406,32 +429,21 @@ void damon_destroy_ctx(struct damon_ctx *ctx)
/**
* damon_set_attrs() - Set attributes for the monitoring.
* @ctx: monitoring context
- * @sample_int: time interval between samplings
- * @aggr_int: time interval between aggregations
- * @ops_upd_int: time interval between monitoring operations updates
- * @min_nr_reg: minimal number of regions
- * @max_nr_reg: maximum number of regions
+ * @attrs: monitoring attributes
*
* This function should not be called while the kdamond is running.
* Every time interval is in micro-seconds.
*
* Return: 0 on success, negative error code otherwise.
*/
-int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
- unsigned long aggr_int, unsigned long ops_upd_int,
- unsigned long min_nr_reg, unsigned long max_nr_reg)
+int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs)
{
- if (min_nr_reg < 3)
+ if (attrs->min_nr_regions < 3)
return -EINVAL;
- if (min_nr_reg > max_nr_reg)
+ if (attrs->min_nr_regions > attrs->max_nr_regions)
return -EINVAL;
- ctx->sample_interval = sample_int;
- ctx->aggr_interval = aggr_int;
- ctx->ops_update_interval = ops_upd_int;
- ctx->min_nr_regions = min_nr_reg;
- ctx->max_nr_regions = max_nr_reg;
-
+ ctx->attrs = *attrs;
return 0;
}
@@ -443,10 +455,8 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
*
* This function should not be called while the kdamond of the context is
* running.
- *
- * Return: 0 if success, or negative error code otherwise.
*/
-int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
+void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
ssize_t nr_schemes)
{
struct damos *s, *next;
@@ -456,7 +466,6 @@ int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
damon_destroy_scheme(s);
for (i = 0; i < nr_schemes; i++)
damon_add_scheme(ctx, schemes[i]);
- return 0;
}
/**
@@ -482,11 +491,11 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- sz += r->ar.end - r->ar.start;
+ sz += damon_sz_region(r);
}
- if (ctx->min_nr_regions)
- sz /= ctx->min_nr_regions;
+ if (ctx->attrs.min_nr_regions)
+ sz /= ctx->attrs.min_nr_regions;
if (sz < DAMON_MIN_REGION)
sz = DAMON_MIN_REGION;
@@ -635,7 +644,7 @@ static bool damon_check_reset_time_interval(struct timespec64 *baseline,
static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
{
return damon_check_reset_time_interval(&ctx->last_aggregation,
- ctx->aggr_interval);
+ ctx->attrs.aggr_interval);
}
/*
@@ -658,19 +667,20 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
}
}
-static void damon_split_region_at(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- unsigned long sz_r);
+static void damon_split_region_at(struct damon_target *t,
+ struct damon_region *r, unsigned long sz_r);
static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
unsigned long sz;
- sz = r->ar.end - r->ar.start;
- return s->min_sz_region <= sz && sz <= s->max_sz_region &&
- s->min_nr_accesses <= r->nr_accesses &&
- r->nr_accesses <= s->max_nr_accesses &&
- s->min_age_region <= r->age && r->age <= s->max_age_region;
+ sz = damon_sz_region(r);
+ return s->pattern.min_sz_region <= sz &&
+ sz <= s->pattern.max_sz_region &&
+ s->pattern.min_nr_accesses <= r->nr_accesses &&
+ r->nr_accesses <= s->pattern.max_nr_accesses &&
+ s->pattern.min_age_region <= r->age &&
+ r->age <= s->pattern.max_age_region;
}
static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
@@ -692,7 +702,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
- unsigned long sz = r->ar.end - r->ar.start;
+ unsigned long sz = damon_sz_region(r);
struct timespec64 begin, end;
unsigned long sz_applied = 0;
@@ -721,14 +731,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
sz = ALIGN_DOWN(quota->charge_addr_from -
r->ar.start, DAMON_MIN_REGION);
if (!sz) {
- if (r->ar.end - r->ar.start <=
- DAMON_MIN_REGION)
+ if (damon_sz_region(r) <=
+ DAMON_MIN_REGION)
continue;
sz = DAMON_MIN_REGION;
}
- damon_split_region_at(c, t, r, sz);
+ damon_split_region_at(t, r, sz);
r = damon_next_region(r);
- sz = r->ar.end - r->ar.start;
+ sz = damon_sz_region(r);
}
quota->charge_target_from = NULL;
quota->charge_addr_from = 0;
@@ -745,7 +755,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
DAMON_MIN_REGION);
if (!sz)
goto update_stat;
- damon_split_region_at(c, t, r, sz);
+ damon_split_region_at(t, r, sz);
}
ktime_get_coarse_ts64(&begin);
sz_applied = c->ops.apply_scheme(c, t, r, s);
@@ -833,8 +843,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
continue;
score = c->ops.get_scheme_score(
c, t, r, s);
- quota->histogram[score] +=
- r->ar.end - r->ar.start;
+ quota->histogram[score] += damon_sz_region(r);
if (score > max_score)
max_score = score;
}
@@ -855,18 +864,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
}
}
-static inline unsigned long sz_damon_region(struct damon_region *r)
-{
- return r->ar.end - r->ar.start;
-}
-
/*
* Merge two adjacent regions into one region
*/
static void damon_merge_two_regions(struct damon_target *t,
struct damon_region *l, struct damon_region *r)
{
- unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r);
+ unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r);
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
@@ -895,7 +899,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
if (prev && prev->ar.end == r->ar.start &&
abs(prev->nr_accesses - r->nr_accesses) <= thres &&
- sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
+ damon_sz_region(prev) + damon_sz_region(r) <= sz_limit)
damon_merge_two_regions(t, prev, r);
else
prev = r;
@@ -928,9 +932,8 @@ static void kdamond_merge_regions(struct damon_ctx *c, unsigned int threshold,
* r the region to be split
* sz_r size of the first sub-region that will be made
*/
-static void damon_split_region_at(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- unsigned long sz_r)
+static void damon_split_region_at(struct damon_target *t,
+ struct damon_region *r, unsigned long sz_r)
{
struct damon_region *new;
@@ -947,15 +950,14 @@ static void damon_split_region_at(struct damon_ctx *ctx,
}
/* Split every region in the given target into 'nr_subs' regions */
-static void damon_split_regions_of(struct damon_ctx *ctx,
- struct damon_target *t, int nr_subs)
+static void damon_split_regions_of(struct damon_target *t, int nr_subs)
{
struct damon_region *r, *next;
unsigned long sz_region, sz_sub = 0;
int i;
damon_for_each_region_safe(r, next, t) {
- sz_region = r->ar.end - r->ar.start;
+ sz_region = damon_sz_region(r);
for (i = 0; i < nr_subs - 1 &&
sz_region > 2 * DAMON_MIN_REGION; i++) {
@@ -969,7 +971,7 @@ static void damon_split_regions_of(struct damon_ctx *ctx,
if (sz_sub == 0 || sz_sub >= sz_region)
continue;
- damon_split_region_at(ctx, t, r, sz_sub);
+ damon_split_region_at(t, r, sz_sub);
sz_region = sz_sub;
}
}
@@ -995,16 +997,16 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
damon_for_each_target(t, ctx)
nr_regions += damon_nr_regions(t);
- if (nr_regions > ctx->max_nr_regions / 2)
+ if (nr_regions > ctx->attrs.max_nr_regions / 2)
return;
/* Maybe the middle of the region has different access frequency */
if (last_nr_regions == nr_regions &&
- nr_regions < ctx->max_nr_regions / 3)
+ nr_regions < ctx->attrs.max_nr_regions / 3)
nr_subregions = 3;
damon_for_each_target(t, ctx)
- damon_split_regions_of(ctx, t, nr_subregions);
+ damon_split_regions_of(t, nr_subregions);
last_nr_regions = nr_regions;
}
@@ -1018,7 +1020,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx)
static bool kdamond_need_update_operations(struct damon_ctx *ctx)
{
return damon_check_reset_time_interval(&ctx->last_ops_update,
- ctx->ops_update_interval);
+ ctx->attrs.ops_update_interval);
}
/*
@@ -1142,32 +1144,27 @@ static int kdamond_fn(void *data)
struct damon_region *r, *next;
unsigned int max_nr_accesses = 0;
unsigned long sz_limit = 0;
- bool done = false;
pr_debug("kdamond (%d) starts\n", current->pid);
if (ctx->ops.init)
ctx->ops.init(ctx);
if (ctx->callback.before_start && ctx->callback.before_start(ctx))
- done = true;
+ goto done;
sz_limit = damon_region_sz_limit(ctx);
- while (!kdamond_need_stop(ctx) && !done) {
- if (kdamond_wait_activation(ctx)) {
- done = true;
- continue;
- }
+ while (!kdamond_need_stop(ctx)) {
+ if (kdamond_wait_activation(ctx))
+ break;
if (ctx->ops.prepare_access_checks)
ctx->ops.prepare_access_checks(ctx);
if (ctx->callback.after_sampling &&
- ctx->callback.after_sampling(ctx)) {
- done = true;
- continue;
- }
+ ctx->callback.after_sampling(ctx))
+ break;
- kdamond_usleep(ctx->sample_interval);
+ kdamond_usleep(ctx->attrs.sample_interval);
if (ctx->ops.check_accesses)
max_nr_accesses = ctx->ops.check_accesses(ctx);
@@ -1177,10 +1174,8 @@ static int kdamond_fn(void *data)
max_nr_accesses / 10,
sz_limit);
if (ctx->callback.after_aggregation &&
- ctx->callback.after_aggregation(ctx)) {
- done = true;
- continue;
- }
+ ctx->callback.after_aggregation(ctx))
+ break;
kdamond_apply_schemes(ctx);
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
@@ -1194,6 +1189,7 @@ static int kdamond_fn(void *data)
sz_limit = damon_region_sz_limit(ctx);
}
}
+done:
damon_for_each_target(t, ctx) {
damon_for_each_region_safe(r, next, t)
damon_destroy_region(r, t);
@@ -1218,4 +1214,90 @@ static int kdamond_fn(void *data)
return 0;
}
+/*
+ * struct damon_system_ram_region - System RAM resource address region of
+ * [@start, @end).
+ * @start: Start address of the region (inclusive).
+ * @end: End address of the region (exclusive).
+ */
+struct damon_system_ram_region {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int walk_system_ram(struct resource *res, void *arg)
+{
+ struct damon_system_ram_region *a = arg;
+
+ if (a->end - a->start < resource_size(res)) {
+ a->start = res->start;
+ a->end = res->end;
+ }
+ return 0;
+}
+
+/*
+ * Find biggest 'System RAM' resource and store its start and end address in
+ * @start and @end, respectively. If no System RAM is found, returns false.
+ */
+static bool damon_find_biggest_system_ram(unsigned long *start,
+ unsigned long *end)
+
+{
+ struct damon_system_ram_region arg = {};
+
+ walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
+ if (arg.end <= arg.start)
+ return false;
+
+ *start = arg.start;
+ *end = arg.end;
+ return true;
+}
+
+/**
+ * damon_set_region_biggest_system_ram_default() - Set the region of the given
+ * monitoring target as requested, or biggest 'System RAM'.
+ * @t: The monitoring target to set the region.
+ * @start: The pointer to the start address of the region.
+ * @end: The pointer to the end address of the region.
+ *
+ * This function sets the region of @t as requested by @start and @end. If the
+ * values of @start and @end are zero, however, this function finds the biggest
+ * 'System RAM' resource and sets the region to cover the resource. In the
+ * latter case, this function saves the start and end addresses of the resource
+ * in @start and @end, respectively.
+ *
+ * Return: 0 on success, negative error code otherwise.
+ */
+int damon_set_region_biggest_system_ram_default(struct damon_target *t,
+ unsigned long *start, unsigned long *end)
+{
+ struct damon_addr_range addr_range;
+
+ if (*start > *end)
+ return -EINVAL;
+
+ if (!*start && !*end &&
+ !damon_find_biggest_system_ram(start, end))
+ return -EINVAL;
+
+ addr_range.start = *start;
+ addr_range.end = *end;
+ return damon_set_regions(t, &addr_range, 1);
+}
+
+static int __init damon_init(void)
+{
+ damon_region_cache = KMEM_CACHE(damon_region, 0);
+ if (unlikely(!damon_region_cache)) {
+ pr_err("creating damon_region_cache fails\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+subsys_initcall(damon_init);
+
#include "core-test.h"
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index cfdf63132d5a..b3f454a5c682 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -55,9 +55,9 @@ static ssize_t dbgfs_attrs_read(struct file *file,
mutex_lock(&ctx->kdamond_lock);
ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n",
- ctx->sample_interval, ctx->aggr_interval,
- ctx->ops_update_interval, ctx->min_nr_regions,
- ctx->max_nr_regions);
+ ctx->attrs.sample_interval, ctx->attrs.aggr_interval,
+ ctx->attrs.ops_update_interval,
+ ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions);
mutex_unlock(&ctx->kdamond_lock);
return simple_read_from_buffer(buf, count, ppos, kbuf, ret);
@@ -67,7 +67,7 @@ static ssize_t dbgfs_attrs_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
- unsigned long s, a, r, minr, maxr;
+ struct damon_attrs attrs;
char *kbuf;
ssize_t ret;
@@ -76,7 +76,10 @@ static ssize_t dbgfs_attrs_write(struct file *file,
return PTR_ERR(kbuf);
if (sscanf(kbuf, "%lu %lu %lu %lu %lu",
- &s, &a, &r, &minr, &maxr) != 5) {
+ &attrs.sample_interval, &attrs.aggr_interval,
+ &attrs.ops_update_interval,
+ &attrs.min_nr_regions,
+ &attrs.max_nr_regions) != 5) {
ret = -EINVAL;
goto out;
}
@@ -87,7 +90,7 @@ static ssize_t dbgfs_attrs_write(struct file *file,
goto unlock_out;
}
- ret = damon_set_attrs(ctx, s, a, r, minr, maxr);
+ ret = damon_set_attrs(ctx, &attrs);
if (!ret)
ret = count;
unlock_out:
@@ -131,9 +134,12 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
damon_for_each_scheme(s, c) {
rc = scnprintf(&buf[written], len - written,
"%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
- s->min_sz_region, s->max_sz_region,
- s->min_nr_accesses, s->max_nr_accesses,
- s->min_age_region, s->max_age_region,
+ s->pattern.min_sz_region,
+ s->pattern.max_sz_region,
+ s->pattern.min_nr_accesses,
+ s->pattern.max_nr_accesses,
+ s->pattern.min_age_region,
+ s->pattern.max_age_region,
damos_action_to_dbgfs_scheme_action(s->action),
s->quota.ms, s->quota.sz,
s->quota.reset_interval,
@@ -221,8 +227,6 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
struct damos *scheme, **schemes;
const int max_nr_schemes = 256;
int pos = 0, parsed, ret;
- unsigned long min_sz, max_sz;
- unsigned int min_nr_a, max_nr_a, min_age, max_age;
unsigned int action_input;
enum damos_action action;
@@ -233,13 +237,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
*nr_schemes = 0;
while (pos < len && *nr_schemes < max_nr_schemes) {
+ struct damos_access_pattern pattern = {};
struct damos_quota quota = {};
struct damos_watermarks wmarks;
ret = sscanf(&str[pos],
"%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
- &min_sz, &max_sz, &min_nr_a, &max_nr_a,
- &min_age, &max_age, &action_input, &quota.ms,
+ &pattern.min_sz_region, &pattern.max_sz_region,
+ &pattern.min_nr_accesses,
+ &pattern.max_nr_accesses,
+ &pattern.min_age_region,
+ &pattern.max_age_region,
+ &action_input, &quota.ms,
&quota.sz, &quota.reset_interval,
&quota.weight_sz, &quota.weight_nr_accesses,
&quota.weight_age, &wmarks.metric,
@@ -251,7 +260,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
if ((int)action < 0)
goto fail;
- if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age)
+ if (pattern.min_sz_region > pattern.max_sz_region ||
+ pattern.min_nr_accesses > pattern.max_nr_accesses ||
+ pattern.min_age_region > pattern.max_age_region)
goto fail;
if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
@@ -259,8 +270,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
goto fail;
pos += parsed;
- scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
- min_age, max_age, action, &quota, &wmarks);
+ scheme = damon_new_scheme(&pattern, action, &quota, &wmarks);
if (!scheme)
goto fail;
@@ -297,11 +307,9 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
goto unlock_out;
}
- ret = damon_set_schemes(ctx, schemes, nr_schemes);
- if (!ret) {
- ret = count;
- nr_schemes = 0;
- }
+ damon_set_schemes(ctx, schemes, nr_schemes);
+ ret = count;
+ nr_schemes = 0;
unlock_out:
mutex_unlock(&ctx->kdamond_lock);
@@ -882,8 +890,10 @@ out:
static int dbgfs_rm_context(char *name)
{
struct dentry *root, *dir, **new_dirs;
+ struct inode *inode;
struct damon_ctx **new_ctxs;
int i, j;
+ int ret = 0;
if (damon_nr_running_ctxs())
return -EBUSY;
@@ -896,16 +906,24 @@ static int dbgfs_rm_context(char *name)
if (!dir)
return -ENOENT;
+ inode = d_inode(dir);
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = -EINVAL;
+ goto out_dput;
+ }
+
new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
GFP_KERNEL);
- if (!new_dirs)
- return -ENOMEM;
+ if (!new_dirs) {
+ ret = -ENOMEM;
+ goto out_dput;
+ }
new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
GFP_KERNEL);
if (!new_ctxs) {
- kfree(new_dirs);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_new_dirs;
}
for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
@@ -925,7 +943,13 @@ static int dbgfs_rm_context(char *name)
dbgfs_ctxs = new_ctxs;
dbgfs_nr_ctxs--;
- return 0;
+ goto out_dput;
+
+out_new_dirs:
+ kfree(new_dirs);
+out_dput:
+ dput(dir);
+ return ret;
}
static ssize_t dbgfs_rm_context_write(struct file *file,
@@ -1044,7 +1068,7 @@ static int __init __damon_dbgfs_init(void)
fops[i]);
dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
- dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL);
+ dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL);
if (!dbgfs_dirs) {
debugfs_remove(dbgfs_root);
return -ENOMEM;
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index 9de6f00a71c5..efbc2bda8b9c 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -13,6 +13,8 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
+#include "modules-common.h"
+
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
@@ -63,109 +65,35 @@ module_param(hot_thres_access_freq, ulong, 0600);
static unsigned long cold_min_age __read_mostly = 120000000;
module_param(cold_min_age, ulong, 0600);
-/*
- * Limit of time for trying the LRU lists sorting in milliseconds.
- *
- * DAMON_LRU_SORT tries to use only up to this time within a time window
- * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used
- * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the
- * limit is disabled.
- *
- * 10 ms by default.
- */
-static unsigned long quota_ms __read_mostly = 10;
-module_param(quota_ms, ulong, 0600);
-
-/*
- * The time quota charge reset interval in milliseconds.
- *
- * The charge reset interval for the quota of time (quota_ms). That is,
- * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms
- * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds.
- *
- * 1 second by default.
- */
-static unsigned long quota_reset_interval_ms __read_mostly = 1000;
-module_param(quota_reset_interval_ms, ulong, 0600);
-
-/*
- * The watermarks check time interval in microseconds.
- *
- * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is
- * enabled but inactive due to its watermarks rule. 5 seconds by default.
- */
-static unsigned long wmarks_interval __read_mostly = 5000000;
-module_param(wmarks_interval, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the high watermark.
- *
- * If free memory of the system in bytes per thousand bytes is higher than
- * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically
- * checks the watermarks. 200 (20%) by default.
- */
-static unsigned long wmarks_high __read_mostly = 200;
-module_param(wmarks_high, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the middle watermark.
- *
- * If free memory of the system in bytes per thousand bytes is between this and
- * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring
- * and the LRU-lists sorting. 150 (15%) by default.
- */
-static unsigned long wmarks_mid __read_mostly = 150;
-module_param(wmarks_mid, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the low watermark.
- *
- * If free memory of the system in bytes per thousand bytes is lower than this,
- * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks
- * the watermarks. 50 (5%) by default.
- */
-static unsigned long wmarks_low __read_mostly = 50;
-module_param(wmarks_low, ulong, 0600);
-
-/*
- * Sampling interval for the monitoring in microseconds.
- *
- * The sampling interval of DAMON for the hot/cold memory monitoring. Please
- * refer to the DAMON documentation for more detail. 5 ms by default.
- */
-static unsigned long sample_interval __read_mostly = 5000;
-module_param(sample_interval, ulong, 0600);
-
-/*
- * Aggregation interval for the monitoring in microseconds.
- *
- * The aggregation interval of DAMON for the hot/cold memory monitoring.
- * Please refer to the DAMON documentation for more detail. 100 ms by default.
- */
-static unsigned long aggr_interval __read_mostly = 100000;
-module_param(aggr_interval, ulong, 0600);
-
-/*
- * Minimum number of monitoring regions.
- *
- * The minimal number of monitoring regions of DAMON for the hot/cold memory
- * monitoring. This can be used to set lower-bound of the monitoring quality.
- * But, setting this too high could result in increased monitoring overhead.
- * Please refer to the DAMON documentation for more detail. 10 by default.
- */
-static unsigned long min_nr_regions __read_mostly = 10;
-module_param(min_nr_regions, ulong, 0600);
-
-/*
- * Maximum number of monitoring regions.
- *
- * The maximum number of monitoring regions of DAMON for the hot/cold memory
- * monitoring. This can be used to set upper-bound of the monitoring overhead.
- * However, setting this too low could result in bad monitoring quality.
- * Please refer to the DAMON documentation for more detail. 1000 by default.
- */
-static unsigned long max_nr_regions __read_mostly = 1000;
-module_param(max_nr_regions, ulong, 0600);
+static struct damos_quota damon_lru_sort_quota = {
+ /* Use up to 10 ms per 1 sec, by default */
+ .ms = 10,
+ .sz = 0,
+ .reset_interval = 1000,
+ /* Within the quota, mark hotter regions accessed first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 1,
+ .weight_age = 0,
+};
+DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota);
+
+static struct damos_watermarks damon_lru_sort_wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = 5000000, /* 5 seconds */
+ .high = 200, /* 20 percent */
+ .mid = 150, /* 15 percent */
+ .low = 50, /* 5 percent */
+};
+DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks);
+
+static struct damon_attrs damon_lru_sort_mon_attrs = {
+ .sample_interval = 5000, /* 5 ms */
+ .aggr_interval = 100000, /* 100 ms */
+ .ops_update_interval = 0,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+};
+DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs);
/*
* Start of the target memory region in physical address.
@@ -194,222 +122,97 @@ module_param(monitor_region_end, ulong, 0600);
static int kdamond_pid __read_mostly = -1;
module_param(kdamond_pid, int, 0400);
-/*
- * Number of hot memory regions that tried to be LRU-sorted.
- */
-static unsigned long nr_lru_sort_tried_hot_regions __read_mostly;
-module_param(nr_lru_sort_tried_hot_regions, ulong, 0400);
-
-/*
- * Total bytes of hot memory regions that tried to be LRU-sorted.
- */
-static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly;
-module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400);
-
-/*
- * Number of hot memory regions that successfully be LRU-sorted.
- */
-static unsigned long nr_lru_sorted_hot_regions __read_mostly;
-module_param(nr_lru_sorted_hot_regions, ulong, 0400);
-
-/*
- * Total bytes of hot memory regions that successfully be LRU-sorted.
- */
-static unsigned long bytes_lru_sorted_hot_regions __read_mostly;
-module_param(bytes_lru_sorted_hot_regions, ulong, 0400);
-
-/*
- * Number of times that the time quota limit for hot regions have exceeded
- */
-static unsigned long nr_hot_quota_exceeds __read_mostly;
-module_param(nr_hot_quota_exceeds, ulong, 0400);
-
-/*
- * Number of cold memory regions that tried to be LRU-sorted.
- */
-static unsigned long nr_lru_sort_tried_cold_regions __read_mostly;
-module_param(nr_lru_sort_tried_cold_regions, ulong, 0400);
-
-/*
- * Total bytes of cold memory regions that tried to be LRU-sorted.
- */
-static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly;
-module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400);
-
-/*
- * Number of cold memory regions that successfully be LRU-sorted.
- */
-static unsigned long nr_lru_sorted_cold_regions __read_mostly;
-module_param(nr_lru_sorted_cold_regions, ulong, 0400);
-
-/*
- * Total bytes of cold memory regions that successfully be LRU-sorted.
- */
-static unsigned long bytes_lru_sorted_cold_regions __read_mostly;
-module_param(bytes_lru_sorted_cold_regions, ulong, 0400);
-
-/*
- * Number of times that the time quota limit for cold regions have exceeded
- */
-static unsigned long nr_cold_quota_exceeds __read_mostly;
-module_param(nr_cold_quota_exceeds, ulong, 0400);
+static struct damos_stat damon_lru_sort_hot_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat,
+ lru_sort_tried_hot_regions, lru_sorted_hot_regions,
+ hot_quota_exceeds);
+
+static struct damos_stat damon_lru_sort_cold_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat,
+ lru_sort_tried_cold_regions, lru_sorted_cold_regions,
+ cold_quota_exceeds);
+
+static struct damos_access_pattern damon_lru_sort_stub_pattern = {
+ /* Find regions having PAGE_SIZE or larger size */
+ .min_sz_region = PAGE_SIZE,
+ .max_sz_region = ULONG_MAX,
+ /* no matter its access frequency */
+ .min_nr_accesses = 0,
+ .max_nr_accesses = UINT_MAX,
+ /* no matter its age */
+ .min_age_region = 0,
+ .max_age_region = UINT_MAX,
+};
static struct damon_ctx *ctx;
static struct damon_target *target;
-struct damon_lru_sort_ram_walk_arg {
- unsigned long start;
- unsigned long end;
-};
-
-static int walk_system_ram(struct resource *res, void *arg)
-{
- struct damon_lru_sort_ram_walk_arg *a = arg;
-
- if (a->end - a->start < resource_size(res)) {
- a->start = res->start;
- a->end = res->end;
- }
- return 0;
-}
-
-/*
- * Find biggest 'System RAM' resource and store its start and end address in
- * @start and @end, respectively. If no System RAM is found, returns false.
- */
-static bool get_monitoring_region(unsigned long *start, unsigned long *end)
+static struct damos *damon_lru_sort_new_scheme(
+ struct damos_access_pattern *pattern, enum damos_action action)
{
- struct damon_lru_sort_ram_walk_arg arg = {};
+ struct damos_quota quota = damon_lru_sort_quota;
- walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
- if (arg.end <= arg.start)
- return false;
+ /* Use half of total quota for hot/cold pages sorting */
+ quota.ms = quota.ms / 2;
- *start = arg.start;
- *end = arg.end;
- return true;
+ return damon_new_scheme(
+ /* find the pattern, and */
+ pattern,
+ /* (de)prioritize on LRU-lists */
+ action,
+ /* under the quota. */
+ &quota,
+ /* (De)activate this according to the watermarks. */
+ &damon_lru_sort_wmarks);
}
/* Create a DAMON-based operation scheme for hot memory regions */
static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres)
{
- struct damos_watermarks wmarks = {
- .metric = DAMOS_WMARK_FREE_MEM_RATE,
- .interval = wmarks_interval,
- .high = wmarks_high,
- .mid = wmarks_mid,
- .low = wmarks_low,
- };
- struct damos_quota quota = {
- /*
- * Do not try LRU-lists sorting of hot pages for more than half
- * of quota_ms milliseconds within quota_reset_interval_ms.
- */
- .ms = quota_ms / 2,
- .sz = 0,
- .reset_interval = quota_reset_interval_ms,
- /* Within the quota, mark hotter regions accessed first. */
- .weight_sz = 0,
- .weight_nr_accesses = 1,
- .weight_age = 0,
- };
- struct damos *scheme = damon_new_scheme(
- /* Find regions having PAGE_SIZE or larger size */
- PAGE_SIZE, ULONG_MAX,
- /* and accessed for more than the threshold */
- hot_thres, UINT_MAX,
- /* no matter its age */
- 0, UINT_MAX,
- /* prioritize those on LRU lists, as soon as found */
- DAMOS_LRU_PRIO,
- /* under the quota. */
- &quota,
- /* (De)activate this according to the watermarks. */
- &wmarks);
+ struct damos_access_pattern pattern = damon_lru_sort_stub_pattern;
- return scheme;
+ pattern.min_nr_accesses = hot_thres;
+ return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO);
}
/* Create a DAMON-based operation scheme for cold memory regions */
static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres)
{
- struct damos_watermarks wmarks = {
- .metric = DAMOS_WMARK_FREE_MEM_RATE,
- .interval = wmarks_interval,
- .high = wmarks_high,
- .mid = wmarks_mid,
- .low = wmarks_low,
- };
- struct damos_quota quota = {
- /*
- * Do not try LRU-lists sorting of cold pages for more than
- * half of quota_ms milliseconds within
- * quota_reset_interval_ms.
- */
- .ms = quota_ms / 2,
- .sz = 0,
- .reset_interval = quota_reset_interval_ms,
- /* Within the quota, mark colder regions not accessed first. */
- .weight_sz = 0,
- .weight_nr_accesses = 0,
- .weight_age = 1,
- };
- struct damos *scheme = damon_new_scheme(
- /* Find regions having PAGE_SIZE or larger size */
- PAGE_SIZE, ULONG_MAX,
- /* and not accessed at all */
- 0, 0,
- /* for cold_thres or more micro-seconds, and */
- cold_thres, UINT_MAX,
- /* mark those as not accessed, as soon as found */
- DAMOS_LRU_DEPRIO,
- /* under the quota. */
- &quota,
- /* (De)activate this according to the watermarks. */
- &wmarks);
+ struct damos_access_pattern pattern = damon_lru_sort_stub_pattern;
- return scheme;
+ pattern.max_nr_accesses = 0;
+ pattern.min_age_region = cold_thres;
+ return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
}
static int damon_lru_sort_apply_parameters(void)
{
- struct damos *scheme, *next_scheme;
- struct damon_addr_range addr_range;
+ struct damos *scheme;
unsigned int hot_thres, cold_thres;
int err = 0;
- err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
- min_nr_regions, max_nr_regions);
+ err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs);
if (err)
return err;
- /* free previously set schemes */
- damon_for_each_scheme_safe(scheme, next_scheme, ctx)
- damon_destroy_scheme(scheme);
-
/* aggr_interval / sample_interval is the maximum nr_accesses */
- hot_thres = aggr_interval / sample_interval * hot_thres_access_freq /
- 1000;
+ hot_thres = damon_lru_sort_mon_attrs.aggr_interval /
+ damon_lru_sort_mon_attrs.sample_interval *
+ hot_thres_access_freq / 1000;
scheme = damon_lru_sort_new_hot_scheme(hot_thres);
if (!scheme)
return -ENOMEM;
- damon_add_scheme(ctx, scheme);
+ damon_set_schemes(ctx, &scheme, 1);
- cold_thres = cold_min_age / aggr_interval;
+ cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
scheme = damon_lru_sort_new_cold_scheme(cold_thres);
if (!scheme)
return -ENOMEM;
damon_add_scheme(ctx, scheme);
- if (monitor_region_start > monitor_region_end)
- return -EINVAL;
- if (!monitor_region_start && !monitor_region_end &&
- !get_monitoring_region(&monitor_region_start,
- &monitor_region_end))
- return -EINVAL;
- addr_range.start = monitor_region_start;
- addr_range.end = monitor_region_end;
- return damon_set_regions(target, &addr_range, 1);
+ return damon_set_region_biggest_system_ram_default(target,
+ &monitor_region_start,
+ &monitor_region_end);
}
static int damon_lru_sort_turn(bool on)
@@ -495,19 +298,10 @@ static int damon_lru_sort_after_aggregation(struct damon_ctx *c)
/* update the stats parameter */
damon_for_each_scheme(s, c) {
- if (s->action == DAMOS_LRU_PRIO) {
- nr_lru_sort_tried_hot_regions = s->stat.nr_tried;
- bytes_lru_sort_tried_hot_regions = s->stat.sz_tried;
- nr_lru_sorted_hot_regions = s->stat.nr_applied;
- bytes_lru_sorted_hot_regions = s->stat.sz_applied;
- nr_hot_quota_exceeds = s->stat.qt_exceeds;
- } else if (s->action == DAMOS_LRU_DEPRIO) {
- nr_lru_sort_tried_cold_regions = s->stat.nr_tried;
- bytes_lru_sort_tried_cold_regions = s->stat.sz_tried;
- nr_lru_sorted_cold_regions = s->stat.nr_applied;
- bytes_lru_sorted_cold_regions = s->stat.sz_applied;
- nr_cold_quota_exceeds = s->stat.qt_exceeds;
- }
+ if (s->action == DAMOS_LRU_PRIO)
+ damon_lru_sort_hot_stat = s->stat;
+ else if (s->action == DAMOS_LRU_DEPRIO)
+ damon_lru_sort_cold_stat = s->stat;
}
return damon_lru_sort_handle_commit_inputs();
diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h
new file mode 100644
index 000000000000..5a4921851d32
--- /dev/null
+++ b/mm/damon/modules-common.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for DAMON Modules
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/moduleparam.h>
+
+#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \
+ module_param_named(sample_interval, attrs.sample_interval, \
+ ulong, 0600); \
+ module_param_named(aggr_interval, attrs.aggr_interval, ulong, \
+ 0600); \
+ module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \
+ 0600); \
+ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \
+ 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \
+ module_param_named(quota_ms, quota.ms, ulong, 0600); \
+ module_param_named(quota_reset_interval_ms, \
+ quota.reset_interval, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \
+ DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \
+ module_param_named(quota_sz, quota.sz, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \
+ module_param_named(wmarks_interval, wmarks.interval, ulong, \
+ 0600); \
+ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \
+ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \
+ module_param_named(wmarks_low, wmarks.low, ulong, 0600);
+
+#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \
+ succ_name, qt_exceed_name) \
+ module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \
+ module_param_named(bytes_##try_name, stat.sz_tried, ulong, \
+ 0400); \
+ module_param_named(nr_##succ_name, stat.nr_applied, ulong, \
+ 0400); \
+ module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \
+ 0400); \
+ module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \
+ 0400);
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index b1335de200e7..75409601f934 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -88,7 +88,7 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
#define DAMON_MAX_SUBSCORE (100)
#define DAMON_MAX_AGE_IN_LOG (32)
-int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s)
{
unsigned int max_nr_accesses;
@@ -99,10 +99,10 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
unsigned int age_weight = s->quota.weight_age;
int hotness;
- max_nr_accesses = c->aggr_interval / c->sample_interval;
+ max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval;
freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
- age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
+ age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000;
for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
age_in_log++, age_in_sec >>= 1)
;
@@ -127,48 +127,14 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
*/
hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
- /* Return coldness of the region */
- return DAMOS_MAX_SCORE - hotness;
+ return hotness;
}
-int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s)
{
- unsigned int max_nr_accesses;
- int freq_subscore;
- unsigned int age_in_sec;
- int age_in_log, age_subscore;
- unsigned int freq_weight = s->quota.weight_nr_accesses;
- unsigned int age_weight = s->quota.weight_age;
- int hotness;
-
- max_nr_accesses = c->aggr_interval / c->sample_interval;
- freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
-
- age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
- for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
- age_in_log++, age_in_sec >>= 1)
- ;
+ int hotness = damon_hot_score(c, r, s);
- /* If frequency is 0, higher age means it's colder */
- if (freq_subscore == 0)
- age_in_log *= -1;
-
- /*
- * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
- * Scale it to be in [0, 100] and set it as age subscore.
- */
- age_in_log += DAMON_MAX_AGE_IN_LOG;
- age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
- DAMON_MAX_AGE_IN_LOG / 2;
-
- hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
- if (freq_weight + age_weight)
- hotness /= freq_weight + age_weight;
- /*
- * Transform it to fit in [0, DAMOS_MAX_SCORE]
- */
- hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
-
- return hotness;
+ /* Return coldness of the region */
+ return DAMOS_MAX_SCORE - hotness;
}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index 52329ff361cd..8d82d3722204 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -12,7 +12,7 @@ struct page *damon_get_page(unsigned long pfn);
void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
-int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+int damon_cold_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index dc131c6a5403..e1a4315c4be6 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -63,8 +63,7 @@ out:
folio_put(folio);
}
-static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
- struct damon_region *r)
+static void __damon_pa_prepare_access_check(struct damon_region *r)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
@@ -78,7 +77,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t)
- __damon_pa_prepare_access_check(ctx, r);
+ __damon_pa_prepare_access_check(r);
}
}
@@ -166,8 +165,7 @@ out:
return result.accessed;
}
-static void __damon_pa_check_access(struct damon_ctx *ctx,
- struct damon_region *r)
+static void __damon_pa_check_access(struct damon_region *r)
{
static unsigned long last_addr;
static unsigned long last_page_sz = PAGE_SIZE;
@@ -196,7 +194,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
damon_for_each_target(t, ctx) {
damon_for_each_region(r, t) {
- __damon_pa_check_access(ctx, r);
+ __damon_pa_check_access(r);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
}
@@ -233,7 +231,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r)
return applied * PAGE_SIZE;
}
-static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+static inline unsigned long damon_pa_mark_accessed_or_deactivate(
+ struct damon_region *r, bool mark_accessed)
{
unsigned long addr, applied = 0;
@@ -242,27 +241,24 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r)
if (!page)
continue;
- mark_page_accessed(page);
+ if (mark_accessed)
+ mark_page_accessed(page);
+ else
+ deactivate_page(page);
put_page(page);
applied++;
}
return applied * PAGE_SIZE;
}
-static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+static unsigned long damon_pa_mark_accessed(struct damon_region *r)
{
- unsigned long addr, applied = 0;
-
- for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
- struct page *page = damon_get_page(PHYS_PFN(addr));
+ return damon_pa_mark_accessed_or_deactivate(r, true);
+}
- if (!page)
- continue;
- deactivate_page(page);
- put_page(page);
- applied++;
- }
- return applied * PAGE_SIZE;
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+{
+ return damon_pa_mark_accessed_or_deactivate(r, false);
}
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
@@ -276,7 +272,10 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
return damon_pa_mark_accessed(r);
case DAMOS_LRU_DEPRIO:
return damon_pa_deactivate_pages(r);
+ case DAMOS_STAT:
+ break;
default:
+ /* DAMOS actions that not yet supported by 'paddr'. */
break;
}
return 0;
@@ -288,11 +287,11 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pageout_score(context, r, scheme);
+ return damon_cold_score(context, r, scheme);
case DAMOS_LRU_PRIO:
return damon_hot_score(context, r, scheme);
case DAMOS_LRU_DEPRIO:
- return damon_pageout_score(context, r, scheme);
+ return damon_cold_score(context, r, scheme);
default:
break;
}
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index a7faf51b4bd4..162c9b1ca00f 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -13,6 +13,8 @@
#include <linux/sched.h>
#include <linux/workqueue.h>
+#include "modules-common.h"
+
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
@@ -50,124 +52,35 @@ module_param(commit_inputs, bool, 0600);
static unsigned long min_age __read_mostly = 120000000;
module_param(min_age, ulong, 0600);
-/*
- * Limit of time for trying the reclamation in milliseconds.
- *
- * DAMON_RECLAIM tries to use only up to this time within a time window
- * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be
- * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero,
- * the limit is disabled.
- *
- * 10 ms by default.
- */
-static unsigned long quota_ms __read_mostly = 10;
-module_param(quota_ms, ulong, 0600);
-
-/*
- * Limit of size of memory for the reclamation in bytes.
- *
- * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a
- * time window (quota_reset_interval_ms) and makes no more than this limit is
- * tried. This can be used for limiting consumption of CPU and IO. If this
- * value is zero, the limit is disabled.
- *
- * 128 MiB by default.
- */
-static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024;
-module_param(quota_sz, ulong, 0600);
-
-/*
- * The time/size quota charge reset interval in milliseconds.
- *
- * The charge reset interval for the quota of time (quota_ms) and size
- * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than
- * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
- * milliseconds.
- *
- * 1 second by default.
- */
-static unsigned long quota_reset_interval_ms __read_mostly = 1000;
-module_param(quota_reset_interval_ms, ulong, 0600);
-
-/*
- * The watermarks check time interval in microseconds.
- *
- * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
- * enabled but inactive due to its watermarks rule. 5 seconds by default.
- */
-static unsigned long wmarks_interval __read_mostly = 5000000;
-module_param(wmarks_interval, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the high watermark.
- *
- * If free memory of the system in bytes per thousand bytes is higher than
- * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
- * checks the watermarks. 500 (50%) by default.
- */
-static unsigned long wmarks_high __read_mostly = 500;
-module_param(wmarks_high, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the middle watermark.
- *
- * If free memory of the system in bytes per thousand bytes is between this and
- * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring
- * and the reclaiming. 400 (40%) by default.
- */
-static unsigned long wmarks_mid __read_mostly = 400;
-module_param(wmarks_mid, ulong, 0600);
-
-/*
- * Free memory rate (per thousand) for the low watermark.
- *
- * If free memory of the system in bytes per thousand bytes is lower than this,
- * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks
- * the watermarks. In the case, the system falls back to the LRU-based page
- * granularity reclamation logic. 200 (20%) by default.
- */
-static unsigned long wmarks_low __read_mostly = 200;
-module_param(wmarks_low, ulong, 0600);
-
-/*
- * Sampling interval for the monitoring in microseconds.
- *
- * The sampling interval of DAMON for the cold memory monitoring. Please refer
- * to the DAMON documentation for more detail. 5 ms by default.
- */
-static unsigned long sample_interval __read_mostly = 5000;
-module_param(sample_interval, ulong, 0600);
-
-/*
- * Aggregation interval for the monitoring in microseconds.
- *
- * The aggregation interval of DAMON for the cold memory monitoring. Please
- * refer to the DAMON documentation for more detail. 100 ms by default.
- */
-static unsigned long aggr_interval __read_mostly = 100000;
-module_param(aggr_interval, ulong, 0600);
-
-/*
- * Minimum number of monitoring regions.
- *
- * The minimal number of monitoring regions of DAMON for the cold memory
- * monitoring. This can be used to set lower-bound of the monitoring quality.
- * But, setting this too high could result in increased monitoring overhead.
- * Please refer to the DAMON documentation for more detail. 10 by default.
- */
-static unsigned long min_nr_regions __read_mostly = 10;
-module_param(min_nr_regions, ulong, 0600);
-
-/*
- * Maximum number of monitoring regions.
- *
- * The maximum number of monitoring regions of DAMON for the cold memory
- * monitoring. This can be used to set upper-bound of the monitoring overhead.
- * However, setting this too low could result in bad monitoring quality.
- * Please refer to the DAMON documentation for more detail. 1000 by default.
- */
-static unsigned long max_nr_regions __read_mostly = 1000;
-module_param(max_nr_regions, ulong, 0600);
+static struct damos_quota damon_reclaim_quota = {
+ /* use up to 10 ms time, reclaim up to 128 MiB per 1 sec by default */
+ .ms = 10,
+ .sz = 128 * 1024 * 1024,
+ .reset_interval = 1000,
+ /* Within the quota, page out older regions first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 0,
+ .weight_age = 1
+};
+DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota);
+
+static struct damos_watermarks damon_reclaim_wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = 5000000, /* 5 seconds */
+ .high = 500, /* 50 percent */
+ .mid = 400, /* 40 percent */
+ .low = 200, /* 20 percent */
+};
+DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks);
+
+static struct damon_attrs damon_reclaim_mon_attrs = {
+ .sample_interval = 5000, /* 5 ms */
+ .aggr_interval = 100000, /* 100 ms */
+ .ops_update_interval = 0,
+ .min_nr_regions = 10,
+ .max_nr_regions = 1000,
+};
+DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs);
/*
* Start of the target memory region in physical address.
@@ -196,119 +109,44 @@ module_param(monitor_region_end, ulong, 0600);
static int kdamond_pid __read_mostly = -1;
module_param(kdamond_pid, int, 0400);
-/*
- * Number of memory regions that tried to be reclaimed.
- */
-static unsigned long nr_reclaim_tried_regions __read_mostly;
-module_param(nr_reclaim_tried_regions, ulong, 0400);
-
-/*
- * Total bytes of memory regions that tried to be reclaimed.
- */
-static unsigned long bytes_reclaim_tried_regions __read_mostly;
-module_param(bytes_reclaim_tried_regions, ulong, 0400);
-
-/*
- * Number of memory regions that successfully be reclaimed.
- */
-static unsigned long nr_reclaimed_regions __read_mostly;
-module_param(nr_reclaimed_regions, ulong, 0400);
-
-/*
- * Total bytes of memory regions that successfully be reclaimed.
- */
-static unsigned long bytes_reclaimed_regions __read_mostly;
-module_param(bytes_reclaimed_regions, ulong, 0400);
-
-/*
- * Number of times that the time/space quota limits have exceeded
- */
-static unsigned long nr_quota_exceeds __read_mostly;
-module_param(nr_quota_exceeds, ulong, 0400);
+static struct damos_stat damon_reclaim_stat;
+DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat,
+ reclaim_tried_regions, reclaimed_regions, quota_exceeds);
static struct damon_ctx *ctx;
static struct damon_target *target;
-struct damon_reclaim_ram_walk_arg {
- unsigned long start;
- unsigned long end;
-};
-
-static int walk_system_ram(struct resource *res, void *arg)
-{
- struct damon_reclaim_ram_walk_arg *a = arg;
-
- if (a->end - a->start < resource_size(res)) {
- a->start = res->start;
- a->end = res->end;
- }
- return 0;
-}
-
-/*
- * Find biggest 'System RAM' resource and store its start and end address in
- * @start and @end, respectively. If no System RAM is found, returns false.
- */
-static bool get_monitoring_region(unsigned long *start, unsigned long *end)
-{
- struct damon_reclaim_ram_walk_arg arg = {};
-
- walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
- if (arg.end <= arg.start)
- return false;
-
- *start = arg.start;
- *end = arg.end;
- return true;
-}
-
static struct damos *damon_reclaim_new_scheme(void)
{
- struct damos_watermarks wmarks = {
- .metric = DAMOS_WMARK_FREE_MEM_RATE,
- .interval = wmarks_interval,
- .high = wmarks_high,
- .mid = wmarks_mid,
- .low = wmarks_low,
- };
- struct damos_quota quota = {
- /*
- * Do not try reclamation for more than quota_ms milliseconds
- * or quota_sz bytes within quota_reset_interval_ms.
- */
- .ms = quota_ms,
- .sz = quota_sz,
- .reset_interval = quota_reset_interval_ms,
- /* Within the quota, page out older regions first. */
- .weight_sz = 0,
- .weight_nr_accesses = 0,
- .weight_age = 1
+ struct damos_access_pattern pattern = {
+ /* Find regions having PAGE_SIZE or larger size */
+ .min_sz_region = PAGE_SIZE,
+ .max_sz_region = ULONG_MAX,
+ /* and not accessed at all */
+ .min_nr_accesses = 0,
+ .max_nr_accesses = 0,
+ /* for min_age or more micro-seconds */
+ .min_age_region = min_age /
+ damon_reclaim_mon_attrs.aggr_interval,
+ .max_age_region = UINT_MAX,
};
- struct damos *scheme = damon_new_scheme(
- /* Find regions having PAGE_SIZE or larger size */
- PAGE_SIZE, ULONG_MAX,
- /* and not accessed at all */
- 0, 0,
- /* for min_age or more micro-seconds, and */
- min_age / aggr_interval, UINT_MAX,
+
+ return damon_new_scheme(
+ &pattern,
/* page out those, as soon as found */
DAMOS_PAGEOUT,
/* under the quota. */
- &quota,
+ &damon_reclaim_quota,
/* (De)activate this according to the watermarks. */
- &wmarks);
-
- return scheme;
+ &damon_reclaim_wmarks);
}
static int damon_reclaim_apply_parameters(void)
{
struct damos *scheme;
- struct damon_addr_range addr_range;
int err = 0;
- err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
- min_nr_regions, max_nr_regions);
+ err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs);
if (err)
return err;
@@ -316,19 +154,11 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
- err = damon_set_schemes(ctx, &scheme, 1);
- if (err)
- return err;
+ damon_set_schemes(ctx, &scheme, 1);
- if (monitor_region_start > monitor_region_end)
- return -EINVAL;
- if (!monitor_region_start && !monitor_region_end &&
- !get_monitoring_region(&monitor_region_start,
- &monitor_region_end))
- return -EINVAL;
- addr_range.start = monitor_region_start;
- addr_range.end = monitor_region_end;
- return damon_set_regions(target, &addr_range, 1);
+ return damon_set_region_biggest_system_ram_default(target,
+ &monitor_region_start,
+ &monitor_region_end);
}
static int damon_reclaim_turn(bool on)
@@ -413,13 +243,8 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c)
struct damos *s;
/* update the stats parameter */
- damon_for_each_scheme(s, c) {
- nr_reclaim_tried_regions = s->stat.nr_tried;
- bytes_reclaim_tried_regions = s->stat.sz_tried;
- nr_reclaimed_regions = s->stat.nr_applied;
- bytes_reclaimed_regions = s->stat.sz_applied;
- nr_quota_exceeds = s->stat.qt_exceeds;
- }
+ damon_for_each_scheme(s, c)
+ damon_reclaim_stat = s->stat;
return damon_reclaim_handle_commit_inputs();
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 7488e27c87c3..9f1219a67e3f 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -58,7 +58,7 @@ static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr,
err = kstrtoul(buf, 0, &min);
if (err)
- return -EINVAL;
+ return err;
range->min = min;
return count;
@@ -83,7 +83,7 @@ static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr,
err = kstrtoul(buf, 0, &max);
if (err)
- return -EINVAL;
+ return err;
range->max = max;
return count;
@@ -291,9 +291,7 @@ static ssize_t interval_us_store(struct kobject *kobj,
struct damon_sysfs_watermarks, kobj);
int err = kstrtoul(buf, 0, &watermarks->interval_us);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static ssize_t high_show(struct kobject *kobj,
@@ -312,9 +310,7 @@ static ssize_t high_store(struct kobject *kobj,
struct damon_sysfs_watermarks, kobj);
int err = kstrtoul(buf, 0, &watermarks->high);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static ssize_t mid_show(struct kobject *kobj,
@@ -333,9 +329,7 @@ static ssize_t mid_store(struct kobject *kobj,
struct damon_sysfs_watermarks, kobj);
int err = kstrtoul(buf, 0, &watermarks->mid);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static ssize_t low_show(struct kobject *kobj,
@@ -354,9 +348,7 @@ static ssize_t low_store(struct kobject *kobj,
struct damon_sysfs_watermarks, kobj);
int err = kstrtoul(buf, 0, &watermarks->low);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static void damon_sysfs_watermarks_release(struct kobject *kobj)
@@ -437,9 +429,7 @@ static ssize_t sz_permil_store(struct kobject *kobj,
struct damon_sysfs_weights, kobj);
int err = kstrtouint(buf, 0, &weights->sz);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static ssize_t nr_accesses_permil_show(struct kobject *kobj,
@@ -458,9 +448,7 @@ static ssize_t nr_accesses_permil_store(struct kobject *kobj,
struct damon_sysfs_weights, kobj);
int err = kstrtouint(buf, 0, &weights->nr_accesses);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static ssize_t age_permil_show(struct kobject *kobj,
@@ -479,9 +467,7 @@ static ssize_t age_permil_store(struct kobject *kobj,
struct damon_sysfs_weights, kobj);
int err = kstrtouint(buf, 0, &weights->age);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static void damon_sysfs_weights_release(struct kobject *kobj)
@@ -1031,8 +1017,7 @@ static ssize_t nr_schemes_show(struct kobject *kobj,
static ssize_t nr_schemes_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_schemes *schemes = container_of(kobj,
- struct damon_sysfs_schemes, kobj);
+ struct damon_sysfs_schemes *schemes;
int nr, err = kstrtoint(buf, 0, &nr);
if (err)
@@ -1040,6 +1025,8 @@ static ssize_t nr_schemes_store(struct kobject *kobj,
if (nr < 0)
return -EINVAL;
+ schemes = container_of(kobj, struct damon_sysfs_schemes, kobj);
+
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_schemes_add_dirs(schemes, nr);
@@ -1110,9 +1097,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr,
struct damon_sysfs_region, kobj);
int err = kstrtoul(buf, 0, &region->start);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -1131,9 +1116,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr,
struct damon_sysfs_region, kobj);
int err = kstrtoul(buf, 0, &region->end);
- if (err)
- return -EINVAL;
- return count;
+ return err ? err : count;
}
static void damon_sysfs_region_release(struct kobject *kobj)
@@ -1237,8 +1220,7 @@ static ssize_t nr_regions_show(struct kobject *kobj,
static ssize_t nr_regions_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_regions *regions = container_of(kobj,
- struct damon_sysfs_regions, kobj);
+ struct damon_sysfs_regions *regions;
int nr, err = kstrtoint(buf, 0, &nr);
if (err)
@@ -1246,6 +1228,8 @@ static ssize_t nr_regions_store(struct kobject *kobj,
if (nr < 0)
return -EINVAL;
+ regions = container_of(kobj, struct damon_sysfs_regions, kobj);
+
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_regions_add_dirs(regions, nr);
@@ -1440,8 +1424,7 @@ static ssize_t nr_targets_show(struct kobject *kobj,
static ssize_t nr_targets_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_targets *targets = container_of(kobj,
- struct damon_sysfs_targets, kobj);
+ struct damon_sysfs_targets *targets;
int nr, err = kstrtoint(buf, 0, &nr);
if (err)
@@ -1449,6 +1432,8 @@ static ssize_t nr_targets_store(struct kobject *kobj,
if (nr < 0)
return -EINVAL;
+ targets = container_of(kobj, struct damon_sysfs_targets, kobj);
+
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_targets_add_dirs(targets, nr);
@@ -1525,7 +1510,7 @@ static ssize_t sample_us_store(struct kobject *kobj,
int err = kstrtoul(buf, 0, &us);
if (err)
- return -EINVAL;
+ return err;
intervals->sample_us = us;
return count;
@@ -1549,7 +1534,7 @@ static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr,
int err = kstrtoul(buf, 0, &us);
if (err)
- return -EINVAL;
+ return err;
intervals->aggr_us = us;
return count;
@@ -1573,7 +1558,7 @@ static ssize_t update_us_store(struct kobject *kobj,
int err = kstrtoul(buf, 0, &us);
if (err)
- return -EINVAL;
+ return err;
intervals->update_us = us;
return count;
@@ -1962,8 +1947,7 @@ static ssize_t nr_contexts_show(struct kobject *kobj,
static ssize_t nr_contexts_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_contexts *contexts = container_of(kobj,
- struct damon_sysfs_contexts, kobj);
+ struct damon_sysfs_contexts *contexts;
int nr, err;
err = kstrtoint(buf, 0, &nr);
@@ -1973,6 +1957,7 @@ static ssize_t nr_contexts_store(struct kobject *kobj,
if (nr < 0 || 1 < nr)
return -EINVAL;
+ contexts = container_of(kobj, struct damon_sysfs_contexts, kobj);
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_contexts_add_dirs(contexts, nr);
@@ -2127,18 +2112,23 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx,
struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals;
struct damon_sysfs_ul_range *sys_nr_regions =
sys_attrs->nr_regions_range;
-
- return damon_set_attrs(ctx, sys_intervals->sample_us,
- sys_intervals->aggr_us, sys_intervals->update_us,
- sys_nr_regions->min, sys_nr_regions->max);
+ struct damon_attrs attrs = {
+ .sample_interval = sys_intervals->sample_us,
+ .aggr_interval = sys_intervals->aggr_us,
+ .ops_update_interval = sys_intervals->update_us,
+ .min_nr_regions = sys_nr_regions->min,
+ .max_nr_regions = sys_nr_regions->max,
+ };
+ return damon_set_attrs(ctx, &attrs);
}
static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
+ bool has_pid = damon_target_has_pid(ctx);
damon_for_each_target_safe(t, next, ctx) {
- if (damon_target_has_pid(ctx))
+ if (has_pid)
put_pid(t->pid);
damon_destroy_target(t);
}
@@ -2182,12 +2172,12 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (!t)
return -ENOMEM;
+ damon_add_target(ctx, t);
if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
goto destroy_targets_out;
}
- damon_add_target(ctx, t);
err = damon_sysfs_set_regions(t, sys_target->regions);
if (err)
goto destroy_targets_out;
@@ -2259,11 +2249,20 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx,
static struct damos *damon_sysfs_mk_scheme(
struct damon_sysfs_scheme *sysfs_scheme)
{
- struct damon_sysfs_access_pattern *pattern =
+ struct damon_sysfs_access_pattern *access_pattern =
sysfs_scheme->access_pattern;
struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas;
struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights;
struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks;
+
+ struct damos_access_pattern pattern = {
+ .min_sz_region = access_pattern->sz->min,
+ .max_sz_region = access_pattern->sz->max,
+ .min_nr_accesses = access_pattern->nr_accesses->min,
+ .max_nr_accesses = access_pattern->nr_accesses->max,
+ .min_age_region = access_pattern->age->min,
+ .max_age_region = access_pattern->age->max,
+ };
struct damos_quota quota = {
.ms = sysfs_quotas->ms,
.sz = sysfs_quotas->sz,
@@ -2280,10 +2279,8 @@ static struct damos *damon_sysfs_mk_scheme(
.low = sysfs_wmarks->low,
};
- return damon_new_scheme(pattern->sz->min, pattern->sz->max,
- pattern->nr_accesses->min, pattern->nr_accesses->max,
- pattern->age->min, pattern->age->max,
- sysfs_scheme->action, &quota, &wmarks);
+ return damon_new_scheme(&pattern, sysfs_scheme->action, &quota,
+ &wmarks);
}
static int damon_sysfs_set_schemes(struct damon_ctx *ctx,
@@ -2309,7 +2306,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
- if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR)
+ if (!damon_target_has_pid(ctx))
return;
mutex_lock(&ctx->kdamond_lock);
@@ -2455,8 +2452,7 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
struct damon_ctx *ctx;
int err;
- if (kdamond->damon_ctx &&
- damon_sysfs_ctx_running(kdamond->damon_ctx))
+ if (damon_sysfs_kdamond_running(kdamond))
return -EBUSY;
if (damon_sysfs_cmd_request.kdamond == kdamond)
return -EBUSY;
@@ -2579,19 +2575,16 @@ static ssize_t pid_show(struct kobject *kobj,
struct damon_sysfs_kdamond *kdamond = container_of(kobj,
struct damon_sysfs_kdamond, kobj);
struct damon_ctx *ctx;
- int pid;
+ int pid = -1;
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
ctx = kdamond->damon_ctx;
- if (!ctx) {
- pid = -1;
+ if (!ctx)
goto out;
- }
+
mutex_lock(&ctx->kdamond_lock);
- if (!ctx->kdamond)
- pid = -1;
- else
+ if (ctx->kdamond)
pid = ctx->kdamond->pid;
mutex_unlock(&ctx->kdamond_lock);
out:
@@ -2657,23 +2650,18 @@ static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds)
kdamonds->kdamonds_arr = NULL;
}
-static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds,
+static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds,
int nr_kdamonds)
{
- int nr_running_ctxs = 0;
int i;
for (i = 0; i < nr_kdamonds; i++) {
- struct damon_ctx *ctx = kdamonds[i]->damon_ctx;
-
- if (!ctx)
- continue;
- mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond)
- nr_running_ctxs++;
- mutex_unlock(&ctx->kdamond_lock);
+ if (damon_sysfs_kdamond_running(kdamonds[i]) ||
+ damon_sysfs_cmd_request.kdamond == kdamonds[i])
+ return true;
}
- return nr_running_ctxs;
+
+ return false;
}
static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds,
@@ -2682,15 +2670,9 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds,
struct damon_sysfs_kdamond **kdamonds_arr, *kdamond;
int err, i;
- if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr))
+ if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr))
return -EBUSY;
- for (i = 0; i < kdamonds->nr; i++) {
- if (damon_sysfs_cmd_request.kdamond ==
- kdamonds->kdamonds_arr[i])
- return -EBUSY;
- }
-
damon_sysfs_kdamonds_rm_dirs(kdamonds);
if (!nr_kdamonds)
return 0;
@@ -2741,8 +2723,7 @@ static ssize_t nr_kdamonds_show(struct kobject *kobj,
static ssize_t nr_kdamonds_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
- struct damon_sysfs_kdamonds *kdamonds = container_of(kobj,
- struct damon_sysfs_kdamonds, kobj);
+ struct damon_sysfs_kdamonds *kdamonds;
int nr, err;
err = kstrtoint(buf, 0, &nr);
@@ -2751,6 +2732,8 @@ static ssize_t nr_kdamonds_store(struct kobject *kobj,
if (nr < 0)
return -EINVAL;
+ kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj);
+
if (!mutex_trylock(&damon_sysfs_lock))
return -EBUSY;
err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr);
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index d4f55f349100..bce37c487540 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -14,33 +14,19 @@
#include <kunit/test.h>
-static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
+static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas,
+ ssize_t nr_vmas)
{
- int i, j;
- unsigned long largest_gap, gap;
+ int i;
+ MA_STATE(mas, mt, 0, 0);
if (!nr_vmas)
return;
- for (i = 0; i < nr_vmas - 1; i++) {
- vmas[i].vm_next = &vmas[i + 1];
-
- vmas[i].vm_rb.rb_left = NULL;
- vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb;
-
- largest_gap = 0;
- for (j = i; j < nr_vmas; j++) {
- if (j == 0)
- continue;
- gap = vmas[j].vm_start - vmas[j - 1].vm_end;
- if (gap > largest_gap)
- largest_gap = gap;
- }
- vmas[i].rb_subtree_gap = largest_gap;
- }
- vmas[i].vm_next = NULL;
- vmas[i].vm_rb.rb_right = NULL;
- vmas[i].rb_subtree_gap = 0;
+ mas_lock(&mas);
+ for (i = 0; i < nr_vmas; i++)
+ vma_mas_store(&vmas[i], &mas);
+ mas_unlock(&mas);
}
/*
@@ -72,6 +58,7 @@ static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas)
*/
static void damon_test_three_regions_in_vmas(struct kunit *test)
{
+ static struct mm_struct mm;
struct damon_addr_range regions[3] = {0,};
/* 10-20-25, 200-210-220, 300-305, 307-330 */
struct vm_area_struct vmas[] = {
@@ -83,9 +70,10 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
(struct vm_area_struct) {.vm_start = 307, .vm_end = 330},
};
- __link_vmas(vmas, 6);
+ mt_init_flags(&mm.mm_mt, MM_MT_FLAGS);
+ __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas));
- __damon_va_three_regions(&vmas[0], regions);
+ __damon_va_three_regions(&mm, regions);
KUNIT_EXPECT_EQ(test, 10ul, regions[0].start);
KUNIT_EXPECT_EQ(test, 25ul, regions[0].end);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 3c7b9d6dca95..15f03df66db6 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t,
return -EINVAL;
orig_end = r->ar.end;
- sz_orig = r->ar.end - r->ar.start;
+ sz_orig = damon_sz_region(r);
sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);
if (!sz_piece)
@@ -113,37 +113,38 @@ static unsigned long sz_range(struct damon_addr_range *r)
*
* Returns 0 if success, or negative error code otherwise.
*/
-static int __damon_va_three_regions(struct vm_area_struct *vma,
+static int __damon_va_three_regions(struct mm_struct *mm,
struct damon_addr_range regions[3])
{
- struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0};
- struct vm_area_struct *last_vma = NULL;
- unsigned long start = 0;
- struct rb_root rbroot;
-
- /* Find two biggest gaps so that first_gap > second_gap > others */
- for (; vma; vma = vma->vm_next) {
- if (!last_vma) {
- start = vma->vm_start;
- goto next;
- }
+ struct damon_addr_range first_gap = {0}, second_gap = {0};
+ VMA_ITERATOR(vmi, mm, 0);
+ struct vm_area_struct *vma, *prev = NULL;
+ unsigned long start;
- if (vma->rb_subtree_gap <= sz_range(&second_gap)) {
- rbroot.rb_node = &vma->vm_rb;
- vma = rb_entry(rb_last(&rbroot),
- struct vm_area_struct, vm_rb);
+ /*
+ * Find the two biggest gaps so that first_gap > second_gap > others.
+ * If this is too slow, it can be optimised to examine the maple
+ * tree gaps.
+ */
+ for_each_vma(vmi, vma) {
+ unsigned long gap;
+
+ if (!prev) {
+ start = vma->vm_start;
goto next;
}
-
- gap.start = last_vma->vm_end;
- gap.end = vma->vm_start;
- if (sz_range(&gap) > sz_range(&second_gap)) {
- swap(gap, second_gap);
- if (sz_range(&second_gap) > sz_range(&first_gap))
- swap(second_gap, first_gap);
+ gap = vma->vm_start - prev->vm_end;
+
+ if (gap > sz_range(&first_gap)) {
+ second_gap = first_gap;
+ first_gap.start = prev->vm_end;
+ first_gap.end = vma->vm_start;
+ } else if (gap > sz_range(&second_gap)) {
+ second_gap.start = prev->vm_end;
+ second_gap.end = vma->vm_start;
}
next:
- last_vma = vma;
+ prev = vma;
}
if (!sz_range(&second_gap) || !sz_range(&first_gap))
@@ -159,7 +160,7 @@ next:
regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION);
regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION);
regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION);
- regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION);
+ regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION);
return 0;
}
@@ -180,7 +181,7 @@ static int damon_va_three_regions(struct damon_target *t,
return -EINVAL;
mmap_read_lock(mm);
- rc = __damon_va_three_regions(mm->mmap, regions);
+ rc = __damon_va_three_regions(mm, regions);
mmap_read_unlock(mm);
mmput(mm);
@@ -250,8 +251,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
for (i = 0; i < 3; i++)
sz += regions[i].end - regions[i].start;
- if (ctx->min_nr_regions)
- sz /= ctx->min_nr_regions;
+ if (ctx->attrs.min_nr_regions)
+ sz /= ctx->attrs.min_nr_regions;
if (sz < DAMON_MIN_REGION)
sz = DAMON_MIN_REGION;
@@ -302,9 +303,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd)) {
ptl = pmd_lock(walk->mm, pmd);
- if (pmd_huge(*pmd)) {
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (pmd_trans_huge(*pmd)) {
damon_pmdp_mkold(pmd, walk->mm, addr);
spin_unlock(ptl);
return 0;
@@ -391,8 +397,8 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
* Functions for the access checking of the regions
*/
-static void __damon_va_prepare_access_check(struct damon_ctx *ctx,
- struct mm_struct *mm, struct damon_region *r)
+static void __damon_va_prepare_access_check(struct mm_struct *mm,
+ struct damon_region *r)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
@@ -410,7 +416,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
if (!mm)
continue;
damon_for_each_region(r, t)
- __damon_va_prepare_access_check(ctx, mm, r);
+ __damon_va_prepare_access_check(mm, r);
mmput(mm);
}
}
@@ -429,9 +435,14 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
struct damon_young_walk_private *priv = walk->private;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (pmd_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd)) {
ptl = pmd_lock(walk->mm, pmd);
- if (!pmd_huge(*pmd)) {
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (!pmd_trans_huge(*pmd)) {
spin_unlock(ptl);
goto regular_page;
}
@@ -532,16 +543,15 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
* mm 'mm_struct' for the given virtual address space
* r the region to be checked
*/
-static void __damon_va_check_access(struct damon_ctx *ctx,
- struct mm_struct *mm, struct damon_region *r)
+static void __damon_va_check_access(struct mm_struct *mm,
+ struct damon_region *r, bool same_target)
{
- static struct mm_struct *last_mm;
static unsigned long last_addr;
static unsigned long last_page_sz = PAGE_SIZE;
static bool last_accessed;
/* If the region is in the last checked page, reuse the result */
- if (mm == last_mm && (ALIGN_DOWN(last_addr, last_page_sz) ==
+ if (same_target && (ALIGN_DOWN(last_addr, last_page_sz) ==
ALIGN_DOWN(r->sampling_addr, last_page_sz))) {
if (last_accessed)
r->nr_accesses++;
@@ -552,7 +562,6 @@ static void __damon_va_check_access(struct damon_ctx *ctx,
if (last_accessed)
r->nr_accesses++;
- last_mm = mm;
last_addr = r->sampling_addr;
}
@@ -562,14 +571,17 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
struct mm_struct *mm;
struct damon_region *r;
unsigned int max_nr_accesses = 0;
+ bool same_target;
damon_for_each_target(t, ctx) {
mm = damon_get_mm(t);
if (!mm)
continue;
+ same_target = false;
damon_for_each_region(r, t) {
- __damon_va_check_access(ctx, mm, r);
+ __damon_va_check_access(mm, r, same_target);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
+ same_target = true;
}
mmput(mm);
}
@@ -581,9 +593,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
* Functions for the target validity check and cleanup
*/
-static bool damon_va_target_valid(void *target)
+static bool damon_va_target_valid(struct damon_target *t)
{
- struct damon_target *t = target;
struct task_struct *task;
task = damon_get_task_struct(t);
@@ -607,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target,
{
struct mm_struct *mm;
unsigned long start = PAGE_ALIGN(r->ar.start);
- unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
+ unsigned long len = PAGE_ALIGN(damon_sz_region(r));
unsigned long applied;
mm = damon_get_mm(target);
@@ -646,6 +657,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
case DAMOS_STAT:
return 0;
default:
+ /*
+ * DAMOS actions that are not yet supported by 'vaddr'.
+ */
return 0;
}
@@ -659,7 +673,7 @@ static int damon_va_scheme_score(struct damon_ctx *context,
switch (scheme->action) {
case DAMOS_PAGEOUT:
- return damon_pageout_score(context, r, scheme);
+ return damon_cold_score(context, r, scheme);
default:
break;
}
diff --git a/mm/debug.c b/mm/debug.c
index bef329bf28f0..0fd15ba70d16 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -139,13 +139,11 @@ EXPORT_SYMBOL(dump_page);
void dump_vma(const struct vm_area_struct *vma)
{
- pr_emerg("vma %px start %px end %px\n"
- "next %px prev %px mm %px\n"
+ pr_emerg("vma %px start %px end %px mm %px\n"
"prot %lx anon_vma %px vm_ops %px\n"
"pgoff %lx file %px private_data %px\n"
"flags: %#lx(%pGv)\n",
- vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
- vma->vm_prev, vma->vm_mm,
+ vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
vma->vm_file, vma->vm_private_data,
@@ -155,11 +153,11 @@ EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
- pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n"
+ pr_emerg("mm %px task_size %lu\n"
#ifdef CONFIG_MMU
"get_unmapped_area %px\n"
#endif
- "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
+ "mmap_base %lu mmap_legacy_base %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
@@ -183,11 +181,11 @@ void dump_mm(const struct mm_struct *mm)
"tlb_flush_pending %d\n"
"def_flags: %#lx(%pGv)\n",
- mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size,
+ mm, mm->task_size,
#ifdef CONFIG_MMU
mm->get_unmapped_area,
#endif
- mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
+ mm->mmap_base, mm->mmap_legacy_base,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
mm_pgtables_bytes(mm),
diff --git a/mm/filemap.c b/mm/filemap.c
index 15800334147b..08341616ae7a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -632,22 +632,23 @@ bool filemap_range_has_writeback(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
pgoff_t max = end_byte >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
if (end_byte < start_byte)
return false;
rcu_read_lock();
- xas_for_each(&xas, page, max) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, max) {
+ if (xas_retry(&xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
- if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
+ if (folio_test_dirty(folio) || folio_test_locked(folio) ||
+ folio_test_writeback(folio))
break;
}
rcu_read_unlock();
- return page != NULL;
+ return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
@@ -1221,15 +1222,12 @@ static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
- bool delayacct = false;
unsigned long pflags;
+ bool in_thrashing;
if (bit_nr == PG_locked &&
!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
- if (!folio_test_swapbacked(folio)) {
- delayacct_thrashing_start();
- delayacct = true;
- }
+ delayacct_thrashing_start(&in_thrashing);
psi_memstall_enter(&pflags);
thrashing = true;
}
@@ -1329,8 +1327,7 @@ repeat:
finish_wait(q, wait);
if (thrashing) {
- if (delayacct)
- delayacct_thrashing_end();
+ delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
}
@@ -1378,17 +1375,14 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
bool thrashing = false;
- bool delayacct = false;
unsigned long pflags;
+ bool in_thrashing;
wait_queue_head_t *q;
struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
q = folio_waitqueue(folio);
if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
- if (!folio_test_swapbacked(folio)) {
- delayacct_thrashing_start();
- delayacct = true;
- }
+ delayacct_thrashing_start(&in_thrashing);
psi_memstall_enter(&pflags);
thrashing = true;
}
@@ -1435,8 +1429,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
finish_wait(q, wait);
if (thrashing) {
- if (delayacct)
- delayacct_thrashing_end();
+ delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
}
}
@@ -1467,7 +1460,7 @@ EXPORT_SYMBOL(folio_wait_bit_killable);
*
* Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
*/
-int folio_put_wait_locked(struct folio *folio, int state)
+static int folio_put_wait_locked(struct folio *folio, int state)
{
return folio_wait_bit_common(folio, PG_locked, state, DROP);
}
@@ -1633,24 +1626,26 @@ EXPORT_SYMBOL(folio_end_writeback);
*/
void page_endio(struct page *page, bool is_write, int err)
{
+ struct folio *folio = page_folio(page);
+
if (!is_write) {
if (!err) {
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
} else {
- ClearPageUptodate(page);
- SetPageError(page);
+ folio_clear_uptodate(folio);
+ folio_set_error(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
} else {
if (err) {
struct address_space *mapping;
- SetPageError(page);
- mapping = page_mapping(page);
+ folio_set_error(folio);
+ mapping = folio_mapping(folio);
if (mapping)
mapping_set_error(mapping, err);
}
- end_page_writeback(page);
+ folio_end_writeback(folio);
}
}
EXPORT_SYMBOL_GPL(page_endio);
@@ -2195,30 +2190,31 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
}
/**
- * find_get_pages_contig - gang contiguous pagecache lookup
+ * filemap_get_folios_contig - Get a batch of contiguous folios
* @mapping: The address_space to search
- * @index: The starting page index
- * @nr_pages: The maximum number of pages
- * @pages: Where the resulting pages are placed
+ * @start: The starting page index
+ * @end: The final page index (inclusive)
+ * @fbatch: The batch to fill
*
- * find_get_pages_contig() works exactly like find_get_pages_range(),
- * except that the returned number of pages are guaranteed to be
- * contiguous.
+ * filemap_get_folios_contig() works exactly like filemap_get_folios(),
+ * except the returned folios are guaranteed to be contiguous. This may
+ * not return all contiguous folios if the batch gets filled up.
*
- * Return: the number of pages which were found.
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
*/
-unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
- unsigned int nr_pages, struct page **pages)
+
+unsigned filemap_get_folios_contig(struct address_space *mapping,
+ pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
{
- XA_STATE(xas, &mapping->i_pages, index);
+ XA_STATE(xas, &mapping->i_pages, *start);
+ unsigned long nr;
struct folio *folio;
- unsigned int ret = 0;
-
- if (unlikely(!nr_pages))
- return 0;
rcu_read_lock();
- for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
+
+ for (folio = xas_load(&xas); folio && xas.xa_index <= end;
+ folio = xas_next(&xas)) {
if (xas_retry(&xas, folio))
continue;
/*
@@ -2226,33 +2222,45 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
* No current caller is looking for DAX entries.
*/
if (xa_is_value(folio))
- break;
+ goto update_start;
if (!folio_try_get_rcu(folio))
goto retry;
if (unlikely(folio != xas_reload(&xas)))
- goto put_page;
+ goto put_folio;
-again:
- pages[ret] = folio_file_page(folio, xas.xa_index);
- if (++ret == nr_pages)
- break;
- if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) {
- xas.xa_index++;
- folio_ref_inc(folio);
- goto again;
+ if (!folio_batch_add(fbatch, folio)) {
+ nr = folio_nr_pages(folio);
+
+ if (folio_test_hugetlb(folio))
+ nr = 1;
+ *start = folio->index + nr;
+ goto out;
}
continue;
-put_page:
+put_folio:
folio_put(folio);
+
retry:
xas_reset(&xas);
}
+
+update_start:
+ nr = folio_batch_count(fbatch);
+
+ if (nr) {
+ folio = fbatch->folios[nr - 1];
+ if (folio_test_hugetlb(folio))
+ *start = folio->index + 1;
+ else
+ *start = folio->index + folio_nr_pages(folio);
+ }
+out:
rcu_read_unlock();
- return ret;
+ return folio_batch_count(fbatch);
}
-EXPORT_SYMBOL(find_get_pages_contig);
+EXPORT_SYMBOL(filemap_get_folios_contig);
/**
* find_get_pages_range_tag - Find and return head pages matching @tag.
@@ -2382,6 +2390,8 @@ retry:
static int filemap_read_folio(struct file *file, filler_t filler,
struct folio *folio)
{
+ bool workingset = folio_test_workingset(folio);
+ unsigned long pflags;
int error;
/*
@@ -2390,8 +2400,13 @@ static int filemap_read_folio(struct file *file, filler_t filler,
* fails.
*/
folio_clear_error(folio);
+
/* Start the actual read. The read will unlock the page. */
+ if (unlikely(workingset))
+ psi_memstall_enter(&pflags);
error = filler(file, folio);
+ if (unlikely(workingset))
+ psi_memstall_leave(&pflags);
if (error)
return error;
@@ -3712,7 +3727,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
- void *fsdata;
+ void *fsdata = NULL;
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 458618c7302c..e1e23b4947d7 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -88,6 +88,12 @@ void lru_cache_add(struct page *page)
}
EXPORT_SYMBOL(lru_cache_add);
+void lru_cache_add_inactive_or_unevictable(struct page *page,
+ struct vm_area_struct *vma)
+{
+ folio_add_lru_vma(page_folio(page), vma);
+}
+
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp)
{
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1a97610308cb..279e55b4ed87 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -125,6 +125,9 @@ void frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
+
+ if (!frontswap_enabled())
+ return;
frontswap_ops->init(type);
}
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..fe195d47de74 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -158,6 +158,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
else
folio_ref_add(folio,
refs * (GUP_PIN_COUNTING_BIAS - 1));
+ /*
+ * Adjust the pincount before re-checking the PTE for changes.
+ * This is essentially a smp_mb() and is paired with a memory
+ * barrier in page_try_share_anon_rmap().
+ */
+ smp_mb__after_atomic();
+
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
return folio;
@@ -530,6 +537,18 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
return ERR_PTR(-EINVAL);
+
+ /*
+ * Considering PTE level hugetlb, like continuous-PTE hugetlb on
+ * ARM64 architecture.
+ */
+ if (is_vm_hugetlb_page(vma)) {
+ page = follow_huge_pmd_pte(vma, address, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
+ }
+
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
@@ -554,7 +573,7 @@ retry:
migration_entry_wait(mm, pmd, address);
goto retry;
}
- if ((flags & FOLL_NUMA) && pte_protnone(pte))
+ if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
goto no_page;
page = vm_normal_page(vma, address, pte);
@@ -662,7 +681,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
if (pmd_none(pmdval))
return no_page_table(vma, flags);
if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
- page = follow_huge_pmd(mm, address, pmd, flags);
+ page = follow_huge_pmd_pte(vma, address, flags);
if (page)
return page;
return no_page_table(vma, flags);
@@ -707,7 +726,7 @@ retry:
if (likely(!pmd_trans_huge(pmdval)))
return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
- if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
+ if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
return no_page_table(vma, flags);
retry_locked:
@@ -1153,14 +1172,6 @@ static long __get_user_pages(struct mm_struct *mm,
VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
- /*
- * If FOLL_FORCE is set then do not force a full fault as the hinting
- * fault information is unrelated to the reference behaviour of a task
- * using the address space
- */
- if (!(gup_flags & FOLL_FORCE))
- gup_flags |= FOLL_NUMA;
-
do {
struct page *page;
unsigned int foll_flags = gup_flags;
@@ -1667,10 +1678,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
if (!locked) {
locked = 1;
mmap_read_lock(mm);
- vma = find_vma(mm, nstart);
+ vma = find_vma_intersection(mm, nstart, end);
} else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
+ vma = find_vma_intersection(mm, vma->vm_end, end);
+
+ if (!vma)
break;
/*
* Set [nstart; nend) to intersection of desired address
@@ -1927,20 +1939,16 @@ struct page *get_dump_page(unsigned long addr)
#ifdef CONFIG_MIGRATION
/*
- * Check whether all pages are pinnable, if so return number of pages. If some
- * pages are not pinnable, migrate them, and unpin all pages. Return zero if
- * pages were migrated, or if some pages were not successfully isolated.
- * Return negative error if migration fails.
+ * Returns the number of collected pages. Return value is always >= 0.
*/
-static long check_and_migrate_movable_pages(unsigned long nr_pages,
- struct page **pages,
- unsigned int gup_flags)
+static unsigned long collect_longterm_unpinnable_pages(
+ struct list_head *movable_page_list,
+ unsigned long nr_pages,
+ struct page **pages)
{
- unsigned long isolation_error_count = 0, i;
+ unsigned long i, collected = 0;
struct folio *prev_folio = NULL;
- LIST_HEAD(movable_page_list);
- bool drain_allow = true, coherent_pages = false;
- int ret = 0;
+ bool drain_allow = true;
for (i = 0; i < nr_pages; i++) {
struct folio *folio = page_folio(pages[i]);
@@ -1949,45 +1957,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
continue;
prev_folio = folio;
- /*
- * Device coherent pages are managed by a driver and should not
- * be pinned indefinitely as it prevents the driver moving the
- * page. So when trying to pin with FOLL_LONGTERM instead try
- * to migrate the page out of device memory.
- */
- if (folio_is_device_coherent(folio)) {
- /*
- * We always want a new GUP lookup with device coherent
- * pages.
- */
- pages[i] = 0;
- coherent_pages = true;
-
- /*
- * Migration will fail if the page is pinned, so convert
- * the pin on the source page to a normal reference.
- */
- if (gup_flags & FOLL_PIN) {
- get_page(&folio->page);
- unpin_user_page(&folio->page);
- }
+ if (folio_is_longterm_pinnable(folio))
+ continue;
- ret = migrate_device_coherent_page(&folio->page);
- if (ret)
- goto unpin_pages;
+ collected++;
+ if (folio_is_device_coherent(folio))
continue;
- }
- if (folio_is_longterm_pinnable(folio))
- continue;
- /*
- * Try to move out any movable page before pinning the range.
- */
if (folio_test_hugetlb(folio)) {
- if (isolate_hugetlb(&folio->page,
- &movable_page_list))
- isolation_error_count++;
+ isolate_hugetlb(&folio->page, movable_page_list);
continue;
}
@@ -1996,63 +1975,124 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
drain_allow = false;
}
- if (folio_isolate_lru(folio)) {
- isolation_error_count++;
+ if (!folio_isolate_lru(folio))
continue;
- }
- list_add_tail(&folio->lru, &movable_page_list);
+
+ list_add_tail(&folio->lru, movable_page_list);
node_stat_mod_folio(folio,
NR_ISOLATED_ANON + folio_is_file_lru(folio),
folio_nr_pages(folio));
}
- if (!list_empty(&movable_page_list) || isolation_error_count ||
- coherent_pages)
- goto unpin_pages;
+ return collected;
+}
- /*
- * If list is empty, and no isolation errors, means that all pages are
- * in the correct zone.
- */
- return nr_pages;
+/*
+ * Unpins all pages and migrates device coherent pages and movable_page_list.
+ * Returns -EAGAIN if all pages were successfully migrated or -errno for failure
+ * (or partial success).
+ */
+static int migrate_longterm_unpinnable_pages(
+ struct list_head *movable_page_list,
+ unsigned long nr_pages,
+ struct page **pages)
+{
+ int ret;
+ unsigned long i;
-unpin_pages:
- /*
- * pages[i] might be NULL if any device coherent pages were found.
- */
for (i = 0; i < nr_pages; i++) {
- if (!pages[i])
+ struct folio *folio = page_folio(pages[i]);
+
+ if (folio_is_device_coherent(folio)) {
+ /*
+ * Migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
+ */
+ pages[i] = NULL;
+ folio_get(folio);
+ gup_put_folio(folio, 1, FOLL_PIN);
+
+ if (migrate_device_coherent_page(&folio->page)) {
+ ret = -EBUSY;
+ goto err;
+ }
+
continue;
+ }
- if (gup_flags & FOLL_PIN)
- unpin_user_page(pages[i]);
- else
- put_page(pages[i]);
+ /*
+ * We can't migrate pages with unexpected references, so drop
+ * the reference obtained by __get_user_pages_locked().
+ * Migrating pages have been added to movable_page_list after
+ * calling folio_isolate_lru() which takes a reference so the
+ * page won't be freed if it's migrating.
+ */
+ unpin_user_page(pages[i]);
+ pages[i] = NULL;
}
- if (!list_empty(&movable_page_list)) {
+ if (!list_empty(movable_page_list)) {
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
.gfp_mask = GFP_USER | __GFP_NOWARN,
};
- ret = migrate_pages(&movable_page_list, alloc_migration_target,
- NULL, (unsigned long)&mtc, MIGRATE_SYNC,
- MR_LONGTERM_PIN, NULL);
- if (ret > 0) /* number of pages not migrated */
+ if (migrate_pages(movable_page_list, alloc_migration_target,
+ NULL, (unsigned long)&mtc, MIGRATE_SYNC,
+ MR_LONGTERM_PIN, NULL)) {
ret = -ENOMEM;
+ goto err;
+ }
}
- if (ret && !list_empty(&movable_page_list))
- putback_movable_pages(&movable_page_list);
+ putback_movable_pages(movable_page_list);
+
+ return -EAGAIN;
+
+err:
+ for (i = 0; i < nr_pages; i++)
+ if (pages[i])
+ unpin_user_page(pages[i]);
+ putback_movable_pages(movable_page_list);
+
return ret;
}
+
+/*
+ * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
+ * pages in the range are required to be pinned via FOLL_PIN, before calling
+ * this routine.
+ *
+ * If any pages in the range are not allowed to be pinned, then this routine
+ * will migrate those pages away, unpin all the pages in the range and return
+ * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
+ * call this routine again.
+ *
+ * If an error other than -EAGAIN occurs, this indicates a migration failure.
+ * The caller should give up, and propagate the error back up the call stack.
+ *
+ * If everything is OK and all pages in the range are allowed to be pinned, then
+ * this routine leaves all pages pinned and returns zero for success.
+ */
+static long check_and_migrate_movable_pages(unsigned long nr_pages,
+ struct page **pages)
+{
+ unsigned long collected;
+ LIST_HEAD(movable_page_list);
+
+ collected = collect_longterm_unpinnable_pages(&movable_page_list,
+ nr_pages, pages);
+ if (!collected)
+ return 0;
+
+ return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
+ pages);
+}
#else
static long check_and_migrate_movable_pages(unsigned long nr_pages,
- struct page **pages,
- unsigned int gup_flags)
+ struct page **pages)
{
- return nr_pages;
+ return 0;
}
#endif /* CONFIG_MIGRATION */
@@ -2068,22 +2108,36 @@ static long __gup_longterm_locked(struct mm_struct *mm,
unsigned int gup_flags)
{
unsigned int flags;
- long rc;
+ long rc, nr_pinned_pages;
if (!(gup_flags & FOLL_LONGTERM))
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
NULL, gup_flags);
+
+ /*
+ * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM
+ * implies FOLL_PIN (although the reverse is not true). Therefore it is
+ * correct to unconditionally call check_and_migrate_movable_pages()
+ * which assumes pages have been pinned via FOLL_PIN.
+ *
+ * Enforce the above reasoning by asserting that FOLL_PIN is set.
+ */
+ if (WARN_ON(!(gup_flags & FOLL_PIN)))
+ return -EINVAL;
flags = memalloc_pin_save();
do {
- rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, gup_flags);
- if (rc <= 0)
+ nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
+ pages, vmas, NULL,
+ gup_flags);
+ if (nr_pinned_pages <= 0) {
+ rc = nr_pinned_pages;
break;
- rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
- } while (!rc);
+ }
+ rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
+ } while (rc == -EAGAIN);
memalloc_pin_restore(flags);
- return rc;
+ return rc ? rc : nr_pinned_pages;
}
static bool is_valid_gup_flags(unsigned int gup_flags)
@@ -2345,8 +2399,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2358,11 +2432,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
struct page *page;
struct folio *folio;
- /*
- * Similar to the PMD case below, NUMA hinting must take slow
- * path using the pte_protnone check.
- */
- if (pte_protnone(pte))
+ if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
goto pte_unmap;
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
@@ -2392,7 +2462,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2510,9 @@ pte_unmap:
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2744,12 +2816,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
pmd_devmap(pmd))) {
- /*
- * NUMA hinting faults need to be handled in the GUP
- * slowpath for accounting purposes and so that they
- * can be serialised against THP migration.
- */
- if (pmd_protnone(pmd))
+ if (pmd_protnone(pmd) &&
+ !gup_can_follow_protnone(flags))
return 0;
if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
@@ -2764,7 +2832,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/highmem.c b/mm/highmem.c
index c707d7202d5f..db251e77f98f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,6 +30,17 @@
#include <asm/tlbflush.h>
#include <linux/vmalloc.h>
+#ifdef CONFIG_KMAP_LOCAL
+static inline int kmap_local_calc_idx(int idx)
+{
+ return idx + KM_MAX_IDX * smp_processor_id();
+}
+
+#ifndef arch_kmap_local_map_idx
+#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
+#endif
+#endif /* CONFIG_KMAP_LOCAL */
+
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -142,12 +153,29 @@ pte_t *pkmap_page_table;
struct page *__kmap_to_page(void *vaddr)
{
+ unsigned long base = (unsigned long) vaddr & PAGE_MASK;
+ struct kmap_ctrl *kctrl = &current->kmap_ctrl;
unsigned long addr = (unsigned long)vaddr;
+ int i;
+
+ /* kmap() mappings */
+ if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) &&
+ addr < PKMAP_ADDR(LAST_PKMAP)))
+ return pte_page(pkmap_page_table[PKMAP_NR(addr)]);
- if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
- int i = PKMAP_NR(addr);
+ /* kmap_local_page() mappings */
+ if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) &&
+ base < __fix_to_virt(FIX_KMAP_BEGIN))) {
+ for (i = 0; i < kctrl->idx; i++) {
+ unsigned long base_addr;
+ int idx;
- return pte_page(pkmap_page_table[i]);
+ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+ base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+
+ if (base_addr == base)
+ return pte_page(kctrl->pteval[i]);
+ }
}
return virt_to_page(vaddr);
@@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void)
# define arch_kmap_local_post_unmap(vaddr) do { } while (0)
#endif
-#ifndef arch_kmap_local_map_idx
-#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
-#endif
-
#ifndef arch_kmap_local_unmap_idx
#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx)
#endif
@@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr)
return false;
}
-static inline int kmap_local_calc_idx(int idx)
-{
- return idx + KM_MAX_IDX * smp_processor_id();
-}
-
static pte_t *__kmap_pte;
static pte_t *kmap_get_pte(unsigned long vaddr, int idx)
diff --git a/mm/hmm.c b/mm/hmm.c
index f2aa63b94d9b..3850fb625dda 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -253,7 +253,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = swp_offset(entry) | cpu_flags;
+ *hmm_pfn = swp_offset_pfn(entry) | cpu_flags;
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..811d19b5c4f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -36,6 +36,7 @@
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
+#include <linux/memory-tiers.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -70,9 +71,8 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf)
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+ bool smaps, bool in_pf, bool enforce_sysfs)
{
if (!vma->vm_mm) /* vdso */
return false;
@@ -119,13 +119,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
* own flags.
*/
if (!in_pf && shmem_file(vma->vm_file))
- return shmem_huge_enabled(vma);
+ return shmem_huge_enabled(vma, !enforce_sysfs);
- if (!hugepage_flags_enabled())
- return false;
-
- /* THP settings require madvise. */
- if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+ /* Enforce sysfs THP requirements as necessary */
+ if (enforce_sysfs &&
+ (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
+ !hugepage_flags_always())))
return false;
/* Only regular file is valid */
@@ -164,7 +163,6 @@ retry:
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
return false;
}
- count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
preempt_enable();
@@ -176,6 +174,7 @@ retry:
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
return true;
}
@@ -772,8 +771,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- if (pgtable)
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
}
@@ -1307,6 +1305,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t orig_pmd = vmf->orig_pmd;
@@ -1328,46 +1327,48 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
}
page = pmd_page(orig_pmd);
+ folio = page_folio(page);
VM_BUG_ON_PAGE(!PageHead(page), page);
/* Early check when only holding the PT lock. */
if (PageAnonExclusive(page))
goto reuse;
- if (!trylock_page(page)) {
- get_page(page);
+ if (!folio_trylock(folio)) {
+ folio_get(folio);
spin_unlock(vmf->ptl);
- lock_page(page);
+ folio_lock(folio);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
}
- put_page(page);
+ folio_put(folio);
}
/* Recheck after temporarily dropping the PT lock. */
if (PageAnonExclusive(page)) {
- unlock_page(page);
+ folio_unlock(folio);
goto reuse;
}
/*
- * See do_wp_page(): we can only reuse the page exclusively if there are
- * no additional references. Note that we always drain the LRU
- * pagevecs immediately after adding a THP.
+ * See do_wp_page(): we can only reuse the folio exclusively if
+ * there are no additional references. Note that we always drain
+ * the LRU pagevecs immediately after adding a THP.
*/
- if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
+ if (folio_ref_count(folio) >
+ 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
goto unlock_fallback;
- if (PageSwapCache(page))
- try_to_free_swap(page);
- if (page_count(page) == 1) {
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_ref_count(folio) == 1) {
pmd_t entry;
page_move_anon_rmap(page, vma);
- unlock_page(page);
+ folio_unlock(folio);
reuse:
if (unlikely(unshare)) {
spin_unlock(vmf->ptl);
@@ -1382,7 +1383,7 @@ reuse:
}
unlock_fallback:
- unlock_page(page);
+ folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
@@ -1449,7 +1450,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
return NULL;
if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
@@ -1479,7 +1480,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = NUMA_NO_NODE;
- int target_nid, last_cpupid = -1;
+ int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
bool migrated = false;
bool was_writable = pmd_savedwrite(oldpmd);
int flags = 0;
@@ -1500,7 +1501,12 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
flags |= TNF_NO_GROUP;
page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
+ /*
+ * For memory tiering mode, cpupid of slow memory page is used
+ * to record page access time. So use default value.
+ */
+ if (node_is_toptier(page_nid))
+ last_cpupid = page_cpupid_last(page);
target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
&flags);
@@ -1824,6 +1830,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (prot_numa) {
struct page *page;
+ bool toptier;
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -1836,13 +1843,18 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto unlock;
page = pmd_page(*pmd);
+ toptier = node_is_toptier(page_to_nid(page));
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- node_is_toptier(page_to_nid(page)))
+ toptier)
goto unlock;
+
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page, jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
@@ -2029,7 +2041,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
- bool anon_exclusive = false;
+ bool anon_exclusive = false, dirty = false;
unsigned long addr;
int i;
@@ -2113,13 +2125,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = is_writable_migration_entry(entry);
if (PageAnon(page))
anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = false;
+ young = is_migration_entry_young(entry);
+ dirty = is_migration_entry_dirty(entry);
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
page = pmd_page(old_pmd);
- if (pmd_dirty(old_pmd))
+ if (pmd_dirty(old_pmd)) {
+ dirty = true;
SetPageDirty(page);
+ }
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
@@ -2140,6 +2155,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*
* In case we cannot clear PageAnonExclusive(), split the PMD
* only and let try_to_migrate_one() fail later.
+ *
+ * See page_try_share_anon_rmap(): invalidate PMD first.
*/
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
@@ -2171,6 +2188,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
+ if (dirty)
+ swp_entry = make_migration_entry_dirty(swp_entry);
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
@@ -2185,6 +2206,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
+ /*
+ * NOTE: we don't do pte_mkdirty when dirty==true
+ * because it breaks sparc64 which can sigsegv
+ * random process. Need to revisit when we figure
+ * out what is special with sparc64.
+ */
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
@@ -2288,25 +2315,11 @@ out:
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
bool freeze, struct folio *folio)
{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
+ pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
+ if (!pmd)
return;
- p4d = p4d_offset(pgd, address);
- if (!p4d_present(*p4d))
- return;
-
- pud = pud_offset(p4d, address);
- if (!pud_present(*pud))
- return;
-
- pmd = pmd_offset(pud, address);
-
__split_huge_pmd(vma, pmd, address, freeze, folio);
}
@@ -2334,24 +2347,23 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
split_huge_pmd_if_needed(vma, end);
/*
- * If we're also updating the vma->vm_next->vm_start,
+ * If we're also updating the next vma vm_start,
* check if we need to split it.
*/
if (adjust_next > 0) {
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
unsigned long nstart = next->vm_start;
nstart += adjust_next;
split_huge_pmd_if_needed(next, nstart);
}
}
-static void unmap_page(struct page *page)
+static void unmap_folio(struct folio *folio)
{
- struct folio *folio = page_folio(page);
enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC;
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
/*
* Anon pages need migration entries to preserve them, but file
@@ -2368,7 +2380,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
{
int i = 0;
- /* If unmap_page() uses try_to_migrate() on file, remove this check */
+ /* If unmap_folio() uses try_to_migrate() on file, remove this check */
if (!folio_test_anon(folio))
return;
for (;;) {
@@ -2418,7 +2430,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* for example lock_page() which set PG_waiters.
*
* Note that for mapped sub-pages of an anonymous THP,
- * PG_anon_exclusive has been cleared in unmap_page() and is stored in
+ * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
* the migration entry instead from where remap_page() will restore it.
* We can still have PG_anon_exclusive set on effectively unmapped and
* unreferenced sub-pages of an anonymous THP: we can simply drop
@@ -2438,14 +2450,24 @@ static void __split_huge_page_tail(struct page *head, int tail,
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
page_tail);
page_tail->mapping = head->mapping;
page_tail->index = head->index + tail;
- page_tail->private = 0;
+
+ /*
+ * page->private should not be set in tail pages with the exception
+ * of swap cache pages that store the swp_entry_t in tail pages.
+ * Fix up and warn once if private is unexpectedly set.
+ */
+ if (!folio_test_swapcache(page_folio(head))) {
+ VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail);
+ page_tail->private = 0;
+ }
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
@@ -2611,27 +2633,26 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct folio *folio = page_folio(page);
- struct page *head = &folio->page;
- struct deferred_split *ds_queue = get_deferred_split_queue(head);
- XA_STATE(xas, &head->mapping->i_pages, head->index);
+ struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page);
+ XA_STATE(xas, &folio->mapping->i_pages, folio->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
pgoff_t end;
bool is_hzp;
- VM_BUG_ON_PAGE(!PageLocked(head), head);
- VM_BUG_ON_PAGE(!PageCompound(head), head);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- is_hzp = is_huge_zero_page(head);
- VM_WARN_ON_ONCE_PAGE(is_hzp, head);
+ is_hzp = is_huge_zero_page(&folio->page);
+ VM_WARN_ON_ONCE_FOLIO(is_hzp, folio);
if (is_hzp)
return -EBUSY;
- if (PageWriteback(head))
+ if (folio_test_writeback(folio))
return -EBUSY;
- if (PageAnon(head)) {
+ if (folio_test_anon(folio)) {
/*
* The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
@@ -2640,7 +2661,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* is taken to serialise against parallel split or collapse
* operations.
*/
- anon_vma = page_get_anon_vma(head);
+ anon_vma = folio_get_anon_vma(folio);
if (!anon_vma) {
ret = -EBUSY;
goto out;
@@ -2649,7 +2670,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
- mapping = head->mapping;
+ gfp_t gfp;
+
+ mapping = folio->mapping;
/* Truncated ? */
if (!mapping) {
@@ -2657,8 +2680,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
- xas_split_alloc(&xas, head, compound_order(head),
- mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+ gfp = current_gfp_context(mapping_gfp_mask(mapping) &
+ GFP_RECLAIM_MASK);
+
+ if (folio_test_private(folio) &&
+ !filemap_release_folio(folio, gfp)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ xas_split_alloc(&xas, folio, folio_order(folio), gfp);
if (xas_error(&xas)) {
ret = xas_error(&xas);
goto out;
@@ -2672,7 +2703,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
* which cannot be nested inside the page tree lock. So note
* end now: i_size itself may be changed at any moment, but
- * head page lock is good enough to serialize the trimming.
+ * folio lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (shmem_mapping(mapping))
@@ -2680,7 +2711,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
/*
- * Racy check if we can split the page, before unmap_page() will
+ * Racy check if we can split the page, before unmap_folio() will
* split PMDs
*/
if (!can_split_folio(folio, &extra_pins)) {
@@ -2688,38 +2719,38 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
- unmap_page(head);
+ unmap_folio(folio);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
if (mapping) {
/*
- * Check if the head page is present in page cache.
- * We assume all tail are present too, if head is there.
+ * Check if the folio is present in page cache.
+ * We assume all tail are present too, if folio is there.
*/
xas_lock(&xas);
xas_reset(&xas);
- if (xas_load(&xas) != head)
+ if (xas_load(&xas) != folio)
goto fail;
}
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- if (page_ref_freeze(head, 1 + extra_pins)) {
- if (!list_empty(page_deferred_list(head))) {
+ if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ if (!list_empty(page_deferred_list(&folio->page))) {
ds_queue->split_queue_len--;
- list_del(page_deferred_list(head));
+ list_del(page_deferred_list(&folio->page));
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
- int nr = thp_nr_pages(head);
+ int nr = folio_nr_pages(folio);
- xas_split(&xas, head, thp_order(head));
- if (PageSwapBacked(head)) {
- __mod_lruvec_page_state(head, NR_SHMEM_THPS,
+ xas_split(&xas, folio, folio_order(folio));
+ if (folio_test_swapbacked(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS,
-nr);
} else {
- __mod_lruvec_page_state(head, NR_FILE_THPS,
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS,
-nr);
filemap_nr_thps_dec(mapping);
}
@@ -2894,11 +2925,9 @@ static void split_huge_pages_all(void)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
int nr_pages;
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!get_page_unless_zero(page))
+ page = pfn_to_online_page(pfn);
+ if (!page || !get_page_unless_zero(page))
continue;
if (zone != page_zone(page))
@@ -2985,7 +3014,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
/* FOLL_DUMP to ignore special (like zero) pages */
page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
- if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+ if (IS_ERR_OR_NULL(page))
continue;
if (!is_transparent_hugepage(page))
@@ -3177,6 +3206,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ /* See page_try_share_anon_rmap(): invalidate PMD first. */
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pmd_at(mm, address, pvmw->pmd, pmdval);
@@ -3191,6 +3221,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
else
entry = make_readable_migration_entry(page_to_pfn(page));
+ if (pmd_young(pmdval))
+ entry = make_migration_entry_young(entry);
+ if (pmd_dirty(pmdval))
+ entry = make_migration_entry_dirty(entry);
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
@@ -3216,13 +3250,18 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
get_page(new);
- pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)));
+ pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+ if (!is_migration_entry_young(entry))
+ pmde = pmd_mkold(pmde);
+ /* NOTE: this may contain setting soft-dirty on some archs */
+ if (PageDirty(new) && is_migration_entry_dirty(entry))
+ pmde = pmd_mkdirty(pmde);
if (PageAnon(new)) {
rmap_t rmap_flags = RMAP_COMPOUND;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e070b8593b37..e48f8ef45b17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -33,6 +33,7 @@
#include <linux/migrate.h>
#include <linux/nospec.h>
#include <linux/delayacct.h>
+#include <linux/memory.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -90,6 +91,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -257,7 +261,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
- struct file_region *nrg = NULL;
+ struct file_region *nrg;
VM_BUG_ON(resv->region_cache_count <= 0);
@@ -339,7 +343,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
- struct file_region *nrg = NULL, *prg = NULL;
+ struct file_region *nrg, *prg;
prg = list_prev_entry(rg, link);
if (&prg->link != &resv->regions && prg->to == rg->from &&
@@ -456,14 +460,12 @@ static int allocate_file_region_entries(struct resv_map *resv,
int regions_needed)
__must_hold(&resv->lock)
{
- struct list_head allocated_regions;
+ LIST_HEAD(allocated_regions);
int to_allocate = 0, i = 0;
struct file_region *trg = NULL, *rg = NULL;
VM_BUG_ON(regions_needed < 0);
- INIT_LIST_HEAD(&allocated_regions);
-
/*
* Check for sufficient descriptors in the cache to accommodate
* the number of in progress add operations plus regions_needed.
@@ -860,7 +862,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
* faults in a MAP_PRIVATE mapping. Only the process that called mmap()
* is guaranteed to have their future faults succeed.
*
- * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * With the exception of hugetlb_dup_vma_private() which is called at fork(),
* the reserve counters are updated with the hugetlb_lock held. It is safe
* to reset the VMA at fork() time as it is not in use yet and there is no
* chance of the global counters getting corrupted as a result of the values.
@@ -1007,12 +1009,28 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
- if (!(vma->vm_flags & VM_MAYSHARE))
- vma->vm_private_data = (void *)0;
+ /*
+ * Clear vm_private_data
+ * - For shared mappings this is a per-vma semaphore that may be
+ * allocated in a subsequent call to hugetlb_vm_op_open.
+ * Before clearing, make sure pointer is not associated with vma
+ * as this will leak the structure. This is the case when called
+ * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
+ * been called to allocate a new structure.
+ * - For MAP_PRIVATE mappings, this is the reserve map which does
+ * not apply to children. Faults generated by the children are
+ * not guaranteed to succeed, even if read-only.
+ */
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock && vma_lock->vma != vma)
+ vma->vm_private_data = NULL;
+ } else
+ vma->vm_private_data = NULL;
}
/*
@@ -1043,7 +1061,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
kref_put(&reservations->refs, resv_map_release);
}
- reset_vma_resv_huge_pages(vma);
+ hugetlb_dup_vma_private(vma);
}
/* Returns true if the VMA has associated reserve pages */
@@ -1182,6 +1200,11 @@ retry_cpuset:
return NULL;
}
+static unsigned long available_huge_pages(struct hstate *h)
+{
+ return h->free_huge_pages - h->resv_huge_pages;
+}
+
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
@@ -1198,12 +1221,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
* have no page reserves. This check ensures that reservations are
* not "stolen". The child may still get SIGKILLed
*/
- if (!vma_has_reserves(vma, chg) &&
- h->free_huge_pages - h->resv_huge_pages == 0)
+ if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
goto err;
/* If reserves cannot be used, ensure enough pages are in the pool */
- if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+ if (avoid_reserve && !available_huge_pages(h))
goto err;
gfp_mask = htlb_alloc_mask(h);
@@ -1308,12 +1330,13 @@ static void __destroy_compound_gigantic_page(struct page *page,
{
int i;
int nr_pages = 1 << order;
- struct page *p = page + 1;
+ struct page *p;
atomic_set(compound_mapcount_ptr(page), 0);
atomic_set(compound_pincount_ptr(page), 0);
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ for (i = 1; i < nr_pages; i++) {
+ p = nth_page(page, i);
p->mapping = NULL;
clear_compound_head(p);
if (!demote)
@@ -1506,6 +1529,10 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
set_page_private(page, 0);
+ /*
+ * We have to set HPageVmemmapOptimized again as above
+ * set_page_private(page, 0) cleared it.
+ */
SetHPageVmemmapOptimized(page);
/*
@@ -1530,7 +1557,7 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
- struct page *subpage = page;
+ struct page *subpage;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
@@ -1561,8 +1588,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
if (unlikely(PageHWPoison(page)))
hugetlb_clear_page_hwpoison(page);
- for (i = 0; i < pages_per_huge_page(h);
- i++, subpage = mem_map_next(subpage, page, i)) {
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ subpage = nth_page(page, i);
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
@@ -1769,13 +1796,14 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
{
int i, j;
int nr_pages = 1 << order;
- struct page *p = page + 1;
+ struct page *p;
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
- __ClearPageReserved(page);
__SetPageHead(page);
- for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ for (i = 0; i < nr_pages; i++) {
+ p = nth_page(page, i);
+
/*
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
@@ -1814,22 +1842,26 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
} else {
VM_BUG_ON_PAGE(page_count(p), p);
}
- set_compound_head(p, page);
+ if (i != 0)
+ set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
atomic_set(compound_pincount_ptr(page), 0);
return true;
out_error:
- /* undo tail page modifications made above */
- p = page + 1;
- for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
- clear_compound_head(p);
+ /* undo page modifications made above */
+ for (j = 0; j < i; j++) {
+ p = nth_page(page, j);
+ if (j != 0)
+ clear_compound_head(p);
set_page_refcounted(p);
}
/* need to clear PG_reserved on remaining tail pages */
- for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+ for (; j < nr_pages; j++) {
+ p = nth_page(page, j);
__ClearPageReserved(p);
+ }
set_compound_order(page, 0);
#ifdef CONFIG_64BIT
page[1].compound_nr = 0;
@@ -1918,6 +1950,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
int order = huge_page_order(h);
struct page *page;
bool alloc_try_hard = true;
+ bool retry = true;
/*
* By default we always try hard to allocate the page with
@@ -1933,7 +1966,21 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
+retry:
page = __alloc_pages(gfp_mask, order, nid, nmask);
+
+ /* Freeze head page */
+ if (page && !page_ref_freeze(page, 1)) {
+ __free_pages(page, order);
+ if (retry) { /* retry once */
+ retry = false;
+ goto retry;
+ }
+ /* WOW! twice in a row. */
+ pr_warn("HugeTLB head page unexpected inflated ref count\n");
+ page = NULL;
+ }
+
if (page)
__count_vm_event(HTLB_BUDDY_PGALLOC);
else
@@ -1961,6 +2008,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
/*
* Common helper to allocate a fresh hugetlb page. All specific allocators
* should use this function to get new hugetlb pages
+ *
+ * Note that returned page is 'frozen': ref count of head page and all tail
+ * pages is zero.
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
@@ -2018,7 +2068,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
if (!page)
return 0;
- put_page(page); /* free it into the hugepage allocator */
+ free_huge_page(page); /* free it into the hugepage allocator */
return 1;
}
@@ -2087,7 +2137,7 @@ retry:
if (!page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
- if (h->free_huge_pages - h->resv_huge_pages == 0)
+ if (!available_huge_pages(h))
goto out;
/*
@@ -2175,10 +2225,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
* Allocates a fresh surplus page from the page allocator.
*/
static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask, bool zero_ref)
+ int nid, nodemask_t *nmask)
{
struct page *page = NULL;
- bool retry = false;
if (hstate_is_gigantic(h))
return NULL;
@@ -2188,7 +2237,6 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
goto out_unlock;
spin_unlock_irq(&hugetlb_lock);
-retry:
page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -2204,34 +2252,10 @@ retry:
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetHPageTemporary(page);
spin_unlock_irq(&hugetlb_lock);
- put_page(page);
+ free_huge_page(page);
return NULL;
}
- if (zero_ref) {
- /*
- * Caller requires a page with zero ref count.
- * We will drop ref count here. If someone else is holding
- * a ref, the page will be freed when they drop it. Abuse
- * temporary page flag to accomplish this.
- */
- SetHPageTemporary(page);
- if (!put_page_testzero(page)) {
- /*
- * Unexpected inflated ref count on freshly allocated
- * huge. Retry once.
- */
- pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
- spin_unlock_irq(&hugetlb_lock);
- if (retry)
- return NULL;
-
- retry = true;
- goto retry;
- }
- ClearHPageTemporary(page);
- }
-
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -2253,6 +2277,9 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
if (!page)
return NULL;
+ /* fresh huge pages are frozen */
+ set_page_refcounted(page);
+
/*
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
@@ -2280,14 +2307,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
gfp_t gfp = gfp_mask | __GFP_NOWARN;
gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+ page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
if (!page)
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
+ page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
return page;
}
@@ -2297,7 +2324,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
spin_lock_irq(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0) {
+ if (available_huge_pages(h)) {
struct page *page;
page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
@@ -2336,7 +2363,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
- struct list_head surplus_list;
+ LIST_HEAD(surplus_list);
struct page *page, *tmp;
int ret;
long i;
@@ -2351,14 +2378,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
}
allocated = 0;
- INIT_LIST_HEAD(&surplus_list);
ret = -ENOMEM;
retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
- NUMA_NO_NODE, NULL, true);
+ NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
break;
@@ -2720,7 +2746,6 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
int nid = page_to_nid(old_page);
- bool alloc_retry = false;
struct page *new_page;
int ret = 0;
@@ -2731,30 +2756,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
* the pool. This simplifies and let us do most of the processing
* under the lock.
*/
-alloc_retry:
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
- /*
- * If all goes well, this page will be directly added to the free
- * list in the pool. For this the ref count needs to be zero.
- * Attempt to drop now, and retry once if needed. It is VERY
- * unlikely there is another ref on the page.
- *
- * If someone else has a reference to the page, it will be freed
- * when they drop their ref. Abuse temporary page flag to accomplish
- * this. Retry once if there is an inflated ref count.
- */
- SetHPageTemporary(new_page);
- if (!put_page_testzero(new_page)) {
- if (alloc_retry)
- return -EBUSY;
-
- alloc_retry = true;
- goto alloc_retry;
- }
- ClearHPageTemporary(new_page);
-
__prep_new_huge_page(h, new_page);
retry:
@@ -2928,12 +2932,13 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
+ spin_lock_irq(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
- spin_lock_irq(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
@@ -3038,7 +3043,7 @@ static void __init gather_bootmem_prealloc(void)
if (prep_compound_gigantic_page(page, huge_page_order(h))) {
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
- put_page(page); /* add to the hugepage allocator */
+ free_huge_page(page); /* add to the hugepage allocator */
} else {
/* VERY unlikely inflated ref count on a tail page */
free_gigantic_page(page, huge_page_order(h));
@@ -3070,7 +3075,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
&node_states[N_MEMORY], NULL);
if (!page)
break;
- put_page(page); /* free it into the hugepage allocator */
+ free_huge_page(page); /* free it into the hugepage allocator */
}
cond_resched();
}
@@ -3420,6 +3425,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
{
int i, nid = page_to_nid(page);
struct hstate *target_hstate;
+ struct page *subpage;
int rc = 0;
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
@@ -3453,15 +3459,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
mutex_lock(&target_hstate->resize_lock);
for (i = 0; i < pages_per_huge_page(h);
i += pages_per_huge_page(target_hstate)) {
+ subpage = nth_page(page, i);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_page_for_demote(page + i,
+ prep_compound_gigantic_page_for_demote(subpage,
target_hstate->order);
else
- prep_compound_page(page + i, target_hstate->order);
- set_page_private(page + i, 0);
- set_page_refcounted(page + i);
- prep_new_huge_page(target_hstate, page + i, nid);
- put_page(page + i);
+ prep_compound_page(subpage, target_hstate->order);
+ set_page_private(subpage, 0);
+ prep_new_huge_page(target_hstate, subpage, nid);
+ free_huge_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
@@ -3472,7 +3478,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
* based on pool changes for the demoted page.
*/
h->max_huge_pages--;
- target_hstate->max_huge_pages += pages_per_huge_page(h);
+ target_hstate->max_huge_pages +=
+ pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
return rc;
}
@@ -3714,7 +3721,7 @@ static ssize_t demote_store(struct kobject *kobj,
unsigned long nr_available;
nodemask_t nodes_allowed, *n_mask;
struct hstate *h;
- int err = 0;
+ int err;
int nid;
err = kstrtoul(buf, 10, &nr_demote);
@@ -3765,8 +3772,7 @@ HSTATE_ATTR_WO(demote);
static ssize_t demote_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- int nid;
- struct hstate *h = kobj_to_hstate(kobj, &nid);
+ struct hstate *h = kobj_to_hstate(kobj, NULL);
unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
return sysfs_emit(buf, "%lukB\n", demote_size);
@@ -3779,7 +3785,6 @@ static ssize_t demote_size_store(struct kobject *kobj,
struct hstate *h, *demote_hstate;
unsigned long demote_size;
unsigned int demote_order;
- int nid;
demote_size = (unsigned long)memparse(buf, NULL);
@@ -3791,7 +3796,7 @@ static ssize_t demote_size_store(struct kobject *kobj,
return -EINVAL;
/* demote order must be smaller than hstate order */
- h = kobj_to_hstate(kobj, &nid);
+ h = kobj_to_hstate(kobj, NULL);
if (demote_order >= h->order)
return -EINVAL;
@@ -3845,35 +3850,26 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
if (retval) {
kobject_put(hstate_kobjs[hi]);
hstate_kobjs[hi] = NULL;
+ return retval;
}
if (h->demote_order) {
- if (sysfs_create_group(hstate_kobjs[hi],
- &hstate_demote_attr_group))
+ retval = sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group);
+ if (retval) {
pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+ kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ return retval;
+ }
}
- return retval;
-}
-
-static void __init hugetlb_sysfs_init(void)
-{
- struct hstate *h;
- int err;
-
- hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
- if (!hugepages_kobj)
- return;
-
- for_each_hstate(h) {
- err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
- hstate_kobjs, &hstate_attr_group);
- if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
- }
+ return 0;
}
#ifdef CONFIG_NUMA
+static bool hugetlb_sysfs_initialized __ro_after_init;
/*
* node_hstate/s - associate per node hstate attributes, via their kobjects,
@@ -3929,7 +3925,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
* Unregister hstate attributes from a single node device.
* No-op if no hstate attributes attached.
*/
-static void hugetlb_unregister_node(struct node *node)
+void hugetlb_unregister_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -3939,10 +3935,15 @@ static void hugetlb_unregister_node(struct node *node)
for_each_hstate(h) {
int idx = hstate_index(h);
- if (nhs->hstate_kobjs[idx]) {
- kobject_put(nhs->hstate_kobjs[idx]);
- nhs->hstate_kobjs[idx] = NULL;
- }
+ struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+ if (!hstate_kobj)
+ continue;
+ if (h->demote_order)
+ sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+ sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+ kobject_put(hstate_kobj);
+ nhs->hstate_kobjs[idx] = NULL;
}
kobject_put(nhs->hugepages_kobj);
@@ -3954,12 +3955,15 @@ static void hugetlb_unregister_node(struct node *node)
* Register hstate attributes for a single node device.
* No-op if attributes already registered.
*/
-static void hugetlb_register_node(struct node *node)
+void hugetlb_register_node(struct node *node)
{
struct hstate *h;
struct node_hstate *nhs = &node_hstates[node->dev.id];
int err;
+ if (!hugetlb_sysfs_initialized)
+ return;
+
if (nhs->hugepages_kobj)
return; /* already allocated */
@@ -3990,18 +3994,8 @@ static void __init hugetlb_register_all_nodes(void)
{
int nid;
- for_each_node_state(nid, N_MEMORY) {
- struct node *node = node_devices[nid];
- if (node->dev.id == nid)
- hugetlb_register_node(node);
- }
-
- /*
- * Let the node device driver know we're here so it can
- * [un]register hstate attributes on node hotplug.
- */
- register_hugetlbfs_with_node(hugetlb_register_node,
- hugetlb_unregister_node);
+ for_each_online_node(nid)
+ hugetlb_register_node(node_devices[nid]);
}
#else /* !CONFIG_NUMA */
@@ -4017,6 +4011,36 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_CMA
+static void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
+static void __init hugetlb_sysfs_init(void)
+{
+ struct hstate *h;
+ int err;
+
+ hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+ if (!hugepages_kobj)
+ return;
+
+ for_each_hstate(h) {
+ err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+ hstate_kobjs, &hstate_attr_group);
+ if (err)
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ }
+
+#ifdef CONFIG_NUMA
+ hugetlb_sysfs_initialized = true;
+#endif
+ hugetlb_register_all_nodes();
+}
+
static int __init hugetlb_init(void)
{
int i;
@@ -4071,7 +4095,6 @@ static int __init hugetlb_init(void)
report_hugepages();
hugetlb_sysfs_init();
- hugetlb_register_all_nodes();
hugetlb_cgroup_file_init();
#ifdef CONFIG_SMP
@@ -4116,7 +4139,7 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_alloc = first_memory_node;
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
- huge_page_size(h)/1024);
+ huge_page_size(h)/SZ_1K);
parsed_hstate = h;
}
@@ -4131,11 +4154,11 @@ static void __init hugepages_clear_pages_in_node(void)
if (!hugetlb_max_hstate) {
default_hstate_max_huge_pages = 0;
memset(default_hugepages_in_node, 0,
- MAX_NUMNODES * sizeof(unsigned int));
+ sizeof(default_hugepages_in_node));
} else {
parsed_hstate->max_huge_pages = 0;
memset(parsed_hstate->max_huge_pages_node, 0,
- MAX_NUMNODES * sizeof(unsigned int));
+ sizeof(parsed_hstate->max_huge_pages_node));
}
}
@@ -4330,18 +4353,34 @@ static int __init default_hugepagesz_setup(char *s)
}
__setup("default_hugepagesz=", default_hugepagesz_setup);
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+ struct mempolicy *mpol = get_task_policy(current);
+
+ /*
+ * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+ * (from policy_nodemask) specifically for hugetlb case
+ */
+ if (mpol->mode == MPOL_BIND &&
+ (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+ cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+ return &mpol->nodes;
+#endif
+ return NULL;
+}
+
static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
- nodemask_t *mpol_allowed;
+ nodemask_t *mbind_nodemask;
unsigned int *array = h->free_huge_pages_node;
gfp_t gfp_mask = htlb_alloc_mask(h);
- mpol_allowed = policy_nodemask_current(gfp_mask);
-
+ mbind_nodemask = policy_mbind_nodemask(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mpol_allowed || node_isset(node, *mpol_allowed))
+ if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
nr += array[node];
}
@@ -4570,6 +4609,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
/*
+ * HPAGE_RESV_OWNER indicates a private mapping.
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
@@ -4581,16 +4621,38 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
}
+
+ /*
+ * vma_lock structure for sharable mappings is vma specific.
+ * Clear old pointer (if copied via vm_area_dup) and allocate
+ * new structure. Before clearing, make sure vma_lock is not
+ * for this vma.
+ */
+ if (vma->vm_flags & VM_MAYSHARE) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (vma_lock) {
+ if (vma_lock->vma != vma) {
+ vma->vm_private_data = NULL;
+ hugetlb_vma_lock_alloc(vma);
+ } else
+ pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
+ } else
+ hugetlb_vma_lock_alloc(vma);
+ }
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
struct hstate *h = hstate_vma(vma);
- struct resv_map *resv = vma_resv_map(vma);
+ struct resv_map *resv;
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
long gbl_reserve;
+ hugetlb_vma_lock_free(vma);
+
+ resv = vma_resv_map(vma);
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -4721,14 +4783,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma)
{
- pte_t *src_pte, *dst_pte, entry, dst_entry;
+ pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
unsigned long addr;
bool cow = is_cow_mapping(src_vma->vm_flags);
struct hstate *h = hstate_vma(src_vma);
unsigned long sz = huge_page_size(h);
unsigned long npages = pages_per_huge_page(h);
- struct address_space *mapping = src_vma->vm_file->f_mapping;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
int ret = 0;
@@ -4742,12 +4803,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
raw_write_seqcount_begin(&src->write_protect_seq);
} else {
/*
- * For shared mappings i_mmap_rwsem must be held to call
- * huge_pte_alloc, otherwise the returned ptep could go
- * away if part of a shared pmd and another thread calls
- * huge_pmd_unshare.
+ * For shared mappings the vma lock must be held before
+ * calling huge_pte_offset in the src vma. Otherwise, the
+ * returned ptep could go away if part of a shared pmd and
+ * another thread calls huge_pmd_unshare.
*/
- i_mmap_lock_read(mapping);
+ hugetlb_vma_lock_read(src_vma);
}
last_addr_mask = hugetlb_mask_last_page(h);
@@ -4766,15 +4827,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
/*
* If the pagetables are shared don't copy or take references.
- * dst_pte == src_pte is the common case of src/dest sharing.
*
+ * dst_pte == src_pte is the common case of src/dest sharing.
* However, src could have 'unshared' and dst shares with
- * another vma. If dst_pte !none, this implies sharing.
- * Check here before taking page table lock, and once again
- * after taking the lock below.
+ * another vma. So page_count of ptep page is checked instead
+ * to reliably determine whether pte is shared.
*/
- dst_entry = huge_ptep_get(dst_pte);
- if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+ if (page_count(virt_to_page(dst_pte)) > 1) {
addr |= last_addr_mask;
continue;
}
@@ -4783,13 +4842,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
- dst_entry = huge_ptep_get(dst_pte);
again:
- if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+ if (huge_pte_none(entry)) {
/*
- * Skip if src entry none. Also, skip in the
- * unlikely case dst entry !none as this implies
- * sharing with another vma.
+ * Skip if src entry none.
*/
;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4868,7 +4924,7 @@ again:
restore_reserve_on_error(h, dst_vma, addr,
new);
put_page(new);
- /* dst_entry won't change as in child */
+ /* huge_ptep of dst_pte won't change as in child */
goto again;
}
hugetlb_install_page(dst_vma, dst_pte, addr, new);
@@ -4900,7 +4956,7 @@ again:
raw_write_seqcount_end(&src->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
} else {
- i_mmap_unlock_read(mapping);
+ hugetlb_vma_unlock_read(src_vma);
}
return ret;
@@ -4959,6 +5015,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
/* Prevent race with file truncation */
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -4990,6 +5047,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
+ hugetlb_vma_unlock_write(vma);
return len + old_addr - old_end;
}
@@ -5057,6 +5115,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
* unmapped and its refcount is dropped, so just clear pte here.
*/
if (unlikely(!pte_present(pte))) {
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/*
* If the pte was wr-protected by uffd-wp in any of the
* swap forms, meanwhile the caller does not want to
@@ -5068,6 +5127,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
else
+#endif
huge_pte_clear(mm, address, ptep, sz);
spin_unlock(ptl);
continue;
@@ -5096,11 +5156,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
/* Leave a uffd-wp pte marker if needed */
if (huge_pte_uffd_wp(pte) &&
!(zap_flags & ZAP_FLAG_DROP_MARKER))
set_huge_pte_at(mm, address, ptep,
make_pte_marker(PTE_MARKER_UFFD_WP));
+#endif
hugetlb_count_sub(pages_per_huge_page(h), mm);
page_remove_rmap(page, vma, true);
@@ -5137,19 +5199,22 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
unsigned long end, struct page *ref_page,
zap_flags_t zap_flags)
{
+ hugetlb_vma_lock_write(vma);
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
__unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
/*
- * Clear this flag so that x86's huge_pmd_share page_table_shareable
- * test will fail on a vma being torn down, and not grab a page table
- * on its way out. We're lucky that the flag has such an appropriate
- * name, and can in fact be safely cleared here. We could clear it
- * before the __unmap_hugepage_range above, but all that's necessary
- * is to clear it before releasing the i_mmap_rwsem. This works
- * because in the context this is called, the VMA is about to be
- * destroyed and the i_mmap_rwsem is held.
+ * Unlock and free the vma lock before releasing i_mmap_rwsem. When
+ * the vma_lock is freed, this makes the vma ineligible for pmd
+ * sharing. And, i_mmap_rwsem is required to set up pmd sharing.
+ * This is important as page tables for this unmapped range will
+ * be asynchrously deleted. If the page tables are shared, there
+ * will be issues when accessed by someone else.
*/
- vma->vm_flags &= ~VM_MAYSHARE;
+ __hugetlb_vma_unlock_write_free(vma);
+
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
@@ -5314,11 +5379,10 @@ retry_avoidcopy:
u32 hash;
put_page(old_page);
- BUG_ON(huge_pte_none(pte));
/*
- * Drop hugetlb_fault_mutex and i_mmap_rwsem before
- * unmapping. unmapping needs to hold i_mmap_rwsem
- * in write mode. Dropping i_mmap_rwsem in read mode
+ * Drop hugetlb_fault_mutex and vma_lock before
+ * unmapping. unmapping needs to hold vma_lock
+ * in write mode. Dropping vma_lock in read mode
* here is OK as COW mappings do not interact with
* PMD sharing.
*
@@ -5326,13 +5390,13 @@ retry_avoidcopy:
*/
idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(mapping, idx);
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
unmap_ref_private(mm, vma, old_page, haddr);
- i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(vma);
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
@@ -5406,19 +5470,6 @@ out_release_old:
return ret;
}
-/* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address)
-{
- struct address_space *mapping;
- pgoff_t idx;
-
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
-
- return find_lock_page(mapping, idx);
-}
-
/*
* Return whether there is a pagecache page to back given address within VMA.
* Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
@@ -5439,7 +5490,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
return page != NULL;
}
-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx)
{
struct folio *folio = page_folio(page);
@@ -5476,7 +5527,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
unsigned long addr,
unsigned long reason)
{
- vm_fault_t ret;
u32 hash;
struct vm_fault vmf = {
.vma = vma,
@@ -5494,18 +5544,31 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
+ * vma_lock and hugetlb_fault_mutex must be dropped before handling
+ * userfault. Also mmap_lock could be dropped due to handling
+ * userfault, any vma operation should be careful from here.
*/
+ hugetlb_vma_unlock_read(vma);
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
- ret = handle_userfault(&vmf, reason);
- i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ return handle_userfault(&vmf, reason);
+}
- return ret;
+/*
+ * Recheck pte with pgtable lock. Returns true if pte didn't change, or
+ * false if pte changed or is changing.
+ */
+static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
+ pte_t *ptep, pte_t old_pte)
+{
+ spinlock_t *ptl;
+ bool same;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ same = pte_same(huge_ptep_get(ptep), old_pte);
+ spin_unlock(ptl);
+
+ return same;
}
static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
@@ -5523,6 +5586,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page, new_pagecache_page = false;
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/*
* Currently, we are forced to kill the process in the event the
@@ -5533,28 +5597,46 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
current->pid);
- return ret;
+ goto out;
}
/*
- * We can not race with truncation due to holding i_mmap_rwsem.
- * i_size is modified when holding i_mmap_rwsem, so check here
- * once for faults beyond end of file.
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
*/
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
-retry:
new_page = false;
page = find_lock_page(mapping, idx);
if (!page) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
/* Check for page in userfault range */
if (userfaultfd_missing(vma)) {
- ret = hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr, address,
- VM_UFFD_MISSING);
- goto out;
+ /*
+ * Since hugetlb_no_page() was examining pte
+ * without pgtable lock, we need to re-test under
+ * lock because the pte may not be stable and could
+ * have changed from under us. Try to detect
+ * either changed or during-changing ptes and retry
+ * properly when needed.
+ *
+ * Note that userfaultfd is actually fine with
+ * false positives (e.g. caused by pte changed),
+ * but not wrong logical events (e.g. caused by
+ * reading a pte during changing). The latter can
+ * confuse the userspace, so the strictness is very
+ * much preferred. E.g., MISSING event should
+ * never happen on the page after UFFDIO_COPY has
+ * correctly installed the page and returned.
+ */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MISSING);
}
page = alloc_huge_page(vma, haddr, 0);
@@ -5571,11 +5653,10 @@ retry:
* here. Before returning error, get ptl and make
* sure there really is no pte entry.
*/
- ptl = huge_pte_lock(h, mm, ptep);
- ret = 0;
- if (huge_pte_none(huge_ptep_get(ptep)))
+ if (hugetlb_pte_stable(h, mm, ptep, old_pte))
ret = vmf_error(PTR_ERR(page));
- spin_unlock(ptl);
+ else
+ ret = 0;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -5583,11 +5664,17 @@ retry:
new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = huge_add_to_page_cache(page, mapping, idx);
+ int err = hugetlb_add_to_page_cache(page, mapping, idx);
if (err) {
+ /*
+ * err can't be -EEXIST which implies someone
+ * else consumed the reservation since hugetlb
+ * fault mutex is held when add a hugetlb page
+ * to the page cache. So it's safe to call
+ * restore_reserve_on_error() here.
+ */
+ restore_reserve_on_error(h, vma, haddr, page);
put_page(page);
- if (err == -EEXIST)
- goto retry;
goto out;
}
new_pagecache_page = true;
@@ -5615,10 +5702,14 @@ retry:
if (userfaultfd_minor(vma)) {
unlock_page(page);
put_page(page);
- ret = hugetlb_handle_userfault(vma, mapping, idx,
- flags, haddr, address,
- VM_UFFD_MINOR);
- goto out;
+ /* See comment in userfaultfd_missing() block above */
+ if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+ ret = 0;
+ goto out;
+ }
+ return hugetlb_handle_userfault(vma, mapping, idx, flags,
+ haddr, address,
+ VM_UFFD_MINOR);
}
}
@@ -5676,15 +5767,17 @@ retry:
unlock_page(page);
out:
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return ret;
backout:
spin_unlock(ptl);
backout_unlocked:
- unlock_page(page);
- /* restore reserve for newly allocated pages not in page cache */
if (new_page && !new_pagecache_page)
restore_reserve_on_error(h, vma, haddr, page);
+
+ unlock_page(page);
put_page(page);
goto out;
}
@@ -5745,40 +5838,41 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This serves two purposes:
- * 1) It prevents huge_pmd_unshare from being called elsewhere
- * and making the ptep no longer valid.
- * 2) It synchronizes us with i_size modifications during truncation.
+ * Serialize hugepage allocation and instantiation, so that we don't
+ * get spurious allocation failures if two CPUs race to instantiate
+ * the same page in the page cache.
+ */
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * Acquire vma lock before calling huge_pte_alloc and hold
+ * until finished with ptep. This prevents huge_pmd_unshare from
+ * being called elsewhere and making the ptep no longer valid.
*
* ptep could have already be assigned via huge_pte_offset. That
* is OK, as huge_pte_alloc will return the same value unless
* something has changed.
*/
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
+ hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
if (!ptep) {
- i_mmap_unlock_read(mapping);
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return VM_FAULT_OOM;
}
- /*
- * Serialize hugepage allocation and instantiation, so that we don't
- * get spurious allocation failures if two CPUs race to instantiate
- * the same page in the page cache.
- */
- idx = vma_hugecache_offset(h, vma, haddr);
- hash = hugetlb_fault_mutex_hash(mapping, idx);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
entry = huge_ptep_get(ptep);
/* PTE markers should be handled the same way as none pte */
- if (huge_pte_none_mostly(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+ if (huge_pte_none_mostly(entry))
+ /*
+ * hugetlb_no_page will drop vma lock and hugetlb fault
+ * mutex internally, which make us return immediately.
+ */
+ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
entry, flags);
- goto out_mutex;
- }
ret = 0;
@@ -5808,7 +5902,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
+ pagecache_page = find_lock_page(mapping, idx);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -5832,8 +5926,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(pagecache_page);
put_page(pagecache_page);
}
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
return handle_userfault(&vmf, VM_UFFD_WP);
}
@@ -5876,8 +5970,8 @@ out_ptl:
put_page(pagecache_page);
}
out_mutex:
+ hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -6005,39 +6099,28 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/*
* Serialization between remove_inode_hugepages() and
- * huge_add_to_page_cache() below happens through the
+ * hugetlb_add_to_page_cache() below happens through the
* hugetlb_fault_mutex_table that here must be hold by
* the caller.
*/
- ret = huge_add_to_page_cache(page, mapping, idx);
+ ret = hugetlb_add_to_page_cache(page, mapping, idx);
if (ret)
goto out_release_nounlock;
page_in_pagecache = true;
}
- ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
- spin_lock(ptl);
+ ptl = huge_pte_lock(h, dst_mm, dst_pte);
- /*
- * Recheck the i_size after holding PT lock to make sure not
- * to leave any page mapped (as page_mapped()) beyond the end
- * of the i_size (remove_inode_hugepages() is strict about
- * enforcing that). If we bail out here, we'll also leave a
- * page in the radix tree in the vm_shared case beyond the end
- * of the i_size, but remove_inode_hugepages() will take care
- * of it as soon as we drop the hugetlb_fault_mutex_table.
- */
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- ret = -EFAULT;
- if (idx >= size)
+ ret = -EIO;
+ if (PageHWPoison(page))
goto out_release_unlock;
- ret = -EEXIST;
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
* page backing it, then access the page.
*/
+ ret = -EEXIST;
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
@@ -6105,7 +6188,7 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
for (nr = 0; nr < refs; nr++) {
if (likely(pages))
- pages[nr] = mem_map_offset(page, nr);
+ pages[nr] = nth_page(page, nr);
if (vmas)
vmas[nr] = vma;
}
@@ -6269,7 +6352,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
(vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
if (pages || vmas)
- record_subpages_vmas(mem_map_offset(page, pfn_offset),
+ record_subpages_vmas(nth_page(page, pfn_offset),
vma, refs,
likely(pages) ? pages + i : NULL,
vmas ? vmas + i : NULL);
@@ -6340,8 +6423,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
- last_addr_mask = hugetlb_mask_last_page(h);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
+ last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, psize);
@@ -6440,6 +6524,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/mm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
@@ -6465,6 +6550,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
}
/*
+ * vma specific semaphore used for pmd sharing synchronization
+ */
+ hugetlb_vma_lock_alloc(vma);
+
+ /*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* without using reserves
@@ -6487,12 +6577,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to, &regions_needed);
-
} else {
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return false;
+ goto out_err;
chg = to - from;
@@ -6587,6 +6676,7 @@ out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
+ hugetlb_vma_lock_free(vma);
if (!vma || vma->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
@@ -6656,35 +6746,37 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
/*
* match the virtual addresses, permission and the alignment of the
* page table page.
+ *
+ * Also, vma_lock (vm_private_data) is required for sharing.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
- !range_in_vma(svma, sbase, s_end))
+ !range_in_vma(svma, sbase, s_end) ||
+ !svma->vm_private_data)
return 0;
return saddr;
}
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
- unsigned long base = addr & PUD_MASK;
- unsigned long end = base + PUD_SIZE;
-
- /*
- * check on proper vm_flags and page table alignment
- */
- if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
- return true;
- return false;
-}
-
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
+ unsigned long start = addr & PUD_MASK;
+ unsigned long end = start + PUD_SIZE;
+
#ifdef CONFIG_USERFAULTFD
if (uffd_disable_huge_pmd_share(vma))
return false;
#endif
- return vma_shareable(vma, addr);
+ /*
+ * check on proper vm_flags and page table alignment
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ return false;
+ if (!vma->vm_private_data) /* vma lock required for sharing */
+ return false;
+ if (!range_in_vma(vma, start, end))
+ return false;
+ return true;
}
/*
@@ -6714,16 +6806,157 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
*end = ALIGN(*end, PUD_SIZE);
}
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+ return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+ vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_read(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ up_read(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_write(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ up_write(&vma_lock->rw_sema);
+ }
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ if (!__vma_shareable_flags_pmd(vma))
+ return 1;
+
+ return down_write_trylock(&vma_lock->rw_sema);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ lockdep_assert_held(&vma_lock->rw_sema);
+ }
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+ struct hugetlb_vma_lock *vma_lock = container_of(kref,
+ struct hugetlb_vma_lock, refs);
+
+ kfree(vma_lock);
+}
+
+static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+{
+ struct vm_area_struct *vma = vma_lock->vma;
+
+ /*
+ * vma_lock structure may or not be released as a result of put,
+ * it certainly will no longer be attached to vma so clear pointer.
+ * Semaphore synchronizes access to vma_lock->vma field.
+ */
+ vma_lock->vma = NULL;
+ vma->vm_private_data = NULL;
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+ if (__vma_shareable_flags_pmd(vma)) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ __hugetlb_vma_unlock_write_put(vma_lock);
+ }
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+ /*
+ * Only present in sharable vmas.
+ */
+ if (!vma || !__vma_shareable_flags_pmd(vma))
+ return;
+
+ if (vma->vm_private_data) {
+ struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+ down_write(&vma_lock->rw_sema);
+ __hugetlb_vma_unlock_write_put(vma_lock);
+ }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+ struct hugetlb_vma_lock *vma_lock;
+
+ /* Only establish in (flags) sharable vmas */
+ if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+ return;
+
+ /* Should never get here with non-NULL vm_private_data */
+ if (vma->vm_private_data)
+ return;
+
+ vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
+ if (!vma_lock) {
+ /*
+ * If we can not allocate structure, then vma can not
+ * participate in pmd sharing. This is only a possible
+ * performance enhancement and memory saving issue.
+ * However, the lock is also used to synchronize page
+ * faults with truncation. If the lock is not present,
+ * unlikely races could leave pages in a file past i_size
+ * until the file is removed. Warn in the unlikely case of
+ * allocation failure.
+ */
+ pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
+ return;
+ }
+
+ kref_init(&vma_lock->refs);
+ init_rwsem(&vma_lock->rw_sema);
+ vma_lock->vma = vma;
+ vma->vm_private_data = vma_lock;
+}
+
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode if
- * sharing is possible. For hugetlbfs, this prevents removal of any page
- * table entries associated with the address space. This is important as we
- * are setting up sharing based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
@@ -6737,7 +6970,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *pte;
spinlock_t *ptl;
- i_mmap_assert_locked(mapping);
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -6767,6 +7000,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_unlock_read(mapping);
return pte;
}
@@ -6777,7 +7011,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
@@ -6790,6 +7024,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+ hugetlb_vma_assert_locked(vma);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -6801,6 +7036,48 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
}
#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+ return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+}
+
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
{
@@ -6944,12 +7221,13 @@ follow_huge_pd(struct vm_area_struct *vma,
}
struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int flags)
+follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags)
{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
spinlock_t *ptl;
- pte_t pte;
+ pte_t *ptep, pte;
/*
* FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
@@ -6959,17 +7237,15 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
return NULL;
retry:
- ptl = pmd_lockptr(mm, pmd);
- spin_lock(ptl);
- /*
- * make sure that the address range covered by this pmd is not
- * unmapped from other threads.
- */
- if (!pmd_huge(*pmd))
- goto out;
- pte = huge_ptep_get((pte_t *)pmd);
+ ptep = huge_pte_offset(mm, address, huge_page_size(h));
+ if (!ptep)
+ return NULL;
+
+ ptl = huge_pte_lock(h, mm, ptep);
+ pte = huge_ptep_get(ptep);
if (pte_present(pte)) {
- page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pte_page(pte) +
+ ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
/*
* try_grab_page() should always succeed here, because: a) we
* hold the pmd (ptl) lock, and b) we've just checked that the
@@ -6985,7 +7261,7 @@ retry:
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
- __migration_entry_wait_huge((pte_t *)pmd, ptl);
+ __migration_entry_wait_huge(ptep, ptl);
goto retry;
}
/*
@@ -7171,6 +7447,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
+ hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
ptep = huge_pte_offset(mm, address, sz);
@@ -7182,6 +7459,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
+ hugetlb_vma_unlock_write(vma);
/*
* No need to call mmu_notifier_invalidate_range(), see
* Documentation/mm/mmu_notifier.rst.
@@ -7332,7 +7610,7 @@ void __init hugetlb_cma_reserve(int order)
hugetlb_cma_size = 0;
}
-void __init hugetlb_cma_check(void)
+static void __init hugetlb_cma_check(void)
{
if (!hugetlb_cma_size || cma_reserve_called)
return;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index c86691c431fd..f61d132df52b 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -75,11 +75,11 @@ parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
- int idx;
+ struct hstate *h;
- for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+ for_each_hstate(h) {
if (page_counter_read(
- hugetlb_cgroup_counter_from_cgroup(h_cg, idx)))
+ hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h))))
return true;
}
return false;
@@ -154,9 +154,9 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
* function.
*/
for_each_node(node) {
- /* Set node_to_alloc to -1 for offline nodes. */
+ /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */
int node_to_alloc =
- node_state(node, N_NORMAL_MEMORY) ? node : -1;
+ node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
h_cgroup->nodeinfo[node] =
kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
GFP_KERNEL, node_to_alloc);
@@ -225,17 +225,14 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
struct page *page;
- int idx;
do {
- idx = 0;
for_each_hstate(h) {
spin_lock_irq(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
- hugetlb_cgroup_move_parent(idx, h_cg, page);
+ hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page);
spin_unlock_irq(&hugetlb_lock);
- idx++;
}
cond_resched();
} while (hugetlb_cgroup_have_usage(h_cg));
@@ -442,7 +439,7 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
return;
- if (rg->reservation_counter && resv->pages_per_hpage && nr_pages > 0 &&
+ if (rg->reservation_counter && resv->pages_per_hpage &&
!resv->reservation_counter) {
page_counter_uncharge(rg->reservation_counter,
nr_pages * resv->pages_per_hpage);
@@ -675,12 +672,12 @@ static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
- if (hsize >= (1UL << 30))
- snprintf(buf, size, "%luGB", hsize >> 30);
- else if (hsize >= (1UL << 20))
- snprintf(buf, size, "%luMB", hsize >> 20);
+ if (hsize >= SZ_1G)
+ snprintf(buf, size, "%luGB", hsize / SZ_1G);
+ else if (hsize >= SZ_1M)
+ snprintf(buf, size, "%luMB", hsize / SZ_1M);
else
- snprintf(buf, size, "%luKB", hsize >> 10);
+ snprintf(buf, size, "%luKB", hsize / SZ_1K);
return buf;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 20f414c0379f..4962dd1ba4a6 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -11,6 +11,7 @@
#define pr_fmt(fmt) "HugeTLB: " fmt
#include <linux/pgtable.h>
+#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -265,11 +266,10 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
static inline void reset_struct_pages(struct page *start)
{
- int i;
struct page *from = start + NR_RESET_STRUCT_PAGE;
- for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
- memcpy(start + i, from, sizeof(*from));
+ BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
+ memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
@@ -287,6 +287,11 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
copy_page(to, (void *)walk->reuse_addr);
reset_struct_pages(to);
+ /*
+ * Makes sure that preceding stores to the page contents become visible
+ * before the set_pte_at() write.
+ */
+ smp_wmb();
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 65e242b5a432..d0548e382b6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -63,13 +63,13 @@ static int hwpoison_unpoison(void *data, u64 val)
DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
-static void pfn_inject_exit(void)
+static void __exit pfn_inject_exit(void)
{
hwpoison_filter_enable = 0;
debugfs_remove_recursive(hwpoison_dir);
}
-static int pfn_inject_init(void)
+static int __init pfn_inject_init(void)
{
hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index fbe7844d0912..c9327abb771c 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm_types.h>
-#include <linux/rbtree.h>
+#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/spinlock.h>
#include <linux/list.h>
@@ -28,7 +28,7 @@
* and size this cpu_bitmask to NR_CPUS.
*/
struct mm_struct init_mm = {
- .mm_rb = RB_ROOT,
+ .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock),
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 785409805ed7..6b7ef495b56d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -83,9 +83,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
+void folio_activate(struct folio *folio);
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long floor, unsigned long ceiling);
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *start_vma, unsigned long floor,
+ unsigned long ceiling);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
struct zap_details;
@@ -187,7 +189,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason
/*
* in mm/rmap.c:
*/
-extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
+pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/page_alloc.c
@@ -365,7 +367,6 @@ extern int user_min_free_kbytes;
extern void free_unref_page(struct page *page, unsigned int order);
extern void free_unref_page_list(struct list_head *list);
-extern void zone_pcp_update(struct zone *zone, int cpu_online);
extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
@@ -479,9 +480,6 @@ static inline bool is_data_mapping(vm_flags_t flags)
}
/* mm/util.c */
-void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev);
-void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
struct anon_vma *folio_anon_vma(struct folio *folio);
#ifdef CONFIG_MMU
@@ -639,34 +637,6 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
}
#endif /* !CONFIG_MMU */
-/*
- * Return the mem_map entry representing the 'offset' subpage within
- * the maximally aligned gigantic page 'base'. Handle any discontiguity
- * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
- */
-static inline struct page *mem_map_offset(struct page *base, int offset)
-{
- if (unlikely(offset >= MAX_ORDER_NR_PAGES))
- return nth_page(base, offset);
- return base + offset;
-}
-
-/*
- * Iterator over all subpages within the maximally aligned gigantic
- * page 'base'. Handle any discontiguity in the mem_map.
- */
-static inline struct page *mem_map_next(struct page *iter,
- struct page *base, int offset)
-{
- if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
- unsigned long pfn = page_to_pfn(base) + offset;
- if (!pfn_valid(pfn))
- return NULL;
- return pfn_to_page(pfn);
- }
- return iter + 1;
-}
-
/* Memory initialisation debug and verification */
enum mminit_level {
MMINIT_WARNING,
@@ -847,8 +817,14 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
}
#endif
+int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift);
+
void vunmap_range_noflush(unsigned long start, unsigned long end);
+void __vunmap_range_noflush(unsigned long start, unsigned long end);
+
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
@@ -860,8 +836,6 @@ int migrate_device_coherent_page(struct page *page);
*/
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
-DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-
extern bool mirrored_kernelcore;
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 1f84df9c302e..d4837bff3b60 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -35,7 +35,15 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla)
+
+CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST)
+CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST)
+
obj-y := common.o report.o
obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o
obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o
+
+obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o
+obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 69f583855c8b..833bf2cfd2a3 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -30,13 +30,20 @@
#include "kasan.h"
#include "../slab.h"
+struct slab *kasan_addr_to_slab(const void *addr)
+{
+ if (virt_addr_valid(addr))
+ return virt_to_slab(addr);
+ return NULL;
+}
+
depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
- return __stack_depot_save(entries, nr_entries, flags, can_alloc);
+ return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc);
}
void kasan_set_track(struct kasan_track *track, gfp_t flags)
@@ -88,17 +95,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
}
#endif /* CONFIG_KASAN_STACK */
-/*
- * Only allow cache merging when stack collection is disabled and no metadata
- * is present.
- */
-slab_flags_t __kasan_never_merge(void)
-{
- if (kasan_stack_collection_enabled())
- return SLAB_KASAN;
- return 0;
-}
-
void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
@@ -121,132 +117,11 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
KASAN_PAGE_FREE, init);
}
-/*
- * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
- * For larger allocations larger redzones are used.
- */
-static inline unsigned int optimal_redzone(unsigned int object_size)
-{
- return
- object_size <= 64 - 16 ? 16 :
- object_size <= 128 - 32 ? 32 :
- object_size <= 512 - 64 ? 64 :
- object_size <= 4096 - 128 ? 128 :
- object_size <= (1 << 14) - 256 ? 256 :
- object_size <= (1 << 15) - 512 ? 512 :
- object_size <= (1 << 16) - 1024 ? 1024 : 2048;
-}
-
-void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags)
-{
- unsigned int ok_size;
- unsigned int optimal_size;
-
- /*
- * SLAB_KASAN is used to mark caches as ones that are sanitized by
- * KASAN. Currently this flag is used in two places:
- * 1. In slab_ksize() when calculating the size of the accessible
- * memory within the object.
- * 2. In slab_common.c to prevent merging of sanitized caches.
- */
- *flags |= SLAB_KASAN;
-
- if (!kasan_stack_collection_enabled())
- return;
-
- ok_size = *size;
-
- /* Add alloc meta into redzone. */
- cache->kasan_info.alloc_meta_offset = *size;
- *size += sizeof(struct kasan_alloc_meta);
-
- /*
- * If alloc meta doesn't fit, don't add it.
- * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal
- * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for
- * larger sizes.
- */
- if (*size > KMALLOC_MAX_SIZE) {
- cache->kasan_info.alloc_meta_offset = 0;
- *size = ok_size;
- /* Continue, since free meta might still fit. */
- }
-
- /* Only the generic mode uses free meta or flexible redzones. */
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
- cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
- return;
- }
-
- /*
- * Add free meta into redzone when it's not possible to store
- * it in the object. This is the case when:
- * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
- * be touched after it was freed, or
- * 2. Object has a constructor, which means it's expected to
- * retain its content until the next allocation, or
- * 3. Object is too small.
- * Otherwise cache->kasan_info.free_meta_offset = 0 is implied.
- */
- if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
- cache->object_size < sizeof(struct kasan_free_meta)) {
- ok_size = *size;
-
- cache->kasan_info.free_meta_offset = *size;
- *size += sizeof(struct kasan_free_meta);
-
- /* If free meta doesn't fit, don't add it. */
- if (*size > KMALLOC_MAX_SIZE) {
- cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
- *size = ok_size;
- }
- }
-
- /* Calculate size with optimal redzone. */
- optimal_size = cache->object_size + optimal_redzone(cache->object_size);
- /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */
- if (optimal_size > KMALLOC_MAX_SIZE)
- optimal_size = KMALLOC_MAX_SIZE;
- /* Use optimal size if the size with added metas is not large enough. */
- if (*size < optimal_size)
- *size = optimal_size;
-}
-
void __kasan_cache_create_kmalloc(struct kmem_cache *cache)
{
cache->kasan_info.is_kmalloc = true;
}
-size_t __kasan_metadata_size(struct kmem_cache *cache)
-{
- if (!kasan_stack_collection_enabled())
- return 0;
- return (cache->kasan_info.alloc_meta_offset ?
- sizeof(struct kasan_alloc_meta) : 0) +
- (cache->kasan_info.free_meta_offset ?
- sizeof(struct kasan_free_meta) : 0);
-}
-
-struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
- const void *object)
-{
- if (!cache->kasan_info.alloc_meta_offset)
- return NULL;
- return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset;
-}
-
-#ifdef CONFIG_KASAN_GENERIC
-struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
- const void *object)
-{
- BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
- if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META)
- return NULL;
- return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset;
-}
-#endif
-
void __kasan_poison_slab(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -312,13 +187,9 @@ static inline u8 assign_tag(struct kmem_cache *cache,
void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
const void *object)
{
- struct kasan_alloc_meta *alloc_meta;
-
- if (kasan_stack_collection_enabled()) {
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (alloc_meta)
- __memset(alloc_meta, 0, sizeof(*alloc_meta));
- }
+ /* Initialize per-object metadata if it is present. */
+ if (kasan_requires_meta())
+ kasan_init_object_meta(cache, object);
/* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
object = set_tag(object, assign_tag(cache, object, true));
@@ -329,13 +200,11 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
unsigned long ip, bool quarantine, bool init)
{
- u8 tag;
void *tagged_object;
if (!kasan_arch_is_ready())
return false;
- tag = get_tag(object);
tagged_object = object;
object = kasan_reset_tag(object);
@@ -364,7 +233,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
return false;
if (kasan_stack_collection_enabled())
- kasan_set_free_info(cache, object, tag);
+ kasan_save_free_info(cache, tagged_object);
return kasan_quarantine_put(cache, object);
}
@@ -423,20 +292,6 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
}
}
-static void set_alloc_info(struct kmem_cache *cache, void *object,
- gfp_t flags, bool is_kmalloc)
-{
- struct kasan_alloc_meta *alloc_meta;
-
- /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */
- if (cache->kasan_info.is_kmalloc && !is_kmalloc)
- return;
-
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (alloc_meta)
- kasan_set_track(&alloc_meta->alloc_track, flags);
-}
-
void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
void *object, gfp_t flags, bool init)
{
@@ -466,8 +321,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
kasan_unpoison(tagged_object, cache->object_size, init);
/* Save alloc info (if possible) for non-kmalloc() allocations. */
- if (kasan_stack_collection_enabled())
- set_alloc_info(cache, (void *)object, flags, false);
+ if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc)
+ kasan_save_alloc_info(cache, tagged_object, flags);
return tagged_object;
}
@@ -512,8 +367,8 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache,
* Save alloc info (if possible) for kmalloc() allocations.
* This also rewrites the alloc info when called from kasan_krealloc().
*/
- if (kasan_stack_collection_enabled())
- set_alloc_info(cache, (void *)object, flags, true);
+ if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc)
+ kasan_save_alloc_info(cache, (void *)object, flags);
/* Keep the tag that was set by kasan_slab_alloc(). */
return (void *)object;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 437fcc7e77cf..d8b5590f9484 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -328,6 +328,139 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
+/* Only allow cache merging when no per-object metadata is present. */
+slab_flags_t kasan_never_merge(void)
+{
+ if (!kasan_requires_meta())
+ return 0;
+ return SLAB_KASAN;
+}
+
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations larger redzones are used.
+ */
+static inline unsigned int optimal_redzone(unsigned int object_size)
+{
+ return
+ object_size <= 64 - 16 ? 16 :
+ object_size <= 128 - 32 ? 32 :
+ object_size <= 512 - 64 ? 64 :
+ object_size <= 4096 - 128 ? 128 :
+ object_size <= (1 << 14) - 256 ? 256 :
+ object_size <= (1 << 15) - 512 ? 512 :
+ object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+}
+
+void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags)
+{
+ unsigned int ok_size;
+ unsigned int optimal_size;
+
+ if (!kasan_requires_meta())
+ return;
+
+ /*
+ * SLAB_KASAN is used to mark caches that are sanitized by KASAN
+ * and that thus have per-object metadata.
+ * Currently this flag is used in two places:
+ * 1. In slab_ksize() to account for per-object metadata when
+ * calculating the size of the accessible memory within the object.
+ * 2. In slab_common.c via kasan_never_merge() to prevent merging of
+ * caches with per-object metadata.
+ */
+ *flags |= SLAB_KASAN;
+
+ ok_size = *size;
+
+ /* Add alloc meta into redzone. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
+
+ /*
+ * If alloc meta doesn't fit, don't add it.
+ * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal
+ * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for
+ * larger sizes.
+ */
+ if (*size > KMALLOC_MAX_SIZE) {
+ cache->kasan_info.alloc_meta_offset = 0;
+ *size = ok_size;
+ /* Continue, since free meta might still fit. */
+ }
+
+ /*
+ * Add free meta into redzone when it's not possible to store
+ * it in the object. This is the case when:
+ * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
+ * be touched after it was freed, or
+ * 2. Object has a constructor, which means it's expected to
+ * retain its content until the next allocation, or
+ * 3. Object is too small.
+ * Otherwise cache->kasan_info.free_meta_offset = 0 is implied.
+ */
+ if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta)) {
+ ok_size = *size;
+
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+
+ /* If free meta doesn't fit, don't add it. */
+ if (*size > KMALLOC_MAX_SIZE) {
+ cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
+ *size = ok_size;
+ }
+ }
+
+ /* Calculate size with optimal redzone. */
+ optimal_size = cache->object_size + optimal_redzone(cache->object_size);
+ /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */
+ if (optimal_size > KMALLOC_MAX_SIZE)
+ optimal_size = KMALLOC_MAX_SIZE;
+ /* Use optimal size if the size with added metas is not large enough. */
+ if (*size < optimal_size)
+ *size = optimal_size;
+}
+
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object)
+{
+ if (!cache->kasan_info.alloc_meta_offset)
+ return NULL;
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
+}
+
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META)
+ return NULL;
+ return (void *)object + cache->kasan_info.free_meta_offset;
+}
+
+void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ __memset(alloc_meta, 0, sizeof(*alloc_meta));
+}
+
+size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+ if (!kasan_requires_meta())
+ return 0;
+ return (cache->kasan_info.alloc_meta_offset ?
+ sizeof(struct kasan_alloc_meta) : 0) +
+ ((cache->kasan_info.free_meta_offset &&
+ cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ?
+ sizeof(struct kasan_free_meta) : 0);
+}
+
static void __kasan_record_aux_stack(void *addr, bool can_alloc)
{
struct slab *slab = kasan_addr_to_slab(addr);
@@ -358,8 +491,16 @@ void kasan_record_aux_stack_noalloc(void *addr)
return __kasan_record_aux_stack(addr, false);
}
-void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ kasan_set_track(&alloc_meta->alloc_track, flags);
+}
+
+void kasan_save_free_info(struct kmem_cache *cache, void *object)
{
struct kasan_free_meta *free_meta;
@@ -371,12 +512,3 @@ void kasan_set_free_info(struct kmem_cache *cache,
/* The object was freed and has free track set. */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK;
}
-
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK)
- return NULL;
- /* Free meta must be present with KASAN_SLAB_FREETRACK. */
- return &kasan_get_free_meta(cache, object)->free_track;
-}
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9ad8eff71b28..b22c4f461cb0 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -38,16 +38,9 @@ enum kasan_arg_vmalloc {
KASAN_ARG_VMALLOC_ON,
};
-enum kasan_arg_stacktrace {
- KASAN_ARG_STACKTRACE_DEFAULT,
- KASAN_ARG_STACKTRACE_OFF,
- KASAN_ARG_STACKTRACE_ON,
-};
-
static enum kasan_arg kasan_arg __ro_after_init;
static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata;
-static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
/*
* Whether KASAN is enabled at all.
@@ -66,9 +59,6 @@ EXPORT_SYMBOL_GPL(kasan_mode);
/* Whether to enable vmalloc tagging. */
DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
-/* Whether to collect alloc/free stack traces. */
-DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
-
/* kasan=off/on */
static int __init early_kasan_flag(char *arg)
{
@@ -122,23 +112,6 @@ static int __init early_kasan_flag_vmalloc(char *arg)
}
early_param("kasan.vmalloc", early_kasan_flag_vmalloc);
-/* kasan.stacktrace=off/on */
-static int __init early_kasan_flag_stacktrace(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (!strcmp(arg, "off"))
- kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF;
- else if (!strcmp(arg, "on"))
- kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON;
- else
- return -EINVAL;
-
- return 0;
-}
-early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
-
static inline const char *kasan_mode_info(void)
{
if (kasan_mode == KASAN_MODE_ASYNC)
@@ -213,17 +186,7 @@ void __init kasan_init_hw_tags(void)
break;
}
- switch (kasan_arg_stacktrace) {
- case KASAN_ARG_STACKTRACE_DEFAULT:
- /* Default is specified by kasan_flag_stacktrace definition. */
- break;
- case KASAN_ARG_STACKTRACE_OFF:
- static_branch_disable(&kasan_flag_stacktrace);
- break;
- case KASAN_ARG_STACKTRACE_ON:
- static_branch_enable(&kasan_flag_stacktrace);
- break;
- }
+ kasan_init_tags();
/* KASAN is now initialized, enable it. */
static_branch_enable(&kasan_flag_enabled);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 01c03e45acd4..abbcc1b0eec5 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -2,18 +2,37 @@
#ifndef __MM_KASAN_KASAN_H
#define __MM_KASAN_KASAN_H
+#include <linux/atomic.h>
#include <linux/kasan.h>
#include <linux/kasan-tags.h>
#include <linux/kfence.h>
#include <linux/stackdepot.h>
-#ifdef CONFIG_KASAN_HW_TAGS
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
#include <linux/static_key.h>
+
+DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
+
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return static_branch_unlikely(&kasan_flag_stacktrace);
+}
+
+#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return true;
+}
+
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_HW_TAGS
+
#include "../slab.h"
DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc);
-DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
enum kasan_mode {
KASAN_MODE_SYNC,
@@ -28,11 +47,6 @@ static inline bool kasan_vmalloc_enabled(void)
return static_branch_likely(&kasan_flag_vmalloc);
}
-static inline bool kasan_stack_collection_enabled(void)
-{
- return static_branch_unlikely(&kasan_flag_stacktrace);
-}
-
static inline bool kasan_async_fault_possible(void)
{
return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM;
@@ -43,12 +57,7 @@ static inline bool kasan_sync_fault_possible(void)
return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
}
-#else
-
-static inline bool kasan_stack_collection_enabled(void)
-{
- return true;
-}
+#else /* CONFIG_KASAN_HW_TAGS */
static inline bool kasan_async_fault_possible(void)
{
@@ -60,7 +69,31 @@ static inline bool kasan_sync_fault_possible(void)
return true;
}
-#endif
+#endif /* CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_GENERIC
+
+/* Generic KASAN uses per-object metadata to store stack traces. */
+static inline bool kasan_requires_meta(void)
+{
+ /*
+ * Technically, Generic KASAN always collects stack traces right now.
+ * However, let's use kasan_stack_collection_enabled() in case the
+ * kasan.stacktrace command-line argument is changed to affect
+ * Generic KASAN.
+ */
+ return kasan_stack_collection_enabled();
+}
+
+#else /* CONFIG_KASAN_GENERIC */
+
+/* Tag-based KASAN modes do not use per-object metadata. */
+static inline bool kasan_requires_meta(void)
+{
+ return false;
+}
+
+#endif /* CONFIG_KASAN_GENERIC */
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
@@ -122,6 +155,13 @@ static inline bool kasan_sync_fault_possible(void)
#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE)
#define META_ROWS_AROUND_ADDR 2
+#define KASAN_STACK_DEPTH 64
+
+struct kasan_track {
+ u32 pid;
+ depot_stack_handle_t stack;
+};
+
enum kasan_report_type {
KASAN_REPORT_ACCESS,
KASAN_REPORT_INVALID_FREE,
@@ -129,12 +169,22 @@ enum kasan_report_type {
};
struct kasan_report_info {
+ /* Filled in by kasan_report_*(). */
enum kasan_report_type type;
void *access_addr;
- void *first_bad_addr;
size_t access_size;
bool is_write;
unsigned long ip;
+
+ /* Filled in by the common reporting code. */
+ void *first_bad_addr;
+ struct kmem_cache *cache;
+ void *object;
+
+ /* Filled in by the mode-specific reporting code. */
+ const char *bug_type;
+ struct kasan_track alloc_track;
+ struct kasan_track free_track;
};
/* Do not change the struct layout: compiler ABI. */
@@ -160,33 +210,14 @@ struct kasan_global {
#endif
};
-/* Structures for keeping alloc and free tracks. */
+/* Structures for keeping alloc and free meta. */
-#define KASAN_STACK_DEPTH 64
-
-struct kasan_track {
- u32 pid;
- depot_stack_handle_t stack;
-};
-
-#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS)
-#define KASAN_NR_FREE_STACKS 5
-#else
-#define KASAN_NR_FREE_STACKS 1
-#endif
+#ifdef CONFIG_KASAN_GENERIC
struct kasan_alloc_meta {
struct kasan_track alloc_track;
- /* Generic mode stores free track in kasan_free_meta. */
-#ifdef CONFIG_KASAN_GENERIC
+ /* Free track is stored in kasan_free_meta. */
depot_stack_handle_t aux_stack[2];
-#else
- struct kasan_track free_track[KASAN_NR_FREE_STACKS];
-#endif
-#ifdef CONFIG_KASAN_TAGS_IDENTIFY
- u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
- u8 free_track_idx;
-#endif
};
struct qlist_node {
@@ -205,12 +236,31 @@ struct qlist_node {
* After that, slab allocator stores the freelist pointer in the object.
*/
struct kasan_free_meta {
-#ifdef CONFIG_KASAN_GENERIC
struct qlist_node quarantine_link;
struct kasan_track free_track;
-#endif
};
+#endif /* CONFIG_KASAN_GENERIC */
+
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+
+struct kasan_stack_ring_entry {
+ void *ptr;
+ size_t size;
+ u32 pid;
+ depot_stack_handle_t stack;
+ bool is_free;
+};
+
+struct kasan_stack_ring {
+ rwlock_t lock;
+ size_t size;
+ atomic64_t pos;
+ struct kasan_stack_ring_entry *entries;
+};
+
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
/* Used in KUnit-compatible KASAN tests. */
struct kunit_kasan_status {
@@ -219,13 +269,6 @@ struct kunit_kasan_status {
};
#endif
-struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
- const void *object);
-#ifdef CONFIG_KASAN_GENERIC
-struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
- const void *object);
-#endif
-
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
@@ -260,34 +303,50 @@ static inline bool addr_has_metadata(const void *addr)
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+void *kasan_find_first_bad_addr(void *addr, size_t size);
+void kasan_complete_mode_report_info(struct kasan_report_info *info);
+void kasan_metadata_fetch_row(char *buffer, void *row);
+
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
void kasan_print_tags(u8 addr_tag, const void *addr);
#else
static inline void kasan_print_tags(u8 addr_tag, const void *addr) { }
#endif
-void *kasan_find_first_bad_addr(void *addr, size_t size);
-const char *kasan_get_bug_type(struct kasan_report_info *info);
-void kasan_metadata_fetch_row(char *buffer, void *row);
-
#if defined(CONFIG_KASAN_STACK)
void kasan_print_address_stack_frame(const void *addr);
#else
static inline void kasan_print_address_stack_frame(const void *addr) { }
#endif
+#ifdef CONFIG_KASAN_GENERIC
+void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object);
+#else
+static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { }
+#endif
+
bool kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type);
-struct page *kasan_addr_to_page(const void *addr);
struct slab *kasan_addr_to_slab(const void *addr);
+#ifdef CONFIG_KASAN_GENERIC
+void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size);
+void kasan_init_object_meta(struct kmem_cache *cache, const void *object);
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object);
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object);
+#else
+static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) { }
+static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { }
+#endif
+
depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
void kasan_set_track(struct kasan_track *track, gfp_t flags);
-void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag);
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags);
+void kasan_save_free_info(struct kmem_cache *cache, void *object);
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
@@ -358,6 +417,10 @@ static inline void kasan_enable_tagging(void) { }
#endif /* CONFIG_KASAN_HW_TAGS */
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+void __init kasan_init_tags(void);
+#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */
+
#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_force_async_fault(void);
diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
new file mode 100644
index 000000000000..0d59098f0876
--- /dev/null
+++ b/mm/kasan/kasan_test.c
@@ -0,0 +1,1457 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
+#include <linux/set_memory.h>
+
+#include <asm/page.h>
+
+#include <kunit/test.h>
+
+#include "kasan.h"
+
+#define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE)
+
+/*
+ * Some tests use these global variables to store return values from function
+ * calls that could otherwise be eliminated by the compiler as dead code.
+ */
+void *kasan_ptr_result;
+int kasan_int_result;
+
+static struct kunit_resource resource;
+static struct kunit_kasan_status test_status;
+static bool multishot;
+
+/*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the
+ * first detected bug and panic the kernel if panic_on_warn is enabled. For
+ * hardware tag-based KASAN also allow tag checking to be reenabled for each
+ * test, see the comment for KUNIT_EXPECT_KASAN_FAIL().
+ */
+static int kasan_test_init(struct kunit *test)
+{
+ if (!kasan_enabled()) {
+ kunit_err(test, "can't run KASAN tests with KASAN disabled");
+ return -1;
+ }
+
+ multishot = kasan_save_enable_multi_shot();
+ test_status.report_found = false;
+ test_status.sync_fault = false;
+ kunit_add_named_resource(test, NULL, NULL, &resource,
+ "kasan_status", &test_status);
+ return 0;
+}
+
+static void kasan_test_exit(struct kunit *test)
+{
+ kasan_restore_multi_shot(multishot);
+ KUNIT_EXPECT_FALSE(test, test_status.report_found);
+}
+
+/**
+ * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a
+ * KASAN report; causes a test failure otherwise. This relies on a KUnit
+ * resource named "kasan_status". Do not use this name for KUnit resources
+ * outside of KASAN tests.
+ *
+ * For hardware tag-based KASAN, when a synchronous tag fault happens, tag
+ * checking is auto-disabled. When this happens, this test handler reenables
+ * tag checking. As tag checking can be only disabled or enabled per CPU,
+ * this handler disables migration (preemption).
+ *
+ * Since the compiler doesn't see that the expression can change the test_status
+ * fields, it can reorder or optimize away the accesses to those fields.
+ * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the
+ * expression to prevent that.
+ *
+ * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept
+ * as false. This allows detecting KASAN reports that happen outside of the
+ * checks by asserting !test_status.report_found at the start of
+ * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit.
+ */
+#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
+ kasan_sync_fault_possible()) \
+ migrate_disable(); \
+ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \
+ barrier(); \
+ expression; \
+ barrier(); \
+ if (kasan_async_fault_possible()) \
+ kasan_force_async_fault(); \
+ if (!READ_ONCE(test_status.report_found)) { \
+ KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \
+ "expected in \"" #expression \
+ "\", but none occurred"); \
+ } \
+ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \
+ kasan_sync_fault_possible()) { \
+ if (READ_ONCE(test_status.report_found) && \
+ READ_ONCE(test_status.sync_fault)) \
+ kasan_enable_tagging(); \
+ migrate_enable(); \
+ } \
+ WRITE_ONCE(test_status.report_found, false); \
+} while (0)
+
+#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \
+ if (!IS_ENABLED(config)) \
+ kunit_skip((test), "Test requires " #config "=y"); \
+} while (0)
+
+#define KASAN_TEST_NEEDS_CONFIG_OFF(test, config) do { \
+ if (IS_ENABLED(config)) \
+ kunit_skip((test), "Test requires " #config "=n"); \
+} while (0)
+
+static void kmalloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE - 5;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ /*
+ * An unaligned access past the requested kmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x');
+
+ /*
+ * An aligned access into the first out-of-bounds granule that falls
+ * within the aligned kmalloc object.
+ */
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y');
+
+ /* Out-of-bounds access past the aligned kmalloc object. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] =
+ ptr[size + KASAN_GRANULE_SIZE + 5]);
+
+ kfree(ptr);
+}
+
+static void kmalloc_oob_left(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 15;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1));
+ kfree(ptr);
+}
+
+static void kmalloc_node_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 4096;
+
+ ptr = kmalloc_node(size, GFP_KERNEL, 0);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ kfree(ptr);
+}
+
+/*
+ * These kmalloc_pagealloc_* tests try allocating a memory chunk that doesn't
+ * fit into a slab cache and therefore is allocated via the page allocator
+ * fallback. Since this kind of fallback is only implemented for SLUB, these
+ * tests are limited to that allocator.
+ */
+static void kmalloc_pagealloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0);
+
+ kfree(ptr);
+}
+
+static void kmalloc_pagealloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ kfree(ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+}
+
+static void kmalloc_pagealloc_invalid_free(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1));
+}
+
+static void pagealloc_oob_right(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ size_t order = 4;
+ size_t size = (1UL << (PAGE_SHIFT + order));
+
+ /*
+ * With generic KASAN page allocations have no redzones, thus
+ * out-of-bounds detection is not guaranteed.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=210503.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]);
+ free_pages((unsigned long)ptr, order);
+}
+
+static void pagealloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ size_t order = 4;
+
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ free_pages((unsigned long)ptr, order);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+}
+
+static void kmalloc_large_oob_right(struct kunit *test)
+{
+ char *ptr;
+ size_t size = KMALLOC_MAX_CACHE_SIZE - 256;
+
+ /*
+ * Allocate a chunk that is large enough, but still fits into a slab
+ * and does not trigger the page allocator fallback in SLUB.
+ */
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0);
+ kfree(ptr);
+}
+
+static void krealloc_more_oob_helper(struct kunit *test,
+ size_t size1, size_t size2)
+{
+ char *ptr1, *ptr2;
+ size_t middle;
+
+ KUNIT_ASSERT_LT(test, size1, size2);
+ middle = size1 + (size2 - size1) / 2;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
+ /* All offsets up to size2 must be accessible. */
+ ptr2[size1 - 1] = 'x';
+ ptr2[size1] = 'x';
+ ptr2[middle] = 'x';
+ ptr2[size2 - 1] = 'x';
+
+ /* Generic mode is precise, so unaligned size2 must be inaccessible. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x');
+
+ /* For all modes first aligned offset after size2 must be inaccessible. */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x');
+
+ kfree(ptr2);
+}
+
+static void krealloc_less_oob_helper(struct kunit *test,
+ size_t size1, size_t size2)
+{
+ char *ptr1, *ptr2;
+ size_t middle;
+
+ KUNIT_ASSERT_LT(test, size2, size1);
+ middle = size2 + (size1 - size2) / 2;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /* Suppress -Warray-bounds warnings. */
+ OPTIMIZER_HIDE_VAR(ptr2);
+
+ /* Must be accessible for all modes. */
+ ptr2[size2 - 1] = 'x';
+
+ /* Generic mode is precise, so unaligned size2 must be inaccessible. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size2] = 'x');
+
+ /* For all modes first aligned offset after size2 must be inaccessible. */
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ ptr2[round_up(size2, KASAN_GRANULE_SIZE)] = 'x');
+
+ /*
+ * For all modes all size2, middle, and size1 should land in separate
+ * granules and thus the latter two offsets should be inaccessible.
+ */
+ KUNIT_EXPECT_LE(test, round_up(size2, KASAN_GRANULE_SIZE),
+ round_down(middle, KASAN_GRANULE_SIZE));
+ KUNIT_EXPECT_LE(test, round_up(middle, KASAN_GRANULE_SIZE),
+ round_down(size1, KASAN_GRANULE_SIZE));
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[middle] = 'x');
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1 - 1] = 'x');
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2[size1] = 'x');
+
+ kfree(ptr2);
+}
+
+static void krealloc_more_oob(struct kunit *test)
+{
+ krealloc_more_oob_helper(test, 201, 235);
+}
+
+static void krealloc_less_oob(struct kunit *test)
+{
+ krealloc_less_oob_helper(test, 235, 201);
+}
+
+static void krealloc_pagealloc_more_oob(struct kunit *test)
+{
+ /* page_alloc fallback in only implemented for SLUB. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ krealloc_more_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 201,
+ KMALLOC_MAX_CACHE_SIZE + 235);
+}
+
+static void krealloc_pagealloc_less_oob(struct kunit *test)
+{
+ /* page_alloc fallback in only implemented for SLUB. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB);
+
+ krealloc_less_oob_helper(test, KMALLOC_MAX_CACHE_SIZE + 235,
+ KMALLOC_MAX_CACHE_SIZE + 201);
+}
+
+/*
+ * Check that krealloc() detects a use-after-free, returns NULL,
+ * and doesn't unpoison the freed object.
+ */
+static void krealloc_uaf(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ int size1 = 201;
+ int size2 = 235;
+
+ ptr1 = kmalloc(size1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+ kfree(ptr1);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL));
+ KUNIT_ASSERT_NULL(test, ptr2);
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1);
+}
+
+static void kmalloc_oob_16(struct kunit *test)
+{
+ struct {
+ u64 words[2];
+ } *ptr1, *ptr2;
+
+ /* This test is specifically crafted for the generic mode. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ OPTIMIZER_HIDE_VAR(ptr1);
+ OPTIMIZER_HIDE_VAR(ptr2);
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
+ kfree(ptr1);
+ kfree(ptr2);
+}
+
+static void kmalloc_uaf_16(struct kunit *test)
+{
+ struct {
+ u64 words[2];
+ } *ptr1, *ptr2;
+
+ ptr1 = kmalloc(sizeof(*ptr1), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ kfree(ptr2);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2);
+ kfree(ptr1);
+}
+
+/*
+ * Note: in the memset tests below, the written range touches both valid and
+ * invalid memory. This makes sure that the instrumentation does not only check
+ * the starting address but the whole range.
+ */
+
+static void kmalloc_oob_memset_2(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 1, 0, 2));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_4(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 3, 0, 4));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_8(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 7, 0, 8));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_memset_16(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr + size - 15, 0, 16));
+ kfree(ptr);
+}
+
+static void kmalloc_oob_in_memset(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memset(ptr, 0, size + KASAN_GRANULE_SIZE));
+ kfree(ptr);
+}
+
+static void kmalloc_memmove_negative_size(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 64;
+ size_t invalid_size = -2;
+
+ /*
+ * Hardware tag-based mode doesn't check memmove for negative size.
+ * As a result, this test introduces a side-effect memory corruption,
+ * which can result in a crash.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ memset((char *)ptr, 0, 64);
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(invalid_size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ kfree(ptr);
+}
+
+static void kmalloc_memmove_invalid_size(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 64;
+ size_t invalid_size = size;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ memset((char *)ptr, 0, 64);
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(invalid_size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ memmove((char *)ptr, (char *)ptr + 4, invalid_size));
+ kfree(ptr);
+}
+
+static void kmalloc_uaf(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 10;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]);
+}
+
+static void kmalloc_uaf_memset(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 33;
+
+ /*
+ * Only generic KASAN uses quarantine, which is required to avoid a
+ * kernel memory corruption this test causes.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, memset(ptr, 0, size));
+}
+
+static void kmalloc_uaf2(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ size_t size = 43;
+ int counter = 0;
+
+again:
+ ptr1 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+
+ kfree(ptr1);
+
+ ptr2 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+
+ /*
+ * For tag-based KASAN ptr1 and ptr2 tags might happen to be the same.
+ * Allow up to 16 attempts at generating different tags.
+ */
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && ptr1 == ptr2 && counter++ < 16) {
+ kfree(ptr2);
+ goto again;
+ }
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]);
+ KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2);
+
+ kfree(ptr2);
+}
+
+/*
+ * Check that KASAN detects use-after-free when another object was allocated in
+ * the same slot. Relevant for the tag-based modes, which do not use quarantine.
+ */
+static void kmalloc_uaf3(struct kunit *test)
+{
+ char *ptr1, *ptr2;
+ size_t size = 100;
+
+ /* This test is specifically crafted for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr1 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1);
+ kfree(ptr1);
+
+ ptr2 = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2);
+ kfree(ptr2);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]);
+}
+
+static void kfree_via_page(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 8;
+ struct page *page;
+ unsigned long offset;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ page = virt_to_page(ptr);
+ offset = offset_in_page(ptr);
+ kfree(page_address(page) + offset);
+}
+
+static void kfree_via_phys(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 8;
+ phys_addr_t phys;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ phys = virt_to_phys(ptr);
+ kfree(phys_to_virt(phys));
+}
+
+static void kmem_cache_oob(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]);
+
+ kmem_cache_free(cache, p);
+ kmem_cache_destroy(cache);
+}
+
+static void kmem_cache_accounted(struct kunit *test)
+{
+ int i;
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, SLAB_ACCOUNT, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ /*
+ * Several allocations with a delay to allow for lazy per memcg kmem
+ * cache creation.
+ */
+ for (i = 0; i < 5; i++) {
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p)
+ goto free_cache;
+
+ kmem_cache_free(cache, p);
+ msleep(100);
+ }
+
+free_cache:
+ kmem_cache_destroy(cache);
+}
+
+static void kmem_cache_bulk(struct kunit *test)
+{
+ struct kmem_cache *cache;
+ size_t size = 200;
+ char *p[10];
+ bool ret;
+ int i;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p);
+ if (!ret) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(p); i++)
+ p[i][0] = p[i][size - 1] = 42;
+
+ kmem_cache_free_bulk(cache, ARRAY_SIZE(p), (void **)&p);
+ kmem_cache_destroy(cache);
+}
+
+static char global_array[10];
+
+static void kasan_global_oob_right(struct kunit *test)
+{
+ /*
+ * Deliberate out-of-bounds access. To prevent CONFIG_UBSAN_LOCAL_BOUNDS
+ * from failing here and panicking the kernel, access the array via a
+ * volatile pointer, which will prevent the compiler from being able to
+ * determine the array bounds.
+ *
+ * This access uses a volatile pointer to char (char *volatile) rather
+ * than the more conventional pointer to volatile char (volatile char *)
+ * because we want to prevent the compiler from making inferences about
+ * the pointer itself (i.e. its array bounds), not the data that it
+ * refers to.
+ */
+ char *volatile array = global_array;
+ char *p = &array[ARRAY_SIZE(global_array) + 3];
+
+ /* Only generic mode instruments globals. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_global_oob_left(struct kunit *test)
+{
+ char *volatile array = global_array;
+ char *p = array - 3;
+
+ /*
+ * GCC is known to fail this test, skip it.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=215051.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_CC_IS_CLANG);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+/* Check that ksize() makes the whole object accessible. */
+static void ksize_unpoisons_memory(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 123, real_size;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ real_size = ksize(ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+
+ /* This access shouldn't trigger a KASAN report. */
+ ptr[size] = 'x';
+
+ /* This one must. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size]);
+
+ kfree(ptr);
+}
+
+/*
+ * Check that a use-after-free is detected by ksize() and via normal accesses
+ * after it.
+ */
+static void ksize_uaf(struct kunit *test)
+{
+ char *ptr;
+ int size = 128 - KASAN_GRANULE_SIZE;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ kfree(ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr));
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]);
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]);
+}
+
+static void kasan_stack_oob(struct kunit *test)
+{
+ char stack_array[10];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = stack_array;
+ char *p = &array[ARRAY_SIZE(stack_array) + OOB_TAG_OFF];
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_alloca_oob_left(struct kunit *test)
+{
+ volatile int i = 10;
+ char alloca_array[i];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = alloca_array;
+ char *p = array - 1;
+
+ /* Only generic mode instruments dynamic allocas. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kasan_alloca_oob_right(struct kunit *test)
+{
+ volatile int i = 10;
+ char alloca_array[i];
+ /* See comment in kasan_global_oob_right. */
+ char *volatile array = alloca_array;
+ char *p = array + i;
+
+ /* Only generic mode instruments dynamic allocas. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK);
+
+ KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p);
+}
+
+static void kmem_cache_double_free(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, 0, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ kmem_cache_free(cache, p);
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p));
+ kmem_cache_destroy(cache);
+}
+
+static void kmem_cache_invalid_free(struct kunit *test)
+{
+ char *p;
+ size_t size = 200;
+ struct kmem_cache *cache;
+
+ cache = kmem_cache_create("test_cache", size, 0, SLAB_TYPESAFE_BY_RCU,
+ NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+
+ p = kmem_cache_alloc(cache, GFP_KERNEL);
+ if (!p) {
+ kunit_err(test, "Allocation failed: %s\n", __func__);
+ kmem_cache_destroy(cache);
+ return;
+ }
+
+ /* Trigger invalid free, the object doesn't get freed. */
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1));
+
+ /*
+ * Properly free the object to prevent the "Objects remaining in
+ * test_cache on __kmem_cache_shutdown" BUG failure.
+ */
+ kmem_cache_free(cache, p);
+
+ kmem_cache_destroy(cache);
+}
+
+static void empty_cache_ctor(void *object) { }
+
+static void kmem_cache_double_destroy(struct kunit *test)
+{
+ struct kmem_cache *cache;
+
+ /* Provide a constructor to prevent cache merging. */
+ cache = kmem_cache_create("test_cache", 200, 0, 0, empty_cache_ctor);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache);
+ kmem_cache_destroy(cache);
+ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_destroy(cache));
+}
+
+static void kasan_memchr(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ if (OOB_TAG_OFF)
+ size = round_up(size, OOB_TAG_OFF);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_ptr_result = memchr(ptr, '1', size + 1));
+
+ kfree(ptr);
+}
+
+static void kasan_memcmp(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+ int arr[9];
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ if (OOB_TAG_OFF)
+ size = round_up(size, OOB_TAG_OFF);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ memset(arr, 0, sizeof(arr));
+
+ OPTIMIZER_HIDE_VAR(ptr);
+ OPTIMIZER_HIDE_VAR(size);
+ KUNIT_EXPECT_KASAN_FAIL(test,
+ kasan_int_result = memcmp(ptr, arr, size+1));
+ kfree(ptr);
+}
+
+static void kasan_strings(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 24;
+
+ /*
+ * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details.
+ */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT);
+
+ ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree(ptr);
+
+ /*
+ * Try to cause only 1 invalid access (less spam in dmesg).
+ * For that we need ptr to point to zeroed byte.
+ * Skip metadata that could be stored in freed object so ptr
+ * will likely point to zeroed byte.
+ */
+ ptr += 16;
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1'));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1'));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2"));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr));
+
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1));
+}
+
+static void kasan_bitops_modify(struct kunit *test, int nr, void *addr)
+{
+ KUNIT_EXPECT_KASAN_FAIL(test, set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, clear_bit_unlock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __clear_bit_unlock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __change_bit(nr, addr));
+}
+
+static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr)
+{
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr));
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr));
+
+#if defined(clear_bit_unlock_is_negative_byte)
+ KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result =
+ clear_bit_unlock_is_negative_byte(nr, addr));
+#endif
+}
+
+static void kasan_bitops_generic(struct kunit *test)
+{
+ long *bits;
+
+ /* This test is specifically crafted for the generic mode. */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);
+
+ /*
+ * Allocate 1 more byte, which causes kzalloc to round up to 16 bytes;
+ * this way we do not actually corrupt other memory.
+ */
+ bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits);
+
+ /*
+ * Below calls try to access bit within allocated memory; however, the
+ * below accesses are still out-of-bounds, since bitops are defined to
+ * operate on the whole long the bit is in.
+ */
+ kasan_bitops_modify(test, BITS_PER_LONG, bits);
+
+ /*
+ * Below calls try to access bit beyond allocated memory.
+ */
+ kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, bits);
+
+ kfree(bits);
+}
+
+static void kasan_bitops_tags(struct kunit *test)
+{
+ long *bits;
+
+ /* This test is specifically crafted for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ /* kmalloc-64 cache will be used and the last 16 bytes will be the redzone. */
+ bits = kzalloc(48, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits);
+
+ /* Do the accesses past the 48 allocated bytes, but within the redone. */
+ kasan_bitops_modify(test, BITS_PER_LONG, (void *)bits + 48);
+ kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, (void *)bits + 48);
+
+ kfree(bits);
+}
+
+static void kmalloc_double_kzfree(struct kunit *test)
+{
+ char *ptr;
+ size_t size = 16;
+
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ kfree_sensitive(ptr);
+ KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr));
+}
+
+static void vmalloc_helpers_tags(struct kunit *test)
+{
+ void *ptr;
+
+ /* This test is intended for tag-based modes. */
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ ptr = vmalloc(PAGE_SIZE);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* Check that the returned pointer is tagged. */
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure exported vmalloc helpers handle tagged pointers. */
+ KUNIT_ASSERT_TRUE(test, is_vmalloc_addr(ptr));
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, vmalloc_to_page(ptr));
+
+#if !IS_MODULE(CONFIG_KASAN_KUNIT_TEST)
+ {
+ int rv;
+
+ /* Make sure vmalloc'ed memory permissions can be changed. */
+ rv = set_memory_ro((unsigned long)ptr, 1);
+ KUNIT_ASSERT_GE(test, rv, 0);
+ rv = set_memory_rw((unsigned long)ptr, 1);
+ KUNIT_ASSERT_GE(test, rv, 0);
+ }
+#endif
+
+ vfree(ptr);
+}
+
+static void vmalloc_oob(struct kunit *test)
+{
+ char *v_ptr, *p_ptr;
+ struct page *page;
+ size_t size = PAGE_SIZE / 2 - KASAN_GRANULE_SIZE - 5;
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ v_ptr = vmalloc(size);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ OPTIMIZER_HIDE_VAR(v_ptr);
+
+ /*
+ * We have to be careful not to hit the guard page in vmalloc tests.
+ * The MMU will catch that and crash us.
+ */
+
+ /* Make sure in-bounds accesses are valid. */
+ v_ptr[0] = 0;
+ v_ptr[size - 1] = 0;
+
+ /*
+ * An unaligned access past the requested vmalloc size.
+ * Only generic KASAN can precisely detect these.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]);
+
+ /* An aligned access into the first out-of-bounds granule. */
+ KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]);
+
+ /* Check that in-bounds accesses to the physical page are valid. */
+ page = vmalloc_to_page(v_ptr);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+ p_ptr = page_address(page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+ p_ptr[0] = 0;
+
+ vfree(v_ptr);
+
+ /*
+ * We can't check for use-after-unmap bugs in this nor in the following
+ * vmalloc tests, as the page might be fully unmapped and accessing it
+ * will crash the kernel.
+ */
+}
+
+static void vmap_tags(struct kunit *test)
+{
+ char *p_ptr, *v_ptr;
+ struct page *p_page, *v_page;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons vmap mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC);
+
+ p_page = alloc_pages(GFP_KERNEL, 1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_page);
+ p_ptr = page_address(p_page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+ v_ptr = vmap(&p_page, 1, VM_MAP, PAGE_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ /*
+ * We can't check for out-of-bounds bugs in this nor in the following
+ * vmalloc tests, as allocations have page granularity and accessing
+ * the guard page will crash the kernel.
+ */
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses through both pointers work. */
+ *p_ptr = 0;
+ *v_ptr = 0;
+
+ /* Make sure vmalloc_to_page() correctly recovers the page pointer. */
+ v_page = vmalloc_to_page(v_ptr);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_page);
+ KUNIT_EXPECT_PTR_EQ(test, p_page, v_page);
+
+ vunmap(v_ptr);
+ free_pages((unsigned long)p_ptr, 1);
+}
+
+static void vm_map_ram_tags(struct kunit *test)
+{
+ char *p_ptr, *v_ptr;
+ struct page *page;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons vm_map_ram mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ page = alloc_pages(GFP_KERNEL, 1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, page);
+ p_ptr = page_address(page);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p_ptr);
+
+ v_ptr = vm_map_ram(&page, 1, -1);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, v_ptr);
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(v_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses through both pointers work. */
+ *p_ptr = 0;
+ *v_ptr = 0;
+
+ vm_unmap_ram(v_ptr, 1);
+ free_pages((unsigned long)p_ptr, 1);
+}
+
+static void vmalloc_percpu(struct kunit *test)
+{
+ char __percpu *ptr;
+ int cpu;
+
+ /*
+ * This test is specifically crafted for the software tag-based mode,
+ * the only tag-based mode that poisons percpu mappings.
+ */
+ KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+
+ ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+
+ for_each_possible_cpu(cpu) {
+ char *c_ptr = per_cpu_ptr(ptr, cpu);
+
+ KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* Make sure that in-bounds accesses don't crash the kernel. */
+ *c_ptr = 0;
+ }
+
+ free_percpu(ptr);
+}
+
+/*
+ * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN,
+ * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based
+ * modes.
+ */
+static void match_all_not_assigned(struct kunit *test)
+{
+ char *ptr;
+ struct page *pages;
+ int i, size, order;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ for (i = 0; i < 256; i++) {
+ size = prandom_u32_max(1024) + 1;
+ ptr = kmalloc(size, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ kfree(ptr);
+ }
+
+ for (i = 0; i < 256; i++) {
+ order = prandom_u32_max(4) + 1;
+ pages = alloc_pages(GFP_KERNEL, order);
+ ptr = page_address(pages);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ free_pages((unsigned long)ptr, order);
+ }
+
+ if (!IS_ENABLED(CONFIG_KASAN_VMALLOC))
+ return;
+
+ for (i = 0; i < 256; i++) {
+ size = prandom_u32_max(1024) + 1;
+ ptr = vmalloc(size);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN);
+ KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+ vfree(ptr);
+ }
+}
+
+/* Check that 0xff works as a match-all pointer tag for tag-based modes. */
+static void match_all_ptr_tag(struct kunit *test)
+{
+ char *ptr;
+ u8 tag;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(128, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+
+ /* Backup the assigned tag. */
+ tag = get_tag(ptr);
+ KUNIT_EXPECT_NE(test, tag, (u8)KASAN_TAG_KERNEL);
+
+ /* Reset the tag to 0xff.*/
+ ptr = set_tag(ptr, KASAN_TAG_KERNEL);
+
+ /* This access shouldn't trigger a KASAN report. */
+ *ptr = 0;
+
+ /* Recover the pointer tag and free. */
+ ptr = set_tag(ptr, tag);
+ kfree(ptr);
+}
+
+/* Check that there are no match-all memory tags for tag-based modes. */
+static void match_all_mem_tag(struct kunit *test)
+{
+ char *ptr;
+ int tag;
+
+ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC);
+
+ ptr = kmalloc(128, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ KUNIT_EXPECT_NE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL);
+
+ /* For each possible tag value not matching the pointer tag. */
+ for (tag = KASAN_TAG_MIN; tag <= KASAN_TAG_KERNEL; tag++) {
+ if (tag == get_tag(ptr))
+ continue;
+
+ /* Mark the first memory granule with the chosen memory tag. */
+ kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag, false);
+
+ /* This access must cause a KASAN report. */
+ KUNIT_EXPECT_KASAN_FAIL(test, *ptr = 0);
+ }
+
+ /* Recover the memory tag and free. */
+ kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr), false);
+ kfree(ptr);
+}
+
+static struct kunit_case kasan_kunit_test_cases[] = {
+ KUNIT_CASE(kmalloc_oob_right),
+ KUNIT_CASE(kmalloc_oob_left),
+ KUNIT_CASE(kmalloc_node_oob_right),
+ KUNIT_CASE(kmalloc_pagealloc_oob_right),
+ KUNIT_CASE(kmalloc_pagealloc_uaf),
+ KUNIT_CASE(kmalloc_pagealloc_invalid_free),
+ KUNIT_CASE(pagealloc_oob_right),
+ KUNIT_CASE(pagealloc_uaf),
+ KUNIT_CASE(kmalloc_large_oob_right),
+ KUNIT_CASE(krealloc_more_oob),
+ KUNIT_CASE(krealloc_less_oob),
+ KUNIT_CASE(krealloc_pagealloc_more_oob),
+ KUNIT_CASE(krealloc_pagealloc_less_oob),
+ KUNIT_CASE(krealloc_uaf),
+ KUNIT_CASE(kmalloc_oob_16),
+ KUNIT_CASE(kmalloc_uaf_16),
+ KUNIT_CASE(kmalloc_oob_in_memset),
+ KUNIT_CASE(kmalloc_oob_memset_2),
+ KUNIT_CASE(kmalloc_oob_memset_4),
+ KUNIT_CASE(kmalloc_oob_memset_8),
+ KUNIT_CASE(kmalloc_oob_memset_16),
+ KUNIT_CASE(kmalloc_memmove_negative_size),
+ KUNIT_CASE(kmalloc_memmove_invalid_size),
+ KUNIT_CASE(kmalloc_uaf),
+ KUNIT_CASE(kmalloc_uaf_memset),
+ KUNIT_CASE(kmalloc_uaf2),
+ KUNIT_CASE(kmalloc_uaf3),
+ KUNIT_CASE(kfree_via_page),
+ KUNIT_CASE(kfree_via_phys),
+ KUNIT_CASE(kmem_cache_oob),
+ KUNIT_CASE(kmem_cache_accounted),
+ KUNIT_CASE(kmem_cache_bulk),
+ KUNIT_CASE(kasan_global_oob_right),
+ KUNIT_CASE(kasan_global_oob_left),
+ KUNIT_CASE(kasan_stack_oob),
+ KUNIT_CASE(kasan_alloca_oob_left),
+ KUNIT_CASE(kasan_alloca_oob_right),
+ KUNIT_CASE(ksize_unpoisons_memory),
+ KUNIT_CASE(ksize_uaf),
+ KUNIT_CASE(kmem_cache_double_free),
+ KUNIT_CASE(kmem_cache_invalid_free),
+ KUNIT_CASE(kmem_cache_double_destroy),
+ KUNIT_CASE(kasan_memchr),
+ KUNIT_CASE(kasan_memcmp),
+ KUNIT_CASE(kasan_strings),
+ KUNIT_CASE(kasan_bitops_generic),
+ KUNIT_CASE(kasan_bitops_tags),
+ KUNIT_CASE(kmalloc_double_kzfree),
+ KUNIT_CASE(vmalloc_helpers_tags),
+ KUNIT_CASE(vmalloc_oob),
+ KUNIT_CASE(vmap_tags),
+ KUNIT_CASE(vm_map_ram_tags),
+ KUNIT_CASE(vmalloc_percpu),
+ KUNIT_CASE(match_all_not_assigned),
+ KUNIT_CASE(match_all_ptr_tag),
+ KUNIT_CASE(match_all_mem_tag),
+ {}
+};
+
+static struct kunit_suite kasan_kunit_test_suite = {
+ .name = "kasan",
+ .init = kasan_test_init,
+ .test_cases = kasan_kunit_test_cases,
+ .exit = kasan_test_exit,
+};
+
+kunit_test_suite(kasan_kunit_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c
new file mode 100644
index 000000000000..e4ca82dc2c16
--- /dev/null
+++ b/mm/kasan/kasan_test_module.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ */
+
+#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
+
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "kasan.h"
+
+static noinline void __init copy_user_test(void)
+{
+ char *kmem;
+ char __user *usermem;
+ size_t size = 128 - KASAN_GRANULE_SIZE;
+ int __maybe_unused unused;
+
+ kmem = kmalloc(size, GFP_KERNEL);
+ if (!kmem)
+ return;
+
+ usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, 0);
+ if (IS_ERR(usermem)) {
+ pr_err("Failed to allocate user memory\n");
+ kfree(kmem);
+ return;
+ }
+
+ OPTIMIZER_HIDE_VAR(size);
+
+ pr_info("out-of-bounds in copy_from_user()\n");
+ unused = copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in copy_to_user()\n");
+ unused = copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user()\n");
+ unused = __copy_from_user(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user()\n");
+ unused = __copy_to_user(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in __copy_from_user_inatomic()\n");
+ unused = __copy_from_user_inatomic(kmem, usermem, size + 1);
+
+ pr_info("out-of-bounds in __copy_to_user_inatomic()\n");
+ unused = __copy_to_user_inatomic(usermem, kmem, size + 1);
+
+ pr_info("out-of-bounds in strncpy_from_user()\n");
+ unused = strncpy_from_user(kmem, usermem, size + 1);
+
+ vm_munmap((unsigned long)usermem, PAGE_SIZE);
+ kfree(kmem);
+}
+
+static struct kasan_rcu_info {
+ int i;
+ struct rcu_head rcu;
+} *global_rcu_ptr;
+
+static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp)
+{
+ struct kasan_rcu_info *fp = container_of(rp,
+ struct kasan_rcu_info, rcu);
+
+ kfree(fp);
+ ((volatile struct kasan_rcu_info *)fp)->i;
+}
+
+static noinline void __init kasan_rcu_uaf(void)
+{
+ struct kasan_rcu_info *ptr;
+
+ pr_info("use-after-free in kasan_rcu_reclaim\n");
+ ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL);
+ if (!ptr) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ global_rcu_ptr = rcu_dereference_protected(ptr, NULL);
+ call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim);
+}
+
+static noinline void __init kasan_workqueue_work(struct work_struct *work)
+{
+ kfree(work);
+}
+
+static noinline void __init kasan_workqueue_uaf(void)
+{
+ struct workqueue_struct *workqueue;
+ struct work_struct *work;
+
+ workqueue = create_workqueue("kasan_wq_test");
+ if (!workqueue) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+ work = kmalloc(sizeof(struct work_struct), GFP_KERNEL);
+ if (!work) {
+ pr_err("Allocation failed\n");
+ return;
+ }
+
+ INIT_WORK(work, kasan_workqueue_work);
+ queue_work(workqueue, work);
+ destroy_workqueue(workqueue);
+
+ pr_info("use-after-free on workqueue\n");
+ ((volatile struct work_struct *)work)->data;
+}
+
+static int __init test_kasan_module_init(void)
+{
+ /*
+ * Temporarily enable multi-shot mode. Otherwise, KASAN would only
+ * report the first detected bug and panic the kernel if panic_on_warn
+ * is enabled.
+ */
+ bool multishot = kasan_save_enable_multi_shot();
+
+ copy_user_test();
+ kasan_rcu_uaf();
+ kasan_workqueue_uaf();
+
+ kasan_restore_multi_shot(multishot);
+ return -EAGAIN;
+}
+
+module_init(test_kasan_module_init);
+MODULE_LICENSE("GPL");
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index fe3f606b3a98..df3602062bfd 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -175,18 +175,14 @@ static void end_report(unsigned long *flags, void *addr)
static void print_error_description(struct kasan_report_info *info)
{
- if (info->type == KASAN_REPORT_INVALID_FREE) {
- pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip);
- return;
- }
+ pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip);
- if (info->type == KASAN_REPORT_DOUBLE_FREE) {
- pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip);
+ if (info->type != KASAN_REPORT_ACCESS) {
+ pr_err("Free of addr %px by task %s/%d\n",
+ info->access_addr, current->comm, task_pid_nr(current));
return;
}
- pr_err("BUG: KASAN: %s in %pS\n",
- kasan_get_bug_type(info), (void *)info->ip);
if (info->access_size)
pr_err("%s of size %zu at addr %px by task %s/%d\n",
info->is_write ? "Write" : "Read", info->access_size,
@@ -200,31 +196,21 @@ static void print_error_description(struct kasan_report_info *info)
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
- if (track->stack) {
+ if (track->stack)
stack_depot_print(track->stack);
- } else {
+ else
pr_err("(stack is not available)\n");
- }
}
-struct page *kasan_addr_to_page(const void *addr)
+static inline struct page *addr_to_page(const void *addr)
{
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory))
+ if (virt_addr_valid(addr))
return virt_to_head_page(addr);
return NULL;
}
-struct slab *kasan_addr_to_slab(const void *addr)
-{
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory))
- return virt_to_slab(addr);
- return NULL;
-}
-
-static void describe_object_addr(struct kmem_cache *cache, void *object,
- const void *addr)
+static void describe_object_addr(const void *addr, struct kmem_cache *cache,
+ void *object)
{
unsigned long access_addr = (unsigned long)addr;
unsigned long object_addr = (unsigned long)object;
@@ -252,46 +238,26 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
-static void describe_object_stacks(struct kmem_cache *cache, void *object,
- const void *addr, u8 tag)
+static void describe_object_stacks(struct kasan_report_info *info)
{
- struct kasan_alloc_meta *alloc_meta;
- struct kasan_track *free_track;
-
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (alloc_meta) {
- print_track(&alloc_meta->alloc_track, "Allocated");
+ if (info->alloc_track.stack) {
+ print_track(&info->alloc_track, "Allocated");
pr_err("\n");
}
- free_track = kasan_get_free_track(cache, object, tag);
- if (free_track) {
- print_track(free_track, "Freed");
+ if (info->free_track.stack) {
+ print_track(&info->free_track, "Freed");
pr_err("\n");
}
-#ifdef CONFIG_KASAN_GENERIC
- if (!alloc_meta)
- return;
- if (alloc_meta->aux_stack[0]) {
- pr_err("Last potentially related work creation:\n");
- stack_depot_print(alloc_meta->aux_stack[0]);
- pr_err("\n");
- }
- if (alloc_meta->aux_stack[1]) {
- pr_err("Second to last potentially related work creation:\n");
- stack_depot_print(alloc_meta->aux_stack[1]);
- pr_err("\n");
- }
-#endif
+ kasan_print_aux_stacks(info->cache, info->object);
}
-static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr, u8 tag)
+static void describe_object(const void *addr, struct kasan_report_info *info)
{
if (kasan_stack_collection_enabled())
- describe_object_stacks(cache, object, addr, tag);
- describe_object_addr(cache, object, addr);
+ describe_object_stacks(info);
+ describe_object_addr(addr, info->cache, info->object);
}
static inline bool kernel_or_module_addr(const void *addr)
@@ -310,19 +276,16 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-static void print_address_description(void *addr, u8 tag)
+static void print_address_description(void *addr, u8 tag,
+ struct kasan_report_info *info)
{
- struct page *page = kasan_addr_to_page(addr);
+ struct page *page = addr_to_page(addr);
dump_stack_lvl(KERN_ERR);
pr_err("\n");
- if (page && PageSlab(page)) {
- struct slab *slab = page_slab(page);
- struct kmem_cache *cache = slab->slab_cache;
- void *object = nearest_obj(cache, slab, addr);
-
- describe_object(cache, object, addr, tag);
+ if (info->cache && info->object) {
+ describe_object(addr, info);
pr_err("\n");
}
@@ -420,23 +383,56 @@ static void print_memory_metadata(const void *addr)
static void print_report(struct kasan_report_info *info)
{
- void *tagged_addr = info->access_addr;
- void *untagged_addr = kasan_reset_tag(tagged_addr);
- u8 tag = get_tag(tagged_addr);
+ void *addr = kasan_reset_tag(info->access_addr);
+ u8 tag = get_tag(info->access_addr);
print_error_description(info);
- if (addr_has_metadata(untagged_addr))
+ if (addr_has_metadata(addr))
kasan_print_tags(tag, info->first_bad_addr);
pr_err("\n");
- if (addr_has_metadata(untagged_addr)) {
- print_address_description(untagged_addr, tag);
+ if (addr_has_metadata(addr)) {
+ print_address_description(addr, tag, info);
print_memory_metadata(info->first_bad_addr);
} else {
dump_stack_lvl(KERN_ERR);
}
}
+static void complete_report_info(struct kasan_report_info *info)
+{
+ void *addr = kasan_reset_tag(info->access_addr);
+ struct slab *slab;
+
+ if (info->type == KASAN_REPORT_ACCESS)
+ info->first_bad_addr = kasan_find_first_bad_addr(
+ info->access_addr, info->access_size);
+ else
+ info->first_bad_addr = addr;
+
+ slab = kasan_addr_to_slab(addr);
+ if (slab) {
+ info->cache = slab->slab_cache;
+ info->object = nearest_obj(info->cache, slab, addr);
+ } else
+ info->cache = info->object = NULL;
+
+ switch (info->type) {
+ case KASAN_REPORT_INVALID_FREE:
+ info->bug_type = "invalid-free";
+ break;
+ case KASAN_REPORT_DOUBLE_FREE:
+ info->bug_type = "double-free";
+ break;
+ default:
+ /* bug_type filled in by kasan_complete_mode_report_info. */
+ break;
+ }
+
+ /* Fill in mode-specific report info fields. */
+ kasan_complete_mode_report_info(info);
+}
+
void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type)
{
unsigned long flags;
@@ -452,13 +448,15 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty
start_report(&flags, true);
+ memset(&info, 0, sizeof(info));
info.type = type;
info.access_addr = ptr;
- info.first_bad_addr = kasan_reset_tag(ptr);
info.access_size = 0;
info.is_write = false;
info.ip = ip;
+ complete_report_info(&info);
+
print_report(&info);
end_report(&flags, ptr);
@@ -485,13 +483,15 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write,
start_report(&irq_flags, true);
+ memset(&info, 0, sizeof(info));
info.type = KASAN_REPORT_ACCESS;
info.access_addr = ptr;
- info.first_bad_addr = kasan_find_first_bad_addr(ptr, size);
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
+ complete_report_info(&info);
+
print_report(&info);
end_report(&irq_flags, ptr);
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 6689fb9a919b..043c94b04605 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -109,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_report_info *info)
return bug_type;
}
-const char *kasan_get_bug_type(struct kasan_report_info *info)
+static const char *get_bug_type(struct kasan_report_info *info)
{
/*
* If access_size is a negative number, then it has reason to be
@@ -127,11 +127,55 @@ const char *kasan_get_bug_type(struct kasan_report_info *info)
return get_wild_bug_type(info);
}
+void kasan_complete_mode_report_info(struct kasan_report_info *info)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ struct kasan_free_meta *free_meta;
+
+ if (!info->bug_type)
+ info->bug_type = get_bug_type(info);
+
+ if (!info->cache || !info->object)
+ return;
+
+ alloc_meta = kasan_get_alloc_meta(info->cache, info->object);
+ if (alloc_meta)
+ memcpy(&info->alloc_track, &alloc_meta->alloc_track,
+ sizeof(info->alloc_track));
+
+ if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) {
+ /* Free meta must be present with KASAN_SLAB_FREETRACK. */
+ free_meta = kasan_get_free_meta(info->cache, info->object);
+ memcpy(&info->free_track, &free_meta->free_track,
+ sizeof(info->free_track));
+ }
+}
+
void kasan_metadata_fetch_row(char *buffer, void *row)
{
memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
}
+void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return;
+
+ if (alloc_meta->aux_stack[0]) {
+ pr_err("Last potentially related work creation:\n");
+ stack_depot_print(alloc_meta->aux_stack[0]);
+ pr_err("\n");
+ }
+ if (alloc_meta->aux_stack[1]) {
+ pr_err("Second to last potentially related work creation:\n");
+ stack_depot_print(alloc_meta->aux_stack[1]);
+ pr_err("\n");
+ }
+}
+
#ifdef CONFIG_KASAN_STACK
static bool __must_check tokenize_frame_descr(const char **frame_descr,
char *token, size_t max_tok_len,
diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c
index e25d2166e813..ecede06ef374 100644
--- a/mm/kasan/report_tags.c
+++ b/mm/kasan/report_tags.c
@@ -4,38 +4,14 @@
* Copyright (c) 2020 Google, Inc.
*/
+#include <linux/atomic.h>
+
#include "kasan.h"
-#include "../slab.h"
-const char *kasan_get_bug_type(struct kasan_report_info *info)
-{
-#ifdef CONFIG_KASAN_TAGS_IDENTIFY
- struct kasan_alloc_meta *alloc_meta;
- struct kmem_cache *cache;
- struct slab *slab;
- const void *addr;
- void *object;
- u8 tag;
- int i;
-
- tag = get_tag(info->access_addr);
- addr = kasan_reset_tag(info->access_addr);
- slab = kasan_addr_to_slab(addr);
- if (slab) {
- cache = slab->slab_cache;
- object = nearest_obj(cache, slab, (void *)addr);
- alloc_meta = kasan_get_alloc_meta(cache, object);
-
- if (alloc_meta) {
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- return "use-after-free";
- }
- }
- return "out-of-bounds";
- }
-#endif
+extern struct kasan_stack_ring stack_ring;
+static const char *get_common_bug_type(struct kasan_report_info *info)
+{
/*
* If access_size is a negative number, then it has reason to be
* defined as out-of-bounds bug type.
@@ -49,3 +25,92 @@ const char *kasan_get_bug_type(struct kasan_report_info *info)
return "invalid-access";
}
+
+void kasan_complete_mode_report_info(struct kasan_report_info *info)
+{
+ unsigned long flags;
+ u64 pos;
+ struct kasan_stack_ring_entry *entry;
+ void *ptr;
+ u32 pid;
+ depot_stack_handle_t stack;
+ bool is_free;
+ bool alloc_found = false, free_found = false;
+
+ if ((!info->cache || !info->object) && !info->bug_type) {
+ info->bug_type = get_common_bug_type(info);
+ return;
+ }
+
+ write_lock_irqsave(&stack_ring.lock, flags);
+
+ pos = atomic64_read(&stack_ring.pos);
+
+ /*
+ * The loop below tries to find stack ring entries relevant to the
+ * buggy object. This is a best-effort process.
+ *
+ * First, another object with the same tag can be allocated in place of
+ * the buggy object. Also, since the number of entries is limited, the
+ * entries relevant to the buggy object can be overwritten.
+ */
+
+ for (u64 i = pos - 1; i != pos - 1 - stack_ring.size; i--) {
+ if (alloc_found && free_found)
+ break;
+
+ entry = &stack_ring.entries[i % stack_ring.size];
+
+ /* Paired with smp_store_release() in save_stack_info(). */
+ ptr = (void *)smp_load_acquire(&entry->ptr);
+
+ if (kasan_reset_tag(ptr) != info->object ||
+ get_tag(ptr) != get_tag(info->access_addr))
+ continue;
+
+ pid = READ_ONCE(entry->pid);
+ stack = READ_ONCE(entry->stack);
+ is_free = READ_ONCE(entry->is_free);
+
+ if (is_free) {
+ /*
+ * Second free of the same object.
+ * Give up on trying to find the alloc entry.
+ */
+ if (free_found)
+ break;
+
+ info->free_track.pid = pid;
+ info->free_track.stack = stack;
+ free_found = true;
+
+ /*
+ * If a free entry is found first, the bug is likely
+ * a use-after-free.
+ */
+ if (!info->bug_type)
+ info->bug_type = "use-after-free";
+ } else {
+ /* Second alloc of the same object. Give up. */
+ if (alloc_found)
+ break;
+
+ info->alloc_track.pid = pid;
+ info->alloc_track.stack = stack;
+ alloc_found = true;
+
+ /*
+ * If an alloc entry is found first, the bug is likely
+ * an out-of-bounds.
+ */
+ if (!info->bug_type)
+ info->bug_type = "slab-out-of-bounds";
+ }
+ }
+
+ write_unlock_irqrestore(&stack_ring.lock, flags);
+
+ /* Assign the common bug type if no entries were found. */
+ if (!info->bug_type)
+ info->bug_type = get_common_bug_type(info);
+}
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 77f13f391b57..a3afaf2ad1b1 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -42,7 +42,10 @@ void __init kasan_init_sw_tags(void)
for_each_possible_cpu(cpu)
per_cpu(prng_state, cpu) = (u32)get_cycles();
- pr_info("KernelAddressSanitizer initialized (sw-tags)\n");
+ kasan_init_tags();
+
+ pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n",
+ kasan_stack_collection_enabled() ? "on" : "off");
}
/*
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 8f48b9502a17..67a222586846 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -6,9 +6,11 @@
* Copyright (c) 2020 Google, Inc.
*/
+#include <linux/atomic.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
+#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/mm.h>
#include <linux/static_key.h>
@@ -16,44 +18,127 @@
#include <linux/types.h>
#include "kasan.h"
+#include "../slab.h"
-void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- u8 idx = 0;
+#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10)
+
+enum kasan_arg_stacktrace {
+ KASAN_ARG_STACKTRACE_DEFAULT,
+ KASAN_ARG_STACKTRACE_OFF,
+ KASAN_ARG_STACKTRACE_ON,
+};
+
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata;
+
+/* Whether to collect alloc/free stack traces. */
+DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace);
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (!alloc_meta)
- return;
+/* Non-zero, as initial pointer values are 0. */
+#define STACK_RING_BUSY_PTR ((void *)1)
+
+struct kasan_stack_ring stack_ring = {
+ .lock = __RW_LOCK_UNLOCKED(stack_ring.lock)
+};
+
+/* kasan.stacktrace=off/on */
+static int __init early_kasan_flag_stacktrace(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
-#ifdef CONFIG_KASAN_TAGS_IDENTIFY
- idx = alloc_meta->free_track_idx;
- alloc_meta->free_pointer_tag[idx] = tag;
- alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
-#endif
+ if (!strcmp(arg, "off"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON;
+ else
+ return -EINVAL;
- kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+ return 0;
}
+early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
-struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
+/* kasan.stack_ring_size=<number of entries> */
+static int __init early_kasan_flag_stack_ring_size(char *arg)
{
- struct kasan_alloc_meta *alloc_meta;
- int i = 0;
+ if (!arg)
+ return -EINVAL;
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (!alloc_meta)
- return NULL;
+ return kstrtoul(arg, 0, &stack_ring.size);
+}
+early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size);
+
+void __init kasan_init_tags(void)
+{
+ switch (kasan_arg_stacktrace) {
+ case KASAN_ARG_STACKTRACE_DEFAULT:
+ /* Default is specified by kasan_flag_stacktrace definition. */
+ break;
+ case KASAN_ARG_STACKTRACE_OFF:
+ static_branch_disable(&kasan_flag_stacktrace);
+ break;
+ case KASAN_ARG_STACKTRACE_ON:
+ static_branch_enable(&kasan_flag_stacktrace);
+ break;
+ }
-#ifdef CONFIG_KASAN_TAGS_IDENTIFY
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- break;
+ if (kasan_stack_collection_enabled()) {
+ if (!stack_ring.size)
+ stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT;
+ stack_ring.entries = memblock_alloc(
+ sizeof(stack_ring.entries[0]) * stack_ring.size,
+ SMP_CACHE_BYTES);
+ if (WARN_ON(!stack_ring.entries))
+ static_branch_disable(&kasan_flag_stacktrace);
}
- if (i == KASAN_NR_FREE_STACKS)
- i = alloc_meta->free_track_idx;
-#endif
+}
+
+static void save_stack_info(struct kmem_cache *cache, void *object,
+ gfp_t gfp_flags, bool is_free)
+{
+ unsigned long flags;
+ depot_stack_handle_t stack;
+ u64 pos;
+ struct kasan_stack_ring_entry *entry;
+ void *old_ptr;
+
+ stack = kasan_save_stack(gfp_flags, true);
+
+ /*
+ * Prevent save_stack_info() from modifying stack ring
+ * when kasan_complete_mode_report_info() is walking it.
+ */
+ read_lock_irqsave(&stack_ring.lock, flags);
+
+next:
+ pos = atomic64_fetch_add(1, &stack_ring.pos);
+ entry = &stack_ring.entries[pos % stack_ring.size];
+
+ /* Detect stack ring entry slots that are being written to. */
+ old_ptr = READ_ONCE(entry->ptr);
+ if (old_ptr == STACK_RING_BUSY_PTR)
+ goto next; /* Busy slot. */
+ if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR))
+ goto next; /* Busy slot. */
+
+ WRITE_ONCE(entry->size, cache->object_size);
+ WRITE_ONCE(entry->pid, current->pid);
+ WRITE_ONCE(entry->stack, stack);
+ WRITE_ONCE(entry->is_free, is_free);
+
+ /*
+ * Paired with smp_load_acquire() in kasan_complete_mode_report_info().
+ */
+ smp_store_release(&entry->ptr, (s64)object);
- return &alloc_meta->free_track[i];
+ read_unlock_irqrestore(&stack_ring.lock, flags);
+}
+
+void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ save_stack_info(cache, object, flags, false);
+}
+
+void kasan_save_free_info(struct kmem_cache *cache, void *object)
+{
+ save_stack_info(cache, object, GFP_NOWAIT, true);
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index c252081b11df..141788858b70 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -719,24 +719,13 @@ static int show_object(struct seq_file *seq, void *v)
return 0;
}
-static const struct seq_operations object_seqops = {
+static const struct seq_operations objects_sops = {
.start = start_object,
.next = next_object,
.stop = stop_object,
.show = show_object,
};
-
-static int open_objects(struct inode *inode, struct file *file)
-{
- return seq_open(file, &object_seqops);
-}
-
-static const struct file_operations objects_fops = {
- .open = open_objects,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(objects);
static int __init kfence_debugfs_init(void)
{
@@ -864,7 +853,7 @@ static void kfence_init_enable(void)
void __init kfence_init(void)
{
- stack_hash_seed = (u32)random_get_entropy();
+ stack_hash_seed = get_random_u32();
/* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
if (!kfence_sample_interval)
@@ -1003,6 +992,13 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
return NULL;
}
+ /*
+ * Skip allocations for this slab, if KFENCE has been disabled for
+ * this slab.
+ */
+ if (s->flags & SLAB_SKIP_KFENCE)
+ return NULL;
+
if (atomic_inc_return(&kfence_allocation_gate) > 1)
return NULL;
#ifdef CONFIG_KFENCE_STATIC_KEYS
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index f5a6d8ba3e21..7e496856c2eb 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -86,6 +86,7 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
/* Also the *_bulk() variants by only checking prefixes. */
if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
goto found;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..4734315f7940 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -23,16 +23,20 @@
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
+#include "mm_slot.h"
enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
SCAN_PMD_NULL,
+ SCAN_PMD_NONE,
+ SCAN_PMD_MAPPED,
SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
+ SCAN_PTE_MAPPED_HUGEPAGE,
SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
@@ -73,6 +77,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* default collapse hugepages if there is at least one pte mapped like
* it would have happened if the vma was large enough during page
* fault.
+ *
+ * Note that these are only respected if collapse was initiated by khugepaged.
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
@@ -85,18 +91,24 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
#define MAX_PTE_MAPPED_THP 8
+struct collapse_control {
+ bool is_khugepaged;
+
+ /* Num pages scanned per node */
+ u32 node_load[MAX_NUMNODES];
+
+ /* Last target selected in hpage_collapse_find_target_node() */
+ int last_target_node;
+};
+
/**
- * struct mm_slot - hash lookup from mm to mm_slot
- * @hash: hash collision list
- * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
- * @mm: the mm that this information is valid for
+ * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
+ * @slot: hash lookup from mm to mm_slot
* @nr_pte_mapped_thp: number of pte mapped THP
* @pte_mapped_thp: address array corresponding pte mapped THP
*/
-struct mm_slot {
- struct hlist_node hash;
- struct list_head mm_node;
- struct mm_struct *mm;
+struct khugepaged_mm_slot {
+ struct mm_slot slot;
/* pte-mapped THP in this mm */
int nr_pte_mapped_thp;
@@ -113,7 +125,7 @@ struct mm_slot {
*/
struct khugepaged_scan {
struct list_head mm_head;
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
unsigned long address;
};
@@ -377,8 +389,9 @@ int hugepage_madvise(struct vm_area_struct *vma,
int __init khugepaged_init(void)
{
mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
- sizeof(struct mm_slot),
- __alignof__(struct mm_slot), 0, NULL);
+ sizeof(struct khugepaged_mm_slot),
+ __alignof__(struct khugepaged_mm_slot),
+ 0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
@@ -395,65 +408,38 @@ void __init khugepaged_destroy(void)
kmem_cache_destroy(mm_slot_cache);
}
-static inline struct mm_slot *alloc_mm_slot(void)
-{
- if (!mm_slot_cache) /* initialization failed */
- return NULL;
- return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
- kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
- struct mm_slot *mm_slot;
-
- hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
- if (mm == mm_slot->mm)
- return mm_slot;
-
- return NULL;
-}
-
-static void insert_to_mm_slots_hash(struct mm_struct *mm,
- struct mm_slot *mm_slot)
-{
- mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
-}
-
-static inline int khugepaged_test_exit(struct mm_struct *mm)
+static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
return atomic_read(&mm->mm_users) == 0;
}
void __khugepaged_enter(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
int wakeup;
- mm_slot = alloc_mm_slot();
+ mm_slot = mm_slot_alloc(mm_slot_cache);
if (!mm_slot)
return;
+ slot = &mm_slot->slot;
+
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
+ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
return;
}
spin_lock(&khugepaged_mm_lock);
- insert_to_mm_slots_hash(mm, mm_slot);
+ mm_slot_insert(mm_slots_hash, mm, slot);
/*
* Insert just behind the scanning cursor, to let the area settle
* down a little.
*/
wakeup = list_empty(&khugepaged_scan.mm_head);
- list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
spin_unlock(&khugepaged_mm_lock);
mmgrab(mm);
@@ -466,37 +452,38 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
hugepage_flags_enabled()) {
- if (hugepage_vma_check(vma, vm_flags, false, false))
+ if (hugepage_vma_check(vma, vm_flags, false, false, true))
__khugepaged_enter(vma->vm_mm);
}
}
void __khugepaged_exit(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
free = 1;
}
spin_unlock(&khugepaged_mm_lock);
if (free) {
clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
} else if (mm_slot) {
/*
* This is required to serialize against
- * khugepaged_test_exit() (which is guaranteed to run
- * under mmap sem read mode). Stop here (after we
- * return all pagetables will be destroyed) until
- * khugepaged has finished working on the pagetables
- * under the mmap_lock.
+ * hpage_collapse_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we return all
+ * pagetables will be destroyed) until khugepaged has finished
+ * working on the pagetables under the mmap_lock.
*/
mmap_write_lock(mm);
mmap_write_unlock(mm);
@@ -546,11 +533,12 @@ static bool is_refcount_suitable(struct page *page)
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
+ struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
bool writable = false;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
@@ -558,8 +546,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
+ ++none_or_zero;
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -579,11 +569,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageAnon(page), page);
- if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
- result = SCAN_EXCEED_SHARED_PTE;
- count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
- goto out;
+ if (page_mapcount(page) > 1) {
+ ++shared;
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ goto out;
+ }
}
if (PageCompound(page)) {
@@ -646,10 +639,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (PageCompound(page))
list_add_tail(&page->lru, compound_pagelist);
next:
- /* There should be enough young pte to collapse the page */
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
+ /*
+ * If collapse was initiated by khugepaged, check that there is
+ * enough young pte to justify collapsing the page
+ */
+ if (cc->is_khugepaged &&
+ (pte_young(pteval) || page_is_young(page) ||
+ PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ address)))
referenced++;
if (pte_write(pteval))
@@ -658,19 +655,19 @@ next:
if (unlikely(!writable)) {
result = SCAN_PAGE_RO;
- } else if (unlikely(!referenced)) {
+ } else if (unlikely(cc->is_khugepaged && !referenced)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
- return 1;
+ return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
- return 0;
+ return result;
}
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -730,14 +727,17 @@ static void khugepaged_alloc_sleep(void)
DEFINE_WAIT(wait);
add_wait_queue(&khugepaged_wait, &wait);
- freezable_schedule_timeout_interruptible(
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
+ schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
remove_wait_queue(&khugepaged_wait, &wait);
}
-static int khugepaged_node_load[MAX_NUMNODES];
+struct collapse_control khugepaged_collapse_control = {
+ .is_khugepaged = true,
+ .last_target_node = NUMA_NO_NODE,
+};
-static bool khugepaged_scan_abort(int nid)
+static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
int i;
@@ -749,11 +749,11 @@ static bool khugepaged_scan_abort(int nid)
return false;
/* If there is a count for this node already, it must be acceptable */
- if (khugepaged_node_load[nid])
+ if (cc->node_load[nid])
return false;
for (i = 0; i < MAX_NUMNODES; i++) {
- if (!khugepaged_node_load[i])
+ if (!cc->node_load[i])
continue;
if (node_distance(nid, i) > node_reclaim_distance)
return true;
@@ -772,146 +772,63 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
}
#ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
- static int last_khugepaged_target_node = NUMA_NO_NODE;
int nid, target_node = 0, max_value = 0;
/* find first node with max normal pages hit */
for (nid = 0; nid < MAX_NUMNODES; nid++)
- if (khugepaged_node_load[nid] > max_value) {
- max_value = khugepaged_node_load[nid];
+ if (cc->node_load[nid] > max_value) {
+ max_value = cc->node_load[nid];
target_node = nid;
}
/* do some balance if several nodes have the same hit record */
- if (target_node <= last_khugepaged_target_node)
- for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
- nid++)
- if (max_value == khugepaged_node_load[nid]) {
+ if (target_node <= cc->last_target_node)
+ for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES;
+ nid++)
+ if (max_value == cc->node_load[nid]) {
target_node = nid;
break;
}
- last_khugepaged_target_node = target_node;
+ cc->last_target_node = target_node;
return target_node;
}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+#else
+static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
- if (IS_ERR(*hpage)) {
- if (!*wait)
- return false;
-
- *wait = false;
- *hpage = NULL;
- khugepaged_alloc_sleep();
- } else if (*hpage) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- return true;
+ return 0;
}
+#endif
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node)
{
- VM_BUG_ON_PAGE(*hpage, *hpage);
-
*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- *hpage = ERR_PTR(-ENOMEM);
- return NULL;
+ return false;
}
prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
- return *hpage;
-}
-#else
-static int khugepaged_find_target_node(void)
-{
- return 0;
-}
-
-static inline struct page *alloc_khugepaged_hugepage(void)
-{
- struct page *page;
-
- page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
- HPAGE_PMD_ORDER);
- if (page)
- prep_transhuge_page(page);
- return page;
-}
-
-static struct page *khugepaged_alloc_hugepage(bool *wait)
-{
- struct page *hpage;
-
- do {
- hpage = alloc_khugepaged_hugepage();
- if (!hpage) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- if (!*wait)
- return NULL;
-
- *wait = false;
- khugepaged_alloc_sleep();
- } else
- count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
-
- return hpage;
-}
-
-static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
-{
- /*
- * If the hpage allocated earlier was briefly exposed in page cache
- * before collapse_file() failed, it is possible that racing lookups
- * have not yet completed, and would then be unpleasantly surprised by
- * finding the hpage reused for the same mapping at a different offset.
- * Just release the previous allocation if there is any danger of that.
- */
- if (*hpage && page_count(*hpage) > 1) {
- put_page(*hpage);
- *hpage = NULL;
- }
-
- if (!*hpage)
- *hpage = khugepaged_alloc_hugepage(wait);
-
- if (unlikely(!*hpage))
- return false;
-
return true;
}
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
-{
- VM_BUG_ON(!*hpage);
-
- return *hpage;
-}
-#endif
-
/*
* If mmap_lock temporarily dropped, revalidate vma
* before taking mmap_lock.
- * Return 0 if succeeds, otherwise return none-zero
- * value (scan code).
+ * Returns enum scan_result value.
*/
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
- struct vm_area_struct **vmap)
+ bool expect_anon,
+ struct vm_area_struct **vmap,
+ struct collapse_control *cc)
{
struct vm_area_struct *vma;
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
return SCAN_ANY_PROCESS;
*vmap = vma = find_vma(mm, address);
@@ -920,7 +837,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!transhuge_vma_suitable(vma, address))
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
+ cc->is_khugepaged))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -929,23 +847,62 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
* hugepage_vma_check may return true for qualified file
* vmas.
*/
- if (!vma->anon_vma || !vma_is_anonymous(vma))
- return SCAN_VMA_CHECK;
- return 0;
+ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
+ return SCAN_PAGE_ANON;
+ return SCAN_SUCCEED;
+}
+
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t **pmd)
+{
+ pmd_t pmde;
+
+ *pmd = mm_find_pmd(mm, address);
+ if (!*pmd)
+ return SCAN_PMD_NULL;
+
+ pmde = pmd_read_atomic(*pmd);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
+ barrier();
+#endif
+ if (pmd_none(pmde))
+ return SCAN_PMD_NONE;
+ if (pmd_trans_huge(pmde))
+ return SCAN_PMD_MAPPED;
+ if (pmd_bad(pmde))
+ return SCAN_PMD_NULL;
+ return SCAN_SUCCEED;
+}
+
+static int check_pmd_still_valid(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t *pmd)
+{
+ pmd_t *new_pmd;
+ int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
+
+ if (result != SCAN_SUCCEED)
+ return result;
+ if (new_pmd != pmd)
+ return SCAN_FAIL;
+ return SCAN_SUCCEED;
}
/*
* Bring missing pages in from swap, to complete THP collapse.
- * Only done if khugepaged_scan_pmd believes it is worthwhile.
+ * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held.
* Note that if false is returned, mmap_lock will be released.
*/
-static bool __collapse_huge_page_swapin(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
- int referenced)
+static int __collapse_huge_page_swapin(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
@@ -976,12 +933,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
*/
if (ret & VM_FAULT_RETRY) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
+ /* Likely, but not guaranteed, that page lock failed */
+ return SCAN_PAGE_LOCK;
}
if (ret & VM_FAULT_ERROR) {
mmap_read_unlock(mm);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
+ return SCAN_FAIL;
}
swapped_in++;
}
@@ -991,30 +949,41 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
lru_add_drain();
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
- return true;
+ return SCAN_SUCCEED;
}
-static void collapse_huge_page(struct mm_struct *mm,
- unsigned long address,
- struct page **hpage,
- int node, int referenced, int unmapped)
+static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
+ struct collapse_control *cc)
+{
+ /* Only allocate from the target node */
+ gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
+ GFP_TRANSHUGE) | __GFP_THISNODE;
+ int node = hpage_collapse_find_target_node(cc);
+
+ if (!hpage_collapse_alloc_page(hpage, gfp, node))
+ return SCAN_ALLOC_HUGE_PAGE_FAIL;
+ if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
+ return SCAN_CGROUP_CHARGE_FAIL;
+ count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+ return SCAN_SUCCEED;
+}
+
+static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
+ int referenced, int unmapped,
+ struct collapse_control *cc)
{
LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
- struct page *new_page;
+ struct page *hpage;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated = 0, result = 0;
+ int result = SCAN_FAIL;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
- gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
/*
* Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
@@ -1022,40 +991,34 @@ static void collapse_huge_page(struct mm_struct *mm,
* that. We will recheck the vma after taking it again in write mode.
*/
mmap_read_unlock(mm);
- new_page = khugepaged_alloc_page(hpage, gfp, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
- goto out_nolock;
- }
- if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
+ result = alloc_charge_hpage(&hpage, mm, cc);
+ if (result != SCAN_SUCCEED)
goto out_nolock;
- }
- count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
mmap_read_lock(mm);
- result = hugepage_vma_revalidate(mm, address, &vma);
- if (result) {
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
+ result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ if (result != SCAN_SUCCEED) {
mmap_read_unlock(mm);
goto out_nolock;
}
- /*
- * __collapse_huge_page_swapin will return with mmap_lock released
- * when it fails. So we jump out_nolock directly in that case.
- * Continuing to collapse causes inconsistency.
- */
- if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
- pmd, referenced)) {
- goto out_nolock;
+ if (unmapped) {
+ /*
+ * __collapse_huge_page_swapin will return with mmap_lock
+ * released when it fails. So we jump out_nolock directly in
+ * that case. Continuing to collapse causes inconsistency.
+ */
+ result = __collapse_huge_page_swapin(mm, vma, address, pmd,
+ referenced);
+ if (result != SCAN_SUCCEED)
+ goto out_nolock;
}
mmap_read_unlock(mm);
@@ -1065,11 +1028,12 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
mmap_write_lock(mm);
- result = hugepage_vma_revalidate(mm, address, &vma);
- if (result)
+ result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
+ if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
- if (mm_find_pmd(mm, address) != pmd)
+ result = check_pmd_still_valid(mm, address, pmd);
+ if (result != SCAN_SUCCEED)
goto out_up_write;
anon_vma_lock_write(vma->anon_vma);
@@ -1083,21 +1047,23 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
- isolated = __collapse_huge_page_isolate(vma, address, pte,
- &compound_pagelist);
+ result = __collapse_huge_page_isolate(vma, address, pte, cc,
+ &compound_pagelist);
spin_unlock(pte_ptl);
- if (unlikely(!isolated)) {
+ if (unlikely(result != SCAN_SUCCEED)) {
pte_unmap(pte);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
@@ -1109,7 +1075,6 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
- result = SCAN_FAIL;
goto out_up_write;
}
@@ -1119,8 +1084,8 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
- &compound_pagelist);
+ __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
+ &compound_pagelist);
pte_unmap(pte);
/*
* spin_lock() below is not the equivalent of smp_wmb(), but
@@ -1128,42 +1093,43 @@ static void collapse_huge_page(struct mm_struct *mm,
* avoid the copy_huge_page writes to become visible after
* the set_pmd_at() write.
*/
- __SetPageUptodate(new_page);
+ __SetPageUptodate(hpage);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
+ _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address);
- lru_cache_add_inactive_or_unevictable(new_page, vma);
+ page_add_new_anon_rmap(hpage, vma, address);
+ lru_cache_add_inactive_or_unevictable(hpage, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
spin_unlock(pmd_ptl);
- *hpage = NULL;
+ hpage = NULL;
- khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_up_write:
mmap_write_unlock(mm);
out_nolock:
- if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(page_folio(*hpage));
- trace_mm_collapse_huge_page(mm, isolated, result);
- return;
+ if (hpage) {
+ mem_cgroup_uncharge(page_folio(hpage));
+ put_page(hpage);
+ }
+ trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
+ return result;
}
-static int khugepaged_scan_pmd(struct mm_struct *mm,
- struct vm_area_struct *vma,
- unsigned long address,
- struct page **hpage)
+static int hpage_collapse_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address, bool *mmap_locked,
+ struct collapse_control *cc)
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, result = 0, referenced = 0;
+ int result = SCAN_FAIL, referenced = 0;
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
unsigned long _address;
@@ -1173,19 +1139,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- pmd = mm_find_pmd(mm, address);
- if (!pmd) {
- result = SCAN_PMD_NULL;
+ result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ if (result != SCAN_SUCCEED)
goto out;
- }
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ memset(cc->node_load, 0, sizeof(cc->node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
- if (++unmapped <= khugepaged_max_ptes_swap) {
+ ++unmapped;
+ if (!cc->is_khugepaged ||
+ unmapped <= khugepaged_max_ptes_swap) {
/*
* Always be strict with uffd-wp
* enabled swap entries. Please see
@@ -1203,8 +1169,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
}
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ ++none_or_zero;
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none) {
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -1234,27 +1202,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
- if (page_mapcount(page) > 1 &&
- ++shared > khugepaged_max_ptes_shared) {
- result = SCAN_EXCEED_SHARED_PTE;
- count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
- goto out_unmap;
+ if (page_mapcount(page) > 1) {
+ ++shared;
+ if (cc->is_khugepaged &&
+ shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+ goto out_unmap;
+ }
}
page = compound_head(page);
/*
* Record which node the original page is from and save this
- * information to khugepaged_node_load[].
+ * information to cc->node_load[].
* Khugepaged will allocate hugepage from the node has the max
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
+ if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
goto out_unmap;
}
- khugepaged_node_load[node]++;
+ cc->node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
goto out_unmap;
@@ -1289,43 +1260,51 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
- if (pte_young(pteval) ||
- page_is_young(page) || PageReferenced(page) ||
- mmu_notifier_test_young(vma->vm_mm, address))
+
+ /*
+ * If collapse was initiated by khugepaged, check that there is
+ * enough young pte to justify collapsing the page
+ */
+ if (cc->is_khugepaged &&
+ (pte_young(pteval) || page_is_young(page) ||
+ PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+ address)))
referenced++;
}
if (!writable) {
result = SCAN_PAGE_RO;
- } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+ } else if (cc->is_khugepaged &&
+ (!referenced ||
+ (unmapped && referenced < HPAGE_PMD_NR / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- ret = 1;
}
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- node = khugepaged_find_target_node();
+ if (result == SCAN_SUCCEED) {
+ result = collapse_huge_page(mm, address, referenced,
+ unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
- collapse_huge_page(mm, address, hpage, node,
- referenced, unmapped);
+ *mmap_locked = false;
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
none_or_zero, result, unmapped);
- return ret;
+ return result;
}
-static void collect_mm_slot(struct mm_slot *mm_slot)
+static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
{
- struct mm_struct *mm = mm_slot->mm;
+ struct mm_slot *slot = &mm_slot->slot;
+ struct mm_struct *mm = slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
- if (khugepaged_test_exit(mm)) {
+ if (hpage_collapse_test_exit(mm)) {
/* free mm_slot */
- hash_del(&mm_slot->hash);
- list_del(&mm_slot->mm_node);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
/*
* Not strictly needed because the mm exited already.
@@ -1334,7 +1313,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
*/
/* khugepaged_mm_lock actually not necessary for the below */
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
mmdrop(mm);
}
}
@@ -1343,19 +1322,66 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
/*
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
+ *
+ * Note that following race exists:
+ * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
+ * emptying the A's ->pte_mapped_thp[] array.
+ * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
+ * retract_page_tables() finds a VMA in mm_struct A mapping the same extent
+ * (at virtual address X) and adds an entry (for X) into mm_struct A's
+ * ->pte-mapped_thp[] array.
+ * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X,
+ * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
+ * (for X) into mm_struct A's ->pte-mapped_thp[] array.
+ * Thus, it's possible the same address is added multiple times for the same
+ * mm_struct. Should this happen, we'll simply attempt
+ * collapse_pte_mapped_thp() multiple times for the same address, under the same
+ * exclusive mmap_lock, and assuming the first call is successful, subsequent
+ * attempts will return quickly (without grabbing any additional locks) when
+ * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap
+ * check, and since this is a rare occurrence, the cost of preventing this
+ * "multiple-add" is thought to be more expensive than just handling it, should
+ * it occur.
*/
-static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
unsigned long addr)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
+ bool ret = false;
VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
spin_lock(&khugepaged_mm_lock);
- mm_slot = get_mm_slot(mm);
- if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+ if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) {
mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
+ ret = true;
+ }
spin_unlock(&khugepaged_mm_lock);
+ return ret;
+}
+
+/* hpage must be locked, and mmap_lock must be held in write */
+static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmdp, struct page *hpage)
+{
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = addr,
+ .flags = 0,
+ .pmd = pmdp,
+ };
+
+ VM_BUG_ON(!PageTransHuge(hpage));
+ mmap_assert_write_locked(vma->vm_mm);
+
+ if (do_set_pmd(&vmf, hpage))
+ return SCAN_FAIL;
+
+ get_page(hpage);
+ return SCAN_SUCCEED;
}
static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -1379,52 +1405,80 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
*
* @mm: process address space where collapse happens
* @addr: THP collapse address
+ * @install_pmd: If a huge PMD should be installed
*
* This function checks whether all the PTEs in the PMD are pointing to the
* right THP. If so, retract the page table so the THP can refault in with
- * as pmd-mapped.
+ * as pmd-mapped. Possibly install a huge PMD mapping the THP.
*/
-void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
+ bool install_pmd)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
- struct vm_area_struct *vma = find_vma(mm, haddr);
+ struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd;
spinlock_t *ptl;
- int count = 0;
+ int count = 0, result = SCAN_FAIL;
int i;
+ mmap_assert_write_locked(mm);
+
+ /* Fast check before locking page if already PMD-mapped */
+ result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+ if (result == SCAN_PMD_MAPPED)
+ return result;
+
if (!vma || !vma->vm_file ||
!range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
- return;
+ return SCAN_VMA_CHECK;
/*
- * This vm_flags may not have VM_HUGEPAGE if the page was not
- * collapsed by this mm. But we can still collapse if the page is
- * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
- * will not fail the vma for missing VM_HUGEPAGE
+ * If we are here, we've succeeded in replacing all the native pages
+ * in the page cache with a single hugepage. If a mm were to fault-in
+ * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
+ * and map it by a PMD, regardless of sysfs THP settings. As such, let's
+ * analogously elide sysfs THP settings here.
*/
- if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
- return;
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
if (userfaultfd_wp(vma))
- return;
+ return SCAN_PTE_UFFD_WP;
hpage = find_lock_page(vma->vm_file->f_mapping,
linear_page_index(vma, haddr));
if (!hpage)
- return;
+ return SCAN_PAGE_NULL;
+
+ if (!PageHead(hpage)) {
+ result = SCAN_FAIL;
+ goto drop_hpage;
+ }
- if (!PageHead(hpage))
+ if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+ result = SCAN_PAGE_COMPOUND;
goto drop_hpage;
+ }
- pmd = mm_find_pmd(mm, haddr);
- if (!pmd)
+ switch (result) {
+ case SCAN_SUCCEED:
+ break;
+ case SCAN_PMD_NONE:
+ /*
+ * In MADV_COLLAPSE path, possible race with khugepaged where
+ * all pte entries have been removed and pmd cleared. If so,
+ * skip all the pte checks and just update the pmd mapping.
+ */
+ goto maybe_install_pmd;
+ default:
goto drop_hpage;
+ }
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+ result = SCAN_FAIL;
/* step 1: check all mapped PTEs are to the right huge page */
for (i = 0, addr = haddr, pte = start_pte;
@@ -1436,8 +1490,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
continue;
/* page swapped out, abort */
- if (!pte_present(*pte))
+ if (!pte_present(*pte)) {
+ result = SCAN_PTE_NON_PRESENT;
goto abort;
+ }
page = vm_normal_page(vma, addr, *pte);
if (WARN_ON_ONCE(page && is_zone_device_page(page)))
@@ -1472,21 +1528,29 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
}
- /* step 4: collapse pmd */
+ /* step 4: remove pte entries */
collapse_and_free_pmd(mm, vma, haddr, pmd);
+
+maybe_install_pmd:
+ /* step 5: install pmd entry */
+ result = install_pmd
+ ? set_huge_pmd(vma, haddr, pmd, hpage)
+ : SCAN_SUCCEED;
+
drop_hpage:
unlock_page(hpage);
put_page(hpage);
- return;
+ return result;
abort:
pte_unmap_unlock(start_pte, ptl);
goto drop_hpage;
}
-static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
{
- struct mm_struct *mm = mm_slot->mm;
+ struct mm_slot *slot = &mm_slot->slot;
+ struct mm_struct *mm = slot->mm;
int i;
if (likely(mm_slot->nr_pte_mapped_thp == 0))
@@ -1495,26 +1559,33 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
if (!mmap_write_trylock(mm))
return;
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
goto out;
for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
- collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
+ collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
out:
mm_slot->nr_pte_mapped_thp = 0;
mmap_write_unlock(mm);
}
-static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+ struct mm_struct *target_mm,
+ unsigned long target_addr, struct page *hpage,
+ struct collapse_control *cc)
{
struct vm_area_struct *vma;
- struct mm_struct *mm;
- unsigned long addr;
- pmd_t *pmd;
+ int target_result = SCAN_FAIL;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ int result = SCAN_FAIL;
+ struct mm_struct *mm = NULL;
+ unsigned long addr = 0;
+ pmd_t *pmd;
+ bool is_target = false;
+
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
* got written to. These VMAs are likely not worth investing
@@ -1531,25 +1602,34 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* ptl. It has higher chance to recover THP for the VMA, but
* has higher cost too.
*/
- if (vma->anon_vma)
- continue;
+ if (vma->anon_vma) {
+ result = SCAN_PAGE_ANON;
+ goto next;
+ }
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (addr & ~HPAGE_PMD_MASK)
- continue;
- if (vma->vm_end < addr + HPAGE_PMD_SIZE)
- continue;
+ if (addr & ~HPAGE_PMD_MASK ||
+ vma->vm_end < addr + HPAGE_PMD_SIZE) {
+ result = SCAN_VMA_CHECK;
+ goto next;
+ }
mm = vma->vm_mm;
- pmd = mm_find_pmd(mm, addr);
- if (!pmd)
- continue;
+ is_target = mm == target_mm && addr == target_addr;
+ result = find_pmd_or_thp_or_none(mm, addr, &pmd);
+ if (result != SCAN_SUCCEED)
+ goto next;
/*
* We need exclusive mmap_lock to retract page table.
*
* We use trylock due to lock inversion: we need to acquire
* mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
+ *
+ * Also, it's not MADV_COLLAPSE's job to collapse other
+ * mappings - let khugepaged take care of them later.
*/
- if (mmap_write_trylock(mm)) {
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
+ if ((cc->is_khugepaged || is_target) &&
+ mmap_write_trylock(mm)) {
/*
* When a vma is registered with uffd-wp, we can't
* recycle the pmd pgtable because there can be pte
@@ -1558,25 +1638,48 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* it'll always mapped in small page size for uffd-wp
* registered ranges.
*/
- if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma))
- collapse_and_free_pmd(mm, vma, addr, pmd);
+ if (hpage_collapse_test_exit(mm)) {
+ result = SCAN_ANY_PROCESS;
+ goto unlock_next;
+ }
+ if (userfaultfd_wp(vma)) {
+ result = SCAN_PTE_UFFD_WP;
+ goto unlock_next;
+ }
+ collapse_and_free_pmd(mm, vma, addr, pmd);
+ if (!cc->is_khugepaged && is_target)
+ result = set_huge_pmd(vma, addr, pmd, hpage);
+ else
+ result = SCAN_SUCCEED;
+
+unlock_next:
mmap_write_unlock(mm);
- } else {
- /* Try again later */
+ goto next;
+ }
+ /*
+ * Calling context will handle target mm/addr. Otherwise, let
+ * khugepaged try again later.
+ */
+ if (!is_target) {
khugepaged_add_pte_mapped_thp(mm, addr);
+ continue;
}
+next:
+ if (is_target)
+ target_result = result;
}
i_mmap_unlock_write(mapping);
+ return target_result;
}
/**
* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
* @mm: process address space where collapse happens
+ * @addr: virtual collapse start address
* @file: file that collapse on
* @start: collapse start address
- * @hpage: new allocated huge page for collapse
- * @node: appointed node the new huge page allocate from
+ * @cc: collapse context and scratchpad
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
@@ -1593,13 +1696,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* + restore gaps in the page cache;
* + unlock and free huge page;
*/
-static void collapse_file(struct mm_struct *mm,
- struct file *file, pgoff_t start,
- struct page **hpage, int node)
+static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
struct address_space *mapping = file->f_mapping;
- gfp_t gfp;
- struct page *new_page;
+ struct page *hpage;
pgoff_t index, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1610,20 +1712,9 @@ static void collapse_file(struct mm_struct *mm,
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
- /* Only allocate from the target node */
- gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
-
- new_page = khugepaged_alloc_page(hpage, gfp, node);
- if (!new_page) {
- result = SCAN_ALLOC_HUGE_PAGE_FAIL;
- goto out;
- }
-
- if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
- result = SCAN_CGROUP_CHARGE_FAIL;
+ result = alloc_charge_hpage(&hpage, mm, cc);
+ if (result != SCAN_SUCCEED)
goto out;
- }
- count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
/*
* Ensure we have slots for all the pages in the range. This is
@@ -1641,14 +1732,14 @@ static void collapse_file(struct mm_struct *mm,
}
} while (1);
- __SetPageLocked(new_page);
+ __SetPageLocked(hpage);
if (is_shmem)
- __SetPageSwapBacked(new_page);
- new_page->index = start;
- new_page->mapping = mapping;
+ __SetPageSwapBacked(hpage);
+ hpage->index = start;
+ hpage->mapping = mapping;
/*
- * At this point the new_page is locked and not up-to-date.
+ * At this point the hpage is locked and not up-to-date.
* It's safe to insert it into the page cache, because nobody would
* be able to map it or use it in another way until we unlock it.
*/
@@ -1676,19 +1767,22 @@ static void collapse_file(struct mm_struct *mm,
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
nr_none++;
continue;
}
if (xa_is_value(page) || !PageUptodate(page)) {
+ struct folio *folio;
+
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
- if (shmem_getpage(mapping->host, index, &page,
- SGP_NOALLOC)) {
+ if (shmem_get_folio(mapping->host, index,
+ &folio, SGP_NOALLOC)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
+ page = folio_file_page(folio, index);
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
@@ -1755,9 +1849,16 @@ static void collapse_file(struct mm_struct *mm,
/*
* If file was truncated then extended, or hole-punched, before
* we locked the first page, then a THP might be there already.
+ * This will be discovered on the first iteration.
*/
if (PageTransCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ struct page *head = compound_head(page);
+
+ result = compound_order(head) == HPAGE_PMD_ORDER &&
+ head->index == start
+ /* Maybe PMD-mapped */
+ ? SCAN_PTE_MAPPED_HUGEPAGE
+ : SCAN_PAGE_COMPOUND;
goto out_unlock;
}
@@ -1818,19 +1919,19 @@ static void collapse_file(struct mm_struct *mm,
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
continue;
out_unlock:
unlock_page(page);
put_page(page);
goto xa_unlocked;
}
- nr = thp_nr_pages(new_page);
+ nr = thp_nr_pages(hpage);
if (is_shmem)
- __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
+ __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
else {
- __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
+ __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
filemap_nr_thps_inc(mapping);
/*
* Paired with smp_mb() in do_dentry_open() to ensure
@@ -1841,21 +1942,21 @@ out_unlock:
smp_mb();
if (inode_is_open_for_write(mapping->host)) {
result = SCAN_FAIL;
- __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
+ __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
goto xa_locked;
}
}
if (nr_none) {
- __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
+ __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+ __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
}
/* Join all the small entries into a single multi-index entry */
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
- xas_store(&xas, new_page);
+ xas_store(&xas, hpage);
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
@@ -1877,11 +1978,11 @@ xa_unlocked:
index = start;
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
while (index < page->index) {
- clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
- copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
- page);
+ copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
+ page);
list_del(&page->lru);
page->mapping = NULL;
page_ref_unfreeze(page, 1);
@@ -1892,23 +1993,23 @@ xa_unlocked:
index++;
}
while (index < end) {
- clear_highpage(new_page + (index % HPAGE_PMD_NR));
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
- SetPageUptodate(new_page);
- page_ref_add(new_page, HPAGE_PMD_NR - 1);
+ SetPageUptodate(hpage);
+ page_ref_add(hpage, HPAGE_PMD_NR - 1);
if (is_shmem)
- set_page_dirty(new_page);
- lru_cache_add(new_page);
+ set_page_dirty(hpage);
+ lru_cache_add(hpage);
/*
* Remove pte page tables, so we can re-fault the page as huge.
*/
- retract_page_tables(mapping, start);
- *hpage = NULL;
-
- khugepaged_pages_collapsed++;
+ result = retract_page_tables(mapping, start, mm, addr, hpage,
+ cc);
+ unlock_page(hpage);
+ hpage = NULL;
} else {
struct page *page;
@@ -1947,19 +2048,24 @@ xa_unlocked:
VM_BUG_ON(nr_none);
xas_unlock_irq(&xas);
- new_page->mapping = NULL;
+ hpage->mapping = NULL;
}
- unlock_page(new_page);
+ if (hpage)
+ unlock_page(hpage);
out:
VM_BUG_ON(!list_empty(&pagelist));
- if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(page_folio(*hpage));
+ if (hpage) {
+ mem_cgroup_uncharge(page_folio(hpage));
+ put_page(hpage);
+ }
/* TODO: tracepoints */
+ return result;
}
-static void khugepaged_scan_file(struct mm_struct *mm,
- struct file *file, pgoff_t start, struct page **hpage)
+static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
struct page *page = NULL;
struct address_space *mapping = file->f_mapping;
@@ -1970,14 +2076,16 @@ static void khugepaged_scan_file(struct mm_struct *mm,
present = 0;
swap = 0;
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+ memset(cc->node_load, 0, sizeof(cc->node_load));
rcu_read_lock();
xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page)) {
- if (++swap > khugepaged_max_ptes_swap) {
+ ++swap;
+ if (cc->is_khugepaged &&
+ swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
@@ -1986,20 +2094,32 @@ static void khugepaged_scan_file(struct mm_struct *mm,
}
/*
- * XXX: khugepaged should compact smaller compound pages
+ * TODO: khugepaged should compact smaller compound pages
* into a PMD sized page
*/
if (PageTransCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ struct page *head = compound_head(page);
+
+ result = compound_order(head) == HPAGE_PMD_ORDER &&
+ head->index == start
+ /* Maybe PMD-mapped */
+ ? SCAN_PTE_MAPPED_HUGEPAGE
+ : SCAN_PAGE_COMPOUND;
+ /*
+ * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
+ * by the caller won't touch the page cache, and so
+ * it's safe to skip LRU and refcount checks before
+ * returning.
+ */
break;
}
node = page_to_nid(page);
- if (khugepaged_scan_abort(node)) {
+ if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
break;
}
- khugepaged_node_load[node]++;
+ cc->node_load[node]++;
if (!PageLRU(page)) {
result = SCAN_PAGE_LRU;
@@ -2028,54 +2148,68 @@ static void khugepaged_scan_file(struct mm_struct *mm,
rcu_read_unlock();
if (result == SCAN_SUCCEED) {
- if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ if (cc->is_khugepaged &&
+ present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
- node = khugepaged_find_target_node();
- collapse_file(mm, file, start, hpage, node);
+ result = collapse_file(mm, addr, file, start, cc);
}
}
- /* TODO: tracepoints */
+ trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname,
+ present, swap, result);
+ return result;
}
#else
-static void khugepaged_scan_file(struct mm_struct *mm,
- struct file *file, pgoff_t start, struct page **hpage)
+static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
+ struct file *file, pgoff_t start,
+ struct collapse_control *cc)
{
BUILD_BUG();
}
-static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
{
}
+
+static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
+{
+ return false;
+}
#endif
-static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
- struct page **hpage)
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
+ struct collapse_control *cc)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
- struct mm_slot *mm_slot;
+ struct vma_iterator vmi;
+ struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int progress = 0;
VM_BUG_ON(!pages);
lockdep_assert_held(&khugepaged_mm_lock);
+ *result = SCAN_FAIL;
- if (khugepaged_scan.mm_slot)
+ if (khugepaged_scan.mm_slot) {
mm_slot = khugepaged_scan.mm_slot;
- else {
- mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = &mm_slot->slot;
+ } else {
+ slot = list_entry(khugepaged_scan.mm_head.next,
struct mm_slot, mm_node);
+ mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
khugepaged_collapse_pte_mapped_thps(mm_slot);
- mm = mm_slot->mm;
+ mm = slot->mm;
/*
* Don't wait for semaphore (to avoid long wait times). Just move to
* the next mm on the list.
@@ -2083,19 +2217,21 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
vma = NULL;
if (unlikely(!mmap_read_trylock(mm)))
goto breakouterloop_mmap_lock;
- if (likely(!khugepaged_test_exit(mm)))
- vma = find_vma(mm, khugepaged_scan.address);
progress++;
- for (; vma; vma = vma->vm_next) {
+ if (unlikely(hpage_collapse_test_exit(mm)))
+ goto breakouterloop;
+
+ vma_iter_init(&vmi, mm, khugepaged_scan.address);
+ for_each_vma(vmi, vma) {
unsigned long hstart, hend;
cond_resched();
- if (unlikely(khugepaged_test_exit(mm))) {
+ if (unlikely(hpage_collapse_test_exit(mm))) {
progress++;
break;
}
- if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
skip:
progress++;
continue;
@@ -2109,9 +2245,10 @@ skip:
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
while (khugepaged_scan.address < hend) {
- int ret;
+ bool mmap_locked = true;
+
cond_resched();
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(hpage_collapse_test_exit(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
@@ -2123,19 +2260,48 @@ skip:
khugepaged_scan.address);
mmap_read_unlock(mm);
- ret = 1;
- khugepaged_scan_file(mm, file, pgoff, hpage);
+ *result = hpage_collapse_scan_file(mm,
+ khugepaged_scan.address,
+ file, pgoff, cc);
+ mmap_locked = false;
fput(file);
} else {
- ret = khugepaged_scan_pmd(mm, vma,
- khugepaged_scan.address,
- hpage);
+ *result = hpage_collapse_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ &mmap_locked,
+ cc);
+ }
+ switch (*result) {
+ case SCAN_PTE_MAPPED_HUGEPAGE: {
+ pmd_t *pmd;
+
+ *result = find_pmd_or_thp_or_none(mm,
+ khugepaged_scan.address,
+ &pmd);
+ if (*result != SCAN_SUCCEED)
+ break;
+ if (!khugepaged_add_pte_mapped_thp(mm,
+ khugepaged_scan.address))
+ break;
+ } fallthrough;
+ case SCAN_SUCCEED:
+ ++khugepaged_pages_collapsed;
+ break;
+ default:
+ break;
}
+
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
- if (ret)
- /* we released mmap_lock so break loop */
+ if (!mmap_locked)
+ /*
+ * We released mmap_lock so break loop. Note
+ * that we drop mmap_lock before all hugepage
+ * allocations, so if allocation fails, we are
+ * guaranteed to break here and report the
+ * correct result back to caller.
+ */
goto breakouterloop_mmap_lock;
if (progress >= pages)
goto breakouterloop;
@@ -2151,16 +2317,17 @@ breakouterloop_mmap_lock:
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
*/
- if (khugepaged_test_exit(mm) || !vma) {
+ if (hpage_collapse_test_exit(mm) || !vma) {
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
- khugepaged_scan.mm_slot = list_entry(
- mm_slot->mm_node.next,
- struct mm_slot, mm_node);
+ if (slot->mm_node.next != &khugepaged_scan.mm_head) {
+ slot = list_entry(slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.mm_slot =
+ mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
} else {
khugepaged_scan.mm_slot = NULL;
@@ -2185,19 +2352,16 @@ static int khugepaged_wait_event(void)
kthread_should_stop();
}
-static void khugepaged_do_scan(void)
+static void khugepaged_do_scan(struct collapse_control *cc)
{
- struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
bool wait = true;
+ int result = SCAN_SUCCEED;
lru_add_drain_all();
- while (progress < pages) {
- if (!khugepaged_prealloc_page(&hpage, &wait))
- break;
-
+ while (true) {
cond_resched();
if (unlikely(kthread_should_stop() || try_to_freeze()))
@@ -2209,14 +2373,25 @@ static void khugepaged_do_scan(void)
if (khugepaged_has_work() &&
pass_through_head < 2)
progress += khugepaged_scan_mm_slot(pages - progress,
- &hpage);
+ &result, cc);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
- }
- if (!IS_ERR_OR_NULL(hpage))
- put_page(hpage);
+ if (progress >= pages)
+ break;
+
+ if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
+ /*
+ * If fail to allocate the first time, try to sleep for
+ * a while. When hit again, cancel the scan.
+ */
+ if (!wait)
+ break;
+ wait = false;
+ khugepaged_alloc_sleep();
+ }
+ }
}
static bool khugepaged_should_wakeup(void)
@@ -2247,13 +2422,13 @@ static void khugepaged_wait_work(void)
static int khugepaged(void *none)
{
- struct mm_slot *mm_slot;
+ struct khugepaged_mm_slot *mm_slot;
set_freezable();
set_user_nice(current, MAX_NICE);
while (!kthread_should_stop()) {
- khugepaged_do_scan();
+ khugepaged_do_scan(&khugepaged_collapse_control);
khugepaged_wait_work();
}
@@ -2352,3 +2527,140 @@ void khugepaged_min_free_kbytes_update(void)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
+
+static int madvise_collapse_errno(enum scan_result r)
+{
+ /*
+ * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
+ * actionable feedback to caller, so they may take an appropriate
+ * fallback measure depending on the nature of the failure.
+ */
+ switch (r) {
+ case SCAN_ALLOC_HUGE_PAGE_FAIL:
+ return -ENOMEM;
+ case SCAN_CGROUP_CHARGE_FAIL:
+ return -EBUSY;
+ /* Resource temporary unavailable - trying again might succeed */
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_LRU:
+ case SCAN_DEL_PAGE_LRU:
+ return -EAGAIN;
+ /*
+ * Other: Trying again likely not to succeed / error intrinsic to
+ * specified memory range. khugepaged likely won't be able to collapse
+ * either.
+ */
+ default:
+ return -EINVAL;
+ }
+}
+
+int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ struct collapse_control *cc;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long hstart, hend, addr;
+ int thps = 0, last_fail = SCAN_FAIL;
+ bool mmap_locked = true;
+
+ BUG_ON(vma->vm_start > start);
+ BUG_ON(vma->vm_end < end);
+
+ *prev = vma;
+
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
+ return -EINVAL;
+
+ cc = kmalloc(sizeof(*cc), GFP_KERNEL);
+ if (!cc)
+ return -ENOMEM;
+ cc->is_khugepaged = false;
+ cc->last_target_node = NUMA_NO_NODE;
+
+ mmgrab(mm);
+ lru_add_drain_all();
+
+ hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = end & HPAGE_PMD_MASK;
+
+ for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
+ int result = SCAN_FAIL;
+
+ if (!mmap_locked) {
+ cond_resched();
+ mmap_read_lock(mm);
+ mmap_locked = true;
+ result = hugepage_vma_revalidate(mm, addr, false, &vma,
+ cc);
+ if (result != SCAN_SUCCEED) {
+ last_fail = result;
+ goto out_nolock;
+ }
+
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ }
+ mmap_assert_locked(mm);
+ memset(cc->node_load, 0, sizeof(cc->node_load));
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ struct file *file = get_file(vma->vm_file);
+ pgoff_t pgoff = linear_page_index(vma, addr);
+
+ mmap_read_unlock(mm);
+ mmap_locked = false;
+ result = hpage_collapse_scan_file(mm, addr, file, pgoff,
+ cc);
+ fput(file);
+ } else {
+ result = hpage_collapse_scan_pmd(mm, vma, addr,
+ &mmap_locked, cc);
+ }
+ if (!mmap_locked)
+ *prev = NULL; /* Tell caller we dropped mmap_lock */
+
+handle_result:
+ switch (result) {
+ case SCAN_SUCCEED:
+ case SCAN_PMD_MAPPED:
+ ++thps;
+ break;
+ case SCAN_PTE_MAPPED_HUGEPAGE:
+ BUG_ON(mmap_locked);
+ BUG_ON(*prev);
+ mmap_write_lock(mm);
+ result = collapse_pte_mapped_thp(mm, addr, true);
+ mmap_write_unlock(mm);
+ goto handle_result;
+ /* Whitelisted set of results where continuing OK */
+ case SCAN_PMD_NULL:
+ case SCAN_PTE_NON_PRESENT:
+ case SCAN_PTE_UFFD_WP:
+ case SCAN_PAGE_RO:
+ case SCAN_LACK_REFERENCED_PAGE:
+ case SCAN_PAGE_NULL:
+ case SCAN_PAGE_COUNT:
+ case SCAN_PAGE_LOCK:
+ case SCAN_PAGE_COMPOUND:
+ case SCAN_PAGE_LRU:
+ case SCAN_DEL_PAGE_LRU:
+ last_fail = result;
+ break;
+ default:
+ last_fail = result;
+ /* Other error, exit */
+ goto out_maybelock;
+ }
+ }
+
+out_maybelock:
+ /* Caller expects us to hold mmap_lock on return */
+ if (!mmap_locked)
+ mmap_read_lock(mm);
+out_nolock:
+ mmap_assert_locked(mm);
+ mmdrop(mm);
+ kfree(cc);
+
+ return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
+ : madvise_collapse_errno(last_fail);
+}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 1eddc0132f7f..646e2979641f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -604,9 +604,8 @@ static int __save_stack_trace(unsigned long *trace)
* memory block and add it to the object_list and object_tree_root (or
* object_phys_tree_root).
*/
-static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp,
- bool is_phys)
+static void __create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp, bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object, *parent;
@@ -618,7 +617,7 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
- return NULL;
+ return;
}
INIT_LIST_HEAD(&object->object_list);
@@ -687,7 +686,6 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
*/
dump_object_info(parent);
kmem_cache_free(object_cache, object);
- object = NULL;
goto out;
}
}
@@ -698,21 +696,20 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
list_add_tail_rcu(&object->object_list, &object_list);
out:
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
- return object;
}
/* Create kmemleak object which allocated with virtual address. */
-static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static void create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
{
- return __create_object(ptr, size, min_count, gfp, false);
+ __create_object(ptr, size, min_count, gfp, false);
}
/* Create kmemleak object which allocated with physical address. */
-static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static void create_object_phys(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
{
- return __create_object(ptr, size, min_count, gfp, true);
+ __create_object(ptr, size, min_count, gfp, true);
}
/*
@@ -1464,6 +1461,27 @@ static void scan_gray_list(void)
}
/*
+ * Conditionally call resched() in a object iteration loop while making sure
+ * that the given object won't go away without RCU read lock by performing a
+ * get_object() if !pinned.
+ *
+ * Return: false if can't do a cond_resched() due to get_object() failure
+ * true otherwise
+ */
+static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned)
+{
+ if (!pinned && !get_object(object))
+ return false;
+
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+ if (!pinned)
+ put_object(object);
+ return true;
+}
+
+/*
* Scan data sections and all the referenced memory blocks allocated via the
* kernel's standard allocators. This function must be called with the
* scan_mutex held.
@@ -1474,7 +1492,7 @@ static void kmemleak_scan(void)
struct zone *zone;
int __maybe_unused i;
int new_leaks = 0;
- int loop1_cnt = 0;
+ int loop_cnt = 0;
jiffies_last_scan = jiffies;
@@ -1483,7 +1501,6 @@ static void kmemleak_scan(void)
list_for_each_entry_rcu(object, &object_list, object_list) {
bool obj_pinned = false;
- loop1_cnt++;
raw_spin_lock_irq(&object->lock);
#ifdef DEBUG
/*
@@ -1517,24 +1534,11 @@ static void kmemleak_scan(void)
raw_spin_unlock_irq(&object->lock);
/*
- * Do a cond_resched() to avoid soft lockup every 64k objects.
- * Make sure a reference has been taken so that the object
- * won't go away without RCU read lock.
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
*/
- if (!(loop1_cnt & 0xffff)) {
- if (!obj_pinned && !get_object(object)) {
- /* Try the next object instead */
- loop1_cnt--;
- continue;
- }
-
- rcu_read_unlock();
- cond_resched();
- rcu_read_lock();
-
- if (!obj_pinned)
- put_object(object);
- }
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, obj_pinned))
+ loop_cnt--; /* Try again on next object */
}
rcu_read_unlock();
@@ -1601,8 +1605,16 @@ static void kmemleak_scan(void)
* scan and color them gray until the next scan.
*/
rcu_read_lock();
+ loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
/*
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
+ */
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, false))
+ loop_cnt--; /* Try again on next object */
+
+ /*
* This is racy but we can save the overhead of lock/unlock
* calls. The missed objects, if any, should be caught in
* the next scan.
@@ -1635,8 +1647,16 @@ static void kmemleak_scan(void)
* Scanning result reporting.
*/
rcu_read_lock();
+ loop_cnt = 0;
list_for_each_entry_rcu(object, &object_list, object_list) {
/*
+ * Do a cond_resched() every 64k objects to avoid soft lockup.
+ */
+ if (!(++loop_cnt & 0xffff) &&
+ !kmemleak_cond_resched(object, false))
+ loop_cnt--; /* Try again on next object */
+
+ /*
* This is racy but we can save the overhead of lock/unlock
* calls. The missed objects, if any, should be caught in
* the next scan.
diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile
new file mode 100644
index 000000000000..98eab2856626
--- /dev/null
+++ b/mm/kmsan/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for KernelMemorySanitizer (KMSAN).
+#
+#
+obj-y := core.o instrumentation.o init.o hooks.o report.o shadow.o
+
+KMSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+UBSAN_SANITIZE := n
+
+# Disable instrumentation of KMSAN runtime with other tools.
+CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector
+CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack)
+CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
+
+CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME)
+CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME)
+
+obj-$(CONFIG_KMSAN_KUNIT_TEST) += kmsan_test.o
+KMSAN_SANITIZE_kmsan_test.o := y
+CFLAGS_kmsan_test.o += $(call cc-disable-warning, uninitialized)
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
new file mode 100644
index 000000000000..112dce135c7f
--- /dev/null
+++ b/mm/kmsan/core.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN runtime library.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <asm/page.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+#include <linux/highmem.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kmsan_types.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/mmzone.h>
+#include <linux/percpu-defs.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "../slab.h"
+#include "kmsan.h"
+
+bool kmsan_enabled __read_mostly;
+
+/*
+ * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is
+ * unavaliable.
+ */
+DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
+
+void kmsan_internal_task_create(struct task_struct *task)
+{
+ struct kmsan_ctx *ctx = &task->kmsan_ctx;
+ struct thread_info *info = current_thread_info();
+
+ __memset(ctx, 0, sizeof(*ctx));
+ ctx->allow_reporting = true;
+ kmsan_internal_unpoison_memory(info, sizeof(*info), false);
+}
+
+void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags,
+ unsigned int poison_flags)
+{
+ u32 extra_bits =
+ kmsan_extra_bits(/*depth*/ 0, poison_flags & KMSAN_POISON_FREE);
+ bool checked = poison_flags & KMSAN_POISON_CHECK;
+ depot_stack_handle_t handle;
+
+ handle = kmsan_save_stack_with_flags(flags, extra_bits);
+ kmsan_internal_set_shadow_origin(address, size, -1, handle, checked);
+}
+
+void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked)
+{
+ kmsan_internal_set_shadow_origin(address, size, 0, 0, checked);
+}
+
+depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
+ unsigned int extra)
+{
+ unsigned long entries[KMSAN_STACK_DEPTH];
+ unsigned int nr_entries;
+
+ nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
+
+ /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */
+ flags &= ~__GFP_DIRECT_RECLAIM;
+
+ return __stack_depot_save(entries, nr_entries, extra, flags, true);
+}
+
+/* Copy the metadata following the memmove() behavior. */
+void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n)
+{
+ depot_stack_handle_t old_origin = 0, new_origin = 0;
+ int src_slots, dst_slots, i, iter, step, skip_bits;
+ depot_stack_handle_t *origin_src, *origin_dst;
+ void *shadow_src, *shadow_dst;
+ u32 *align_shadow_src, shadow;
+ bool backwards;
+
+ shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW);
+ if (!shadow_dst)
+ return;
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n));
+
+ shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW);
+ if (!shadow_src) {
+ /*
+ * @src is untracked: zero out destination shadow, ignore the
+ * origins, we're done.
+ */
+ __memset(shadow_dst, 0, n);
+ return;
+ }
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n));
+
+ __memmove(shadow_dst, shadow_src, n);
+
+ origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN);
+ origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN);
+ KMSAN_WARN_ON(!origin_dst || !origin_src);
+ src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) -
+ ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) /
+ KMSAN_ORIGIN_SIZE;
+ dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) -
+ ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) /
+ KMSAN_ORIGIN_SIZE;
+ KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1));
+ KMSAN_WARN_ON((src_slots - dst_slots > 1) ||
+ (dst_slots - src_slots < -1));
+
+ backwards = dst > src;
+ i = backwards ? min(src_slots, dst_slots) - 1 : 0;
+ iter = backwards ? -1 : 1;
+
+ align_shadow_src =
+ (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE);
+ for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) {
+ KMSAN_WARN_ON(i < 0);
+ shadow = align_shadow_src[i];
+ if (i == 0) {
+ /*
+ * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't
+ * look at the first @src % KMSAN_ORIGIN_SIZE bytes
+ * of the first shadow slot.
+ */
+ skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow >> skip_bits) << skip_bits;
+ }
+ if (i == src_slots - 1) {
+ /*
+ * If @src + n isn't aligned on
+ * KMSAN_ORIGIN_SIZE, don't look at the last
+ * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the
+ * last shadow slot.
+ */
+ skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow << skip_bits) >> skip_bits;
+ }
+ /*
+ * Overwrite the origin only if the corresponding
+ * shadow is nonempty.
+ */
+ if (origin_src[i] && (origin_src[i] != old_origin) && shadow) {
+ old_origin = origin_src[i];
+ new_origin = kmsan_internal_chain_origin(old_origin);
+ /*
+ * kmsan_internal_chain_origin() may return
+ * NULL, but we don't want to lose the previous
+ * origin value.
+ */
+ if (!new_origin)
+ new_origin = old_origin;
+ }
+ if (shadow)
+ origin_dst[i] = new_origin;
+ else
+ origin_dst[i] = 0;
+ }
+ /*
+ * If dst_slots is greater than src_slots (i.e.
+ * dst_slots == src_slots + 1), there is an extra origin slot at the
+ * beginning or end of the destination buffer, for which we take the
+ * origin from the previous slot.
+ * This is only done if the part of the source shadow corresponding to
+ * slot is non-zero.
+ *
+ * E.g. if we copy 8 aligned bytes that are marked as uninitialized
+ * and have origins o111 and o222, to an unaligned buffer with offset 1,
+ * these two origins are copied to three origin slots, so one of then
+ * needs to be duplicated, depending on the copy direction (@backwards)
+ *
+ * src shadow: |uuuu|uuuu|....|
+ * src origin: |o111|o222|....|
+ *
+ * backwards = 0:
+ * dst shadow: |.uuu|uuuu|u...|
+ * dst origin: |....|o111|o222| - fill the empty slot with o111
+ * backwards = 1:
+ * dst shadow: |.uuu|uuuu|u...|
+ * dst origin: |o111|o222|....| - fill the empty slot with o222
+ */
+ if (src_slots < dst_slots) {
+ if (backwards) {
+ shadow = align_shadow_src[src_slots - 1];
+ skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow << skip_bits) >> skip_bits;
+ if (shadow)
+ /* src_slots > 0, therefore dst_slots is at least 2 */
+ origin_dst[dst_slots - 1] =
+ origin_dst[dst_slots - 2];
+ } else {
+ shadow = align_shadow_src[0];
+ skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8;
+ shadow = (shadow >> skip_bits) << skip_bits;
+ if (shadow)
+ origin_dst[0] = origin_dst[1];
+ }
+ }
+}
+
+depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id)
+{
+ unsigned long entries[3];
+ u32 extra_bits;
+ int depth;
+ bool uaf;
+
+ if (!id)
+ return id;
+ /*
+ * Make sure we have enough spare bits in @id to hold the UAF bit and
+ * the chain depth.
+ */
+ BUILD_BUG_ON(
+ (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1));
+
+ extra_bits = stack_depot_get_extra_bits(id);
+ depth = kmsan_depth_from_eb(extra_bits);
+ uaf = kmsan_uaf_from_eb(extra_bits);
+
+ /*
+ * Stop chaining origins once the depth reached KMSAN_MAX_ORIGIN_DEPTH.
+ * This mostly happens in the case structures with uninitialized padding
+ * are copied around many times. Origin chains for such structures are
+ * usually periodic, and it does not make sense to fully store them.
+ */
+ if (depth == KMSAN_MAX_ORIGIN_DEPTH)
+ return id;
+
+ depth++;
+ extra_bits = kmsan_extra_bits(depth, uaf);
+
+ entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN;
+ entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0);
+ entries[2] = id;
+ /*
+ * @entries is a local var in non-instrumented code, so KMSAN does not
+ * know it is initialized. Explicitly unpoison it to avoid false
+ * positives when __stack_depot_save() passes it to instrumented code.
+ */
+ kmsan_internal_unpoison_memory(entries, sizeof(entries), false);
+ return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits,
+ GFP_ATOMIC, true);
+}
+
+void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b,
+ u32 origin, bool checked)
+{
+ u64 address = (u64)addr;
+ void *shadow_start;
+ u32 *origin_start;
+ size_t pad = 0;
+
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
+ shadow_start = kmsan_get_metadata(addr, KMSAN_META_SHADOW);
+ if (!shadow_start) {
+ /*
+ * kmsan_metadata_is_contiguous() is true, so either all shadow
+ * and origin pages are NULL, or all are non-NULL.
+ */
+ if (checked) {
+ pr_err("%s: not memsetting %ld bytes starting at %px, because the shadow is NULL\n",
+ __func__, size, addr);
+ KMSAN_WARN_ON(true);
+ }
+ return;
+ }
+ __memset(shadow_start, b, size);
+
+ if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) {
+ pad = address % KMSAN_ORIGIN_SIZE;
+ address -= pad;
+ size += pad;
+ }
+ size = ALIGN(size, KMSAN_ORIGIN_SIZE);
+ origin_start =
+ (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN);
+
+ for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++)
+ origin_start[i] = origin;
+}
+
+struct page *kmsan_vmalloc_to_page_or_null(void *vaddr)
+{
+ struct page *page;
+
+ if (!kmsan_internal_is_vmalloc_addr(vaddr) &&
+ !kmsan_internal_is_module_addr(vaddr))
+ return NULL;
+ page = vmalloc_to_page(vaddr);
+ if (pfn_valid(page_to_pfn(page)))
+ return page;
+ else
+ return NULL;
+}
+
+void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr,
+ int reason)
+{
+ depot_stack_handle_t cur_origin = 0, new_origin = 0;
+ unsigned long addr64 = (unsigned long)addr;
+ depot_stack_handle_t *origin = NULL;
+ unsigned char *shadow = NULL;
+ int cur_off_start = -1;
+ int chunk_size;
+ size_t pos = 0;
+
+ if (!size)
+ return;
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size));
+ while (pos < size) {
+ chunk_size = min(size - pos,
+ PAGE_SIZE - ((addr64 + pos) % PAGE_SIZE));
+ shadow = kmsan_get_metadata((void *)(addr64 + pos),
+ KMSAN_META_SHADOW);
+ if (!shadow) {
+ /*
+ * This page is untracked. If there were uninitialized
+ * bytes before, report them.
+ */
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos - 1, user_addr,
+ reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = 0;
+ cur_off_start = -1;
+ pos += chunk_size;
+ continue;
+ }
+ for (int i = 0; i < chunk_size; i++) {
+ if (!shadow[i]) {
+ /*
+ * This byte is unpoisoned. If there were
+ * poisoned bytes before, report them.
+ */
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos + i - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = 0;
+ cur_off_start = -1;
+ continue;
+ }
+ origin = kmsan_get_metadata((void *)(addr64 + pos + i),
+ KMSAN_META_ORIGIN);
+ KMSAN_WARN_ON(!origin);
+ new_origin = *origin;
+ /*
+ * Encountered new origin - report the previous
+ * uninitialized range.
+ */
+ if (cur_origin != new_origin) {
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size,
+ cur_off_start, pos + i - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+ cur_origin = new_origin;
+ cur_off_start = pos + i;
+ }
+ }
+ pos += chunk_size;
+ }
+ KMSAN_WARN_ON(pos != size);
+ if (cur_origin) {
+ kmsan_enter_runtime();
+ kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1,
+ user_addr, reason);
+ kmsan_leave_runtime();
+ }
+}
+
+bool kmsan_metadata_is_contiguous(void *addr, size_t size)
+{
+ char *cur_shadow = NULL, *next_shadow = NULL, *cur_origin = NULL,
+ *next_origin = NULL;
+ u64 cur_addr = (u64)addr, next_addr = cur_addr + PAGE_SIZE;
+ depot_stack_handle_t *origin_p;
+ bool all_untracked = false;
+
+ if (!size)
+ return true;
+
+ /* The whole range belongs to the same page. */
+ if (ALIGN_DOWN(cur_addr + size - 1, PAGE_SIZE) ==
+ ALIGN_DOWN(cur_addr, PAGE_SIZE))
+ return true;
+
+ cur_shadow = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ false);
+ if (!cur_shadow)
+ all_untracked = true;
+ cur_origin = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ true);
+ if (all_untracked && cur_origin)
+ goto report;
+
+ for (; next_addr < (u64)addr + size;
+ cur_addr = next_addr, cur_shadow = next_shadow,
+ cur_origin = next_origin, next_addr += PAGE_SIZE) {
+ next_shadow = kmsan_get_metadata((void *)next_addr, false);
+ next_origin = kmsan_get_metadata((void *)next_addr, true);
+ if (all_untracked) {
+ if (next_shadow || next_origin)
+ goto report;
+ if (!next_shadow && !next_origin)
+ continue;
+ }
+ if (((u64)cur_shadow == ((u64)next_shadow - PAGE_SIZE)) &&
+ ((u64)cur_origin == ((u64)next_origin - PAGE_SIZE)))
+ continue;
+ goto report;
+ }
+ return true;
+
+report:
+ pr_err("%s: attempting to access two shadow page ranges.\n", __func__);
+ pr_err("Access of size %ld at %px.\n", size, addr);
+ pr_err("Addresses belonging to different ranges: %px and %px\n",
+ (void *)cur_addr, (void *)next_addr);
+ pr_err("page[0].shadow: %px, page[1].shadow: %px\n", cur_shadow,
+ next_shadow);
+ pr_err("page[0].origin: %px, page[1].origin: %px\n", cur_origin,
+ next_origin);
+ origin_p = kmsan_get_metadata(addr, KMSAN_META_ORIGIN);
+ if (origin_p) {
+ pr_err("Origin: %08x\n", *origin_p);
+ kmsan_print_origin(*origin_p);
+ } else {
+ pr_err("Origin: unavailable\n");
+ }
+ return false;
+}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
new file mode 100644
index 000000000000..35f6b6e6a908
--- /dev/null
+++ b/mm/kmsan/hooks.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN hooks for kernel subsystems.
+ *
+ * These functions handle creation of KMSAN metadata for memory allocations.
+ *
+ * Copyright (C) 2018-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <linux/cacheflush.h>
+#include <linux/dma-direction.h>
+#include <linux/gfp.h>
+#include <linux/kmsan.h>
+#include <linux/mm.h>
+#include <linux/mm_types.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/usb.h>
+
+#include "../internal.h"
+#include "../slab.h"
+#include "kmsan.h"
+
+/*
+ * Instrumented functions shouldn't be called under
+ * kmsan_enter_runtime()/kmsan_leave_runtime(), because this will lead to
+ * skipping effects of functions like memset() inside instrumented code.
+ */
+
+void kmsan_task_create(struct task_struct *task)
+{
+ kmsan_enter_runtime();
+ kmsan_internal_task_create(task);
+ kmsan_leave_runtime();
+}
+
+void kmsan_task_exit(struct task_struct *task)
+{
+ struct kmsan_ctx *ctx = &task->kmsan_ctx;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ctx->allow_reporting = false;
+}
+
+void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags)
+{
+ if (unlikely(object == NULL))
+ return;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ /*
+ * There's a ctor or this is an RCU cache - do nothing. The memory
+ * status hasn't changed since last use.
+ */
+ if (s->ctor || (s->flags & SLAB_TYPESAFE_BY_RCU))
+ return;
+
+ kmsan_enter_runtime();
+ if (flags & __GFP_ZERO)
+ kmsan_internal_unpoison_memory(object, s->object_size,
+ KMSAN_POISON_CHECK);
+ else
+ kmsan_internal_poison_memory(object, s->object_size, flags,
+ KMSAN_POISON_CHECK);
+ kmsan_leave_runtime();
+}
+
+void kmsan_slab_free(struct kmem_cache *s, void *object)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)))
+ return;
+ /*
+ * If there's a constructor, freed memory must remain in the same state
+ * until the next allocation. We cannot save its state to detect
+ * use-after-free bugs, instead we just keep it unpoisoned.
+ */
+ if (s->ctor)
+ return;
+ kmsan_enter_runtime();
+ kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
+{
+ if (unlikely(ptr == NULL))
+ return;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ if (flags & __GFP_ZERO)
+ kmsan_internal_unpoison_memory((void *)ptr, size,
+ /*checked*/ true);
+ else
+ kmsan_internal_poison_memory((void *)ptr, size, flags,
+ KMSAN_POISON_CHECK);
+ kmsan_leave_runtime();
+}
+
+void kmsan_kfree_large(const void *ptr)
+{
+ struct page *page;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ page = virt_to_head_page((void *)ptr);
+ KMSAN_WARN_ON(ptr != page_address(page));
+ kmsan_internal_poison_memory((void *)ptr,
+ PAGE_SIZE << compound_order(page),
+ GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+static unsigned long vmalloc_shadow(unsigned long addr)
+{
+ return (unsigned long)kmsan_get_metadata((void *)addr,
+ KMSAN_META_SHADOW);
+}
+
+static unsigned long vmalloc_origin(unsigned long addr)
+{
+ return (unsigned long)kmsan_get_metadata((void *)addr,
+ KMSAN_META_ORIGIN);
+}
+
+void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+ __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end));
+ __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end));
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+}
+
+/*
+ * This function creates new shadow/origin pages for the physical pages mapped
+ * into the virtual memory. If those physical pages already had shadow/origin,
+ * those are ignored.
+ */
+void kmsan_ioremap_page_range(unsigned long start, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot,
+ unsigned int page_shift)
+{
+ gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO;
+ struct page *shadow, *origin;
+ unsigned long off = 0;
+ int nr;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ nr = (end - start) / PAGE_SIZE;
+ kmsan_enter_runtime();
+ for (int i = 0; i < nr; i++, off += PAGE_SIZE) {
+ shadow = alloc_pages(gfp_mask, 1);
+ origin = alloc_pages(gfp_mask, 1);
+ __vmap_pages_range_noflush(
+ vmalloc_shadow(start + off),
+ vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow,
+ PAGE_SHIFT);
+ __vmap_pages_range_noflush(
+ vmalloc_origin(start + off),
+ vmalloc_origin(start + off + PAGE_SIZE), prot, &origin,
+ PAGE_SHIFT);
+ }
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+ kmsan_leave_runtime();
+}
+
+void kmsan_iounmap_page_range(unsigned long start, unsigned long end)
+{
+ unsigned long v_shadow, v_origin;
+ struct page *shadow, *origin;
+ int nr;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ nr = (end - start) / PAGE_SIZE;
+ kmsan_enter_runtime();
+ v_shadow = (unsigned long)vmalloc_shadow(start);
+ v_origin = (unsigned long)vmalloc_origin(start);
+ for (int i = 0; i < nr;
+ i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) {
+ shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow);
+ origin = kmsan_vmalloc_to_page_or_null((void *)v_origin);
+ __vunmap_range_noflush(v_shadow, vmalloc_shadow(end));
+ __vunmap_range_noflush(v_origin, vmalloc_origin(end));
+ if (shadow)
+ __free_pages(shadow, 1);
+ if (origin)
+ __free_pages(origin, 1);
+ }
+ flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end));
+ flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end));
+ kmsan_leave_runtime();
+}
+
+void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
+ size_t left)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ /*
+ * At this point we've copied the memory already. It's hard to check it
+ * before copying, as the size of actually copied buffer is unknown.
+ */
+
+ /* copy_to_user() may copy zero bytes. No need to check. */
+ if (!to_copy)
+ return;
+ /* Or maybe copy_to_user() failed to copy anything. */
+ if (to_copy <= left)
+ return;
+
+ ua_flags = user_access_save();
+ if ((u64)to < TASK_SIZE) {
+ /* This is a user memory access, check it. */
+ kmsan_internal_check_memory((void *)from, to_copy - left, to,
+ REASON_COPY_TO_USER);
+ } else {
+ /* Otherwise this is a kernel memory access. This happens when a
+ * compat syscall passes an argument allocated on the kernel
+ * stack to a real syscall.
+ * Don't check anything, just copy the shadow of the copied
+ * bytes.
+ */
+ kmsan_internal_memmove_metadata((void *)to, (void *)from,
+ to_copy - left);
+ }
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(kmsan_copy_to_user);
+
+/* Helper function to check an URB. */
+void kmsan_handle_urb(const struct urb *urb, bool is_out)
+{
+ if (!urb)
+ return;
+ if (is_out)
+ kmsan_internal_check_memory(urb->transfer_buffer,
+ urb->transfer_buffer_length,
+ /*user_addr*/ 0, REASON_SUBMIT_URB);
+ else
+ kmsan_internal_unpoison_memory(urb->transfer_buffer,
+ urb->transfer_buffer_length,
+ /*checked*/ false);
+}
+
+static void kmsan_handle_dma_page(const void *addr, size_t size,
+ enum dma_data_direction dir)
+{
+ switch (dir) {
+ case DMA_BIDIRECTIONAL:
+ kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+ kmsan_internal_unpoison_memory((void *)addr, size,
+ /*checked*/ false);
+ break;
+ case DMA_TO_DEVICE:
+ kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+ break;
+ case DMA_FROM_DEVICE:
+ kmsan_internal_unpoison_memory((void *)addr, size,
+ /*checked*/ false);
+ break;
+ case DMA_NONE:
+ break;
+ }
+}
+
+/* Helper function to handle DMA data transfers. */
+void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
+ enum dma_data_direction dir)
+{
+ u64 page_offset, to_go, addr;
+
+ if (PageHighMem(page))
+ return;
+ addr = (u64)page_address(page) + offset;
+ /*
+ * The kernel may occasionally give us adjacent DMA pages not belonging
+ * to the same allocation. Process them separately to avoid triggering
+ * internal KMSAN checks.
+ */
+ while (size > 0) {
+ page_offset = addr % PAGE_SIZE;
+ to_go = min(PAGE_SIZE - page_offset, (u64)size);
+ kmsan_handle_dma_page((void *)addr, to_go, dir);
+ addr += to_go;
+ size -= to_go;
+ }
+}
+
+void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
+ enum dma_data_direction dir)
+{
+ struct scatterlist *item;
+ int i;
+
+ for_each_sg(sg, item, nents, i)
+ kmsan_handle_dma(sg_page(item), item->offset, item->length,
+ dir);
+}
+
+/* Functions from kmsan-checks.h follow. */
+void kmsan_poison_memory(const void *address, size_t size, gfp_t flags)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ /* The users may want to poison/unpoison random memory. */
+ kmsan_internal_poison_memory((void *)address, size, flags,
+ KMSAN_POISON_NOCHECK);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(kmsan_poison_memory);
+
+void kmsan_unpoison_memory(const void *address, size_t size)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ua_flags = user_access_save();
+ kmsan_enter_runtime();
+ /* The users may want to poison/unpoison random memory. */
+ kmsan_internal_unpoison_memory((void *)address, size,
+ KMSAN_POISON_NOCHECK);
+ kmsan_leave_runtime();
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(kmsan_unpoison_memory);
+
+/*
+ * Version of kmsan_unpoison_memory() that can be called from within the KMSAN
+ * runtime.
+ *
+ * Non-instrumented IRQ entry functions receive struct pt_regs from assembly
+ * code. Those regs need to be unpoisoned, otherwise using them will result in
+ * false positives.
+ * Using kmsan_unpoison_memory() is not an option in entry code, because the
+ * return value of in_task() is inconsistent - as a result, certain calls to
+ * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that
+ * the registers are unpoisoned even if kmsan_in_runtime() is true in the early
+ * entry code.
+ */
+void kmsan_unpoison_entry_regs(const struct pt_regs *regs)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled)
+ return;
+
+ ua_flags = user_access_save();
+ kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs),
+ KMSAN_POISON_NOCHECK);
+ user_access_restore(ua_flags);
+}
+
+void kmsan_check_memory(const void *addr, size_t size)
+{
+ if (!kmsan_enabled)
+ return;
+ return kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0,
+ REASON_ANY);
+}
+EXPORT_SYMBOL(kmsan_check_memory);
diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c
new file mode 100644
index 000000000000..7fb794242fad
--- /dev/null
+++ b/mm/kmsan/init.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN initialization routines.
+ *
+ * Copyright (C) 2017-2021 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include "kmsan.h"
+
+#include <asm/sections.h>
+#include <linux/mm.h>
+#include <linux/memblock.h>
+
+#include "../internal.h"
+
+#define NUM_FUTURE_RANGES 128
+struct start_end_pair {
+ u64 start, end;
+};
+
+static struct start_end_pair start_end_pairs[NUM_FUTURE_RANGES] __initdata;
+static int future_index __initdata;
+
+/*
+ * Record a range of memory for which the metadata pages will be created once
+ * the page allocator becomes available.
+ */
+static void __init kmsan_record_future_shadow_range(void *start, void *end)
+{
+ u64 nstart = (u64)start, nend = (u64)end, cstart, cend;
+ bool merged = false;
+
+ KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES);
+ KMSAN_WARN_ON((nstart >= nend) || !nstart || !nend);
+ nstart = ALIGN_DOWN(nstart, PAGE_SIZE);
+ nend = ALIGN(nend, PAGE_SIZE);
+
+ /*
+ * Scan the existing ranges to see if any of them overlaps with
+ * [start, end). In that case, merge the two ranges instead of
+ * creating a new one.
+ * The number of ranges is less than 20, so there is no need to organize
+ * them into a more intelligent data structure.
+ */
+ for (int i = 0; i < future_index; i++) {
+ cstart = start_end_pairs[i].start;
+ cend = start_end_pairs[i].end;
+ if ((cstart < nstart && cend < nstart) ||
+ (cstart > nend && cend > nend))
+ /* ranges are disjoint - do not merge */
+ continue;
+ start_end_pairs[i].start = min(nstart, cstart);
+ start_end_pairs[i].end = max(nend, cend);
+ merged = true;
+ break;
+ }
+ if (merged)
+ return;
+ start_end_pairs[future_index].start = nstart;
+ start_end_pairs[future_index].end = nend;
+ future_index++;
+}
+
+/*
+ * Initialize the shadow for existing mappings during kernel initialization.
+ * These include kernel text/data sections, NODE_DATA and future ranges
+ * registered while creating other data (e.g. percpu).
+ *
+ * Allocations via memblock can be only done before slab is initialized.
+ */
+void __init kmsan_init_shadow(void)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
+ phys_addr_t p_start, p_end;
+ u64 loop;
+ int nid;
+
+ for_each_reserved_mem_range(loop, &p_start, &p_end)
+ kmsan_record_future_shadow_range(phys_to_virt(p_start),
+ phys_to_virt(p_end));
+ /* Allocate shadow for .data */
+ kmsan_record_future_shadow_range(_sdata, _edata);
+
+ for_each_online_node(nid)
+ kmsan_record_future_shadow_range(
+ NODE_DATA(nid), (char *)NODE_DATA(nid) + nd_size);
+
+ for (int i = 0; i < future_index; i++)
+ kmsan_init_alloc_meta_for_range(
+ (void *)start_end_pairs[i].start,
+ (void *)start_end_pairs[i].end);
+}
+
+struct metadata_page_pair {
+ struct page *shadow, *origin;
+};
+static struct metadata_page_pair held_back[MAX_ORDER] __initdata;
+
+/*
+ * Eager metadata allocation. When the memblock allocator is freeing pages to
+ * pagealloc, we use 2/3 of them as metadata for the remaining 1/3.
+ * We store the pointers to the returned blocks of pages in held_back[] grouped
+ * by their order: when kmsan_memblock_free_pages() is called for the first
+ * time with a certain order, it is reserved as a shadow block, for the second
+ * time - as an origin block. On the third time the incoming block receives its
+ * shadow and origin ranges from the previously saved shadow and origin blocks,
+ * after which held_back[order] can be used again.
+ *
+ * At the very end there may be leftover blocks in held_back[]. They are
+ * collected later by kmsan_memblock_discard().
+ */
+bool kmsan_memblock_free_pages(struct page *page, unsigned int order)
+{
+ struct page *shadow, *origin;
+
+ if (!held_back[order].shadow) {
+ held_back[order].shadow = page;
+ return false;
+ }
+ if (!held_back[order].origin) {
+ held_back[order].origin = page;
+ return false;
+ }
+ shadow = held_back[order].shadow;
+ origin = held_back[order].origin;
+ kmsan_setup_meta(page, shadow, origin, order);
+
+ held_back[order].shadow = NULL;
+ held_back[order].origin = NULL;
+ return true;
+}
+
+#define MAX_BLOCKS 8
+struct smallstack {
+ struct page *items[MAX_BLOCKS];
+ int index;
+ int order;
+};
+
+static struct smallstack collect = {
+ .index = 0,
+ .order = MAX_ORDER,
+};
+
+static void smallstack_push(struct smallstack *stack, struct page *pages)
+{
+ KMSAN_WARN_ON(stack->index == MAX_BLOCKS);
+ stack->items[stack->index] = pages;
+ stack->index++;
+}
+#undef MAX_BLOCKS
+
+static struct page *smallstack_pop(struct smallstack *stack)
+{
+ struct page *ret;
+
+ KMSAN_WARN_ON(stack->index == 0);
+ stack->index--;
+ ret = stack->items[stack->index];
+ stack->items[stack->index] = NULL;
+ return ret;
+}
+
+static void do_collection(void)
+{
+ struct page *page, *shadow, *origin;
+
+ while (collect.index >= 3) {
+ page = smallstack_pop(&collect);
+ shadow = smallstack_pop(&collect);
+ origin = smallstack_pop(&collect);
+ kmsan_setup_meta(page, shadow, origin, collect.order);
+ __free_pages_core(page, collect.order);
+ }
+}
+
+static void collect_split(void)
+{
+ struct smallstack tmp = {
+ .order = collect.order - 1,
+ .index = 0,
+ };
+ struct page *page;
+
+ if (!collect.order)
+ return;
+ while (collect.index) {
+ page = smallstack_pop(&collect);
+ smallstack_push(&tmp, &page[0]);
+ smallstack_push(&tmp, &page[1 << tmp.order]);
+ }
+ __memcpy(&collect, &tmp, sizeof(tmp));
+}
+
+/*
+ * Memblock is about to go away. Split the page blocks left over in held_back[]
+ * and return 1/3 of that memory to the system.
+ */
+static void kmsan_memblock_discard(void)
+{
+ /*
+ * For each order=N:
+ * - push held_back[N].shadow and .origin to @collect;
+ * - while there are >= 3 elements in @collect, do garbage collection:
+ * - pop 3 ranges from @collect;
+ * - use two of them as shadow and origin for the third one;
+ * - repeat;
+ * - split each remaining element from @collect into 2 ranges of
+ * order=N-1,
+ * - repeat.
+ */
+ collect.order = MAX_ORDER - 1;
+ for (int i = MAX_ORDER - 1; i >= 0; i--) {
+ if (held_back[i].shadow)
+ smallstack_push(&collect, held_back[i].shadow);
+ if (held_back[i].origin)
+ smallstack_push(&collect, held_back[i].origin);
+ held_back[i].shadow = NULL;
+ held_back[i].origin = NULL;
+ do_collection();
+ collect_split();
+ }
+}
+
+void __init kmsan_init_runtime(void)
+{
+ /* Assuming current is init_task */
+ kmsan_internal_task_create(current);
+ kmsan_memblock_discard();
+ pr_info("Starting KernelMemorySanitizer\n");
+ pr_info("ATTENTION: KMSAN is a debugging tool! Do not use it on production machines!\n");
+ kmsan_enabled = true;
+}
diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c
new file mode 100644
index 000000000000..271f135f97a1
--- /dev/null
+++ b/mm/kmsan/instrumentation.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN compiler API.
+ *
+ * This file implements __msan_XXX hooks that Clang inserts into the code
+ * compiled with -fsanitize=kernel-memory.
+ * See Documentation/dev-tools/kmsan.rst for more information on how KMSAN
+ * instrumentation works.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include "kmsan.h"
+#include <linux/gfp.h>
+#include <linux/kmsan_string.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+static inline bool is_bad_asm_addr(void *addr, uintptr_t size, bool is_store)
+{
+ if ((u64)addr < TASK_SIZE)
+ return true;
+ if (!kmsan_get_metadata(addr, KMSAN_META_SHADOW))
+ return true;
+ return false;
+}
+
+static inline struct shadow_origin_ptr
+get_shadow_origin_ptr(void *addr, u64 size, bool store)
+{
+ unsigned long ua_flags = user_access_save();
+ struct shadow_origin_ptr ret;
+
+ ret = kmsan_get_shadow_origin_ptr(addr, size, store);
+ user_access_restore(ua_flags);
+ return ret;
+}
+
+/* Get shadow and origin pointers for a memory load with non-standard size. */
+struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr,
+ uintptr_t size)
+{
+ return get_shadow_origin_ptr(addr, size, /*store*/ false);
+}
+EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n);
+
+/* Get shadow and origin pointers for a memory store with non-standard size. */
+struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr,
+ uintptr_t size)
+{
+ return get_shadow_origin_ptr(addr, size, /*store*/ true);
+}
+EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n);
+
+/*
+ * Declare functions that obtain shadow/origin pointers for loads and stores
+ * with fixed size.
+ */
+#define DECLARE_METADATA_PTR_GETTER(size) \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \
+ void *addr) \
+ { \
+ return get_shadow_origin_ptr(addr, size, /*store*/ false); \
+ } \
+ EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \
+ struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \
+ void *addr) \
+ { \
+ return get_shadow_origin_ptr(addr, size, /*store*/ true); \
+ } \
+ EXPORT_SYMBOL(__msan_metadata_ptr_for_store_##size)
+
+DECLARE_METADATA_PTR_GETTER(1);
+DECLARE_METADATA_PTR_GETTER(2);
+DECLARE_METADATA_PTR_GETTER(4);
+DECLARE_METADATA_PTR_GETTER(8);
+
+/*
+ * Handle a memory store performed by inline assembly. KMSAN conservatively
+ * attempts to unpoison the outputs of asm() directives to prevent false
+ * positives caused by missed stores.
+ */
+void __msan_instrument_asm_store(void *addr, uintptr_t size)
+{
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ua_flags = user_access_save();
+ /*
+ * Most of the accesses are below 32 bytes. The two exceptions so far
+ * are clwb() (64 bytes) and FPU state (512 bytes).
+ * It's unlikely that the assembly will touch more than 512 bytes.
+ */
+ if (size > 512) {
+ WARN_ONCE(1, "assembly store size too big: %ld\n", size);
+ size = 8;
+ }
+ if (is_bad_asm_addr(addr, size, /*is_store*/ true)) {
+ user_access_restore(ua_flags);
+ return;
+ }
+ kmsan_enter_runtime();
+ /* Unpoisoning the memory on best effort. */
+ kmsan_internal_unpoison_memory(addr, size, /*checked*/ false);
+ kmsan_leave_runtime();
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(__msan_instrument_asm_store);
+
+/*
+ * KMSAN instrumentation pass replaces LLVM memcpy, memmove and memset
+ * intrinsics with calls to respective __msan_ functions. We use
+ * get_param0_metadata() and set_retval_metadata() to store the shadow/origin
+ * values for the destination argument of these functions and use them for the
+ * functions' return values.
+ */
+static inline void get_param0_metadata(u64 *shadow,
+ depot_stack_handle_t *origin)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ *shadow = *(u64 *)(ctx->cstate.param_tls);
+ *origin = ctx->cstate.param_origin_tls[0];
+}
+
+static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ *(u64 *)(ctx->cstate.retval_tls) = shadow;
+ ctx->cstate.retval_origin_tls = origin;
+}
+
+/* Handle llvm.memmove intrinsic. */
+void *__msan_memmove(void *dst, const void *src, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memmove(dst, src, n);
+ if (!n)
+ /* Some people call memmove() with zero length. */
+ return result;
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ kmsan_internal_memmove_metadata(dst, (void *)src, n);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memmove);
+
+/* Handle llvm.memcpy intrinsic. */
+void *__msan_memcpy(void *dst, const void *src, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memcpy(dst, src, n);
+ if (!n)
+ /* Some people call memcpy() with zero length. */
+ return result;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ /* Using memmove instead of memcpy doesn't affect correctness. */
+ kmsan_internal_memmove_metadata(dst, (void *)src, n);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memcpy);
+
+/* Handle llvm.memset intrinsic. */
+void *__msan_memset(void *dst, int c, uintptr_t n)
+{
+ depot_stack_handle_t origin;
+ void *result;
+ u64 shadow;
+
+ get_param0_metadata(&shadow, &origin);
+ result = __memset(dst, c, n);
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return result;
+
+ kmsan_enter_runtime();
+ /*
+ * Clang doesn't pass parameter metadata here, so it is impossible to
+ * use shadow of @c to set up the shadow for @dst.
+ */
+ kmsan_internal_unpoison_memory(dst, n, /*checked*/ false);
+ kmsan_leave_runtime();
+
+ set_retval_metadata(shadow, origin);
+ return result;
+}
+EXPORT_SYMBOL(__msan_memset);
+
+/*
+ * Create a new origin from an old one. This is done when storing an
+ * uninitialized value to memory. When reporting an error, KMSAN unrolls and
+ * prints the whole chain of stores that preceded the use of this value.
+ */
+depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin)
+{
+ depot_stack_handle_t ret = 0;
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return ret;
+
+ ua_flags = user_access_save();
+
+ /* Creating new origins may allocate memory. */
+ kmsan_enter_runtime();
+ ret = kmsan_internal_chain_origin(origin);
+ kmsan_leave_runtime();
+ user_access_restore(ua_flags);
+ return ret;
+}
+EXPORT_SYMBOL(__msan_chain_origin);
+
+/* Poison a local variable when entering a function. */
+void __msan_poison_alloca(void *address, uintptr_t size, char *descr)
+{
+ depot_stack_handle_t handle;
+ unsigned long entries[4];
+ unsigned long ua_flags;
+
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ ua_flags = user_access_save();
+ entries[0] = KMSAN_ALLOCA_MAGIC_ORIGIN;
+ entries[1] = (u64)descr;
+ entries[2] = (u64)__builtin_return_address(0);
+ /*
+ * With frame pointers enabled, it is possible to quickly fetch the
+ * second frame of the caller stack without calling the unwinder.
+ * Without them, simply do not bother.
+ */
+ if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER))
+ entries[3] = (u64)__builtin_return_address(1);
+ else
+ entries[3] = 0;
+
+ /* stack_depot_save() may allocate memory. */
+ kmsan_enter_runtime();
+ handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC);
+ kmsan_leave_runtime();
+
+ kmsan_internal_set_shadow_origin(address, size, -1, handle,
+ /*checked*/ true);
+ user_access_restore(ua_flags);
+}
+EXPORT_SYMBOL(__msan_poison_alloca);
+
+/* Unpoison a local variable. */
+void __msan_unpoison_alloca(void *address, uintptr_t size)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+
+ kmsan_enter_runtime();
+ kmsan_internal_unpoison_memory(address, size, /*checked*/ true);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(__msan_unpoison_alloca);
+
+/*
+ * Report that an uninitialized value with the given origin was used in a way
+ * that constituted undefined behavior.
+ */
+void __msan_warning(u32 origin)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ kmsan_report(origin, /*address*/ 0, /*size*/ 0,
+ /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ 0,
+ REASON_ANY);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(__msan_warning);
+
+/*
+ * At the beginning of an instrumented function, obtain the pointer to
+ * `struct kmsan_context_state` holding the metadata for function parameters.
+ */
+struct kmsan_context_state *__msan_get_context_state(void)
+{
+ return &kmsan_get_context()->cstate;
+}
+EXPORT_SYMBOL(__msan_get_context_state);
diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h
new file mode 100644
index 000000000000..a14744205435
--- /dev/null
+++ b/mm/kmsan/kmsan.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Functions used by the KMSAN runtime.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#ifndef __MM_KMSAN_KMSAN_H
+#define __MM_KMSAN_KMSAN_H
+
+#include <asm/pgtable_64_types.h>
+#include <linux/irqflags.h>
+#include <linux/sched.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+
+#define KMSAN_ALLOCA_MAGIC_ORIGIN 0xabcd0100
+#define KMSAN_CHAIN_MAGIC_ORIGIN 0xabcd0200
+
+#define KMSAN_POISON_NOCHECK 0x0
+#define KMSAN_POISON_CHECK 0x1
+#define KMSAN_POISON_FREE 0x2
+
+#define KMSAN_ORIGIN_SIZE 4
+#define KMSAN_MAX_ORIGIN_DEPTH 7
+
+#define KMSAN_STACK_DEPTH 64
+
+#define KMSAN_META_SHADOW (false)
+#define KMSAN_META_ORIGIN (true)
+
+extern bool kmsan_enabled;
+extern int panic_on_kmsan;
+
+/*
+ * KMSAN performs a lot of consistency checks that are currently enabled by
+ * default. BUG_ON is normally discouraged in the kernel, unless used for
+ * debugging, but KMSAN itself is a debugging tool, so it makes little sense to
+ * recover if something goes wrong.
+ */
+#define KMSAN_WARN_ON(cond) \
+ ({ \
+ const bool __cond = WARN_ON(cond); \
+ if (unlikely(__cond)) { \
+ WRITE_ONCE(kmsan_enabled, false); \
+ if (panic_on_kmsan) { \
+ /* Can't call panic() here because */ \
+ /* of uaccess checks. */ \
+ BUG(); \
+ } \
+ } \
+ __cond; \
+ })
+
+/*
+ * A pair of metadata pointers to be returned by the instrumentation functions.
+ */
+struct shadow_origin_ptr {
+ void *shadow, *origin;
+};
+
+struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size,
+ bool store);
+void *kmsan_get_metadata(void *addr, bool is_origin);
+void __init kmsan_init_alloc_meta_for_range(void *start, void *end);
+
+enum kmsan_bug_reason {
+ REASON_ANY,
+ REASON_COPY_TO_USER,
+ REASON_SUBMIT_URB,
+};
+
+void kmsan_print_origin(depot_stack_handle_t origin);
+
+/**
+ * kmsan_report() - Report a use of uninitialized value.
+ * @origin: Stack ID of the uninitialized value.
+ * @address: Address at which the memory access happens.
+ * @size: Memory access size.
+ * @off_first: Offset (from @address) of the first byte to be reported.
+ * @off_last: Offset (from @address) of the last byte to be reported.
+ * @user_addr: When non-NULL, denotes the userspace address to which the kernel
+ * is leaking data.
+ * @reason: Error type from enum kmsan_bug_reason.
+ *
+ * kmsan_report() prints an error message for a consequent group of bytes
+ * sharing the same origin. If an uninitialized value is used in a comparison,
+ * this function is called once without specifying the addresses. When checking
+ * a memory range, KMSAN may call kmsan_report() multiple times with the same
+ * @address, @size, @user_addr and @reason, but different @off_first and
+ * @off_last corresponding to different @origin values.
+ */
+void kmsan_report(depot_stack_handle_t origin, void *address, int size,
+ int off_first, int off_last, const void *user_addr,
+ enum kmsan_bug_reason reason);
+
+DECLARE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
+
+static __always_inline struct kmsan_ctx *kmsan_get_context(void)
+{
+ return in_task() ? &current->kmsan_ctx : raw_cpu_ptr(&kmsan_percpu_ctx);
+}
+
+/*
+ * When a compiler hook or KMSAN runtime function is invoked, it may make a
+ * call to instrumented code and eventually call itself recursively. To avoid
+ * that, we guard the runtime entry regions with
+ * kmsan_enter_runtime()/kmsan_leave_runtime() and exit the hook if
+ * kmsan_in_runtime() is true.
+ *
+ * Non-runtime code may occasionally get executed in nested IRQs from the
+ * runtime code (e.g. when called via smp_call_function_single()). Because some
+ * KMSAN routines may take locks (e.g. for memory allocation), we conservatively
+ * bail out instead of calling them. To minimize the effect of this (potentially
+ * missing initialization events) kmsan_in_runtime() is not checked in
+ * non-blocking runtime functions.
+ */
+static __always_inline bool kmsan_in_runtime(void)
+{
+ if ((hardirq_count() >> HARDIRQ_SHIFT) > 1)
+ return true;
+ if (in_nmi())
+ return true;
+ return kmsan_get_context()->kmsan_in_runtime;
+}
+
+static __always_inline void kmsan_enter_runtime(void)
+{
+ struct kmsan_ctx *ctx;
+
+ ctx = kmsan_get_context();
+ KMSAN_WARN_ON(ctx->kmsan_in_runtime++);
+}
+
+static __always_inline void kmsan_leave_runtime(void)
+{
+ struct kmsan_ctx *ctx = kmsan_get_context();
+
+ KMSAN_WARN_ON(--ctx->kmsan_in_runtime);
+}
+
+depot_stack_handle_t kmsan_save_stack(void);
+depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
+ unsigned int extra_bits);
+
+/*
+ * Pack and unpack the origin chain depth and UAF flag to/from the extra bits
+ * provided by the stack depot.
+ * The UAF flag is stored in the lowest bit, followed by the depth in the upper
+ * bits.
+ * set_dsh_extra_bits() is responsible for clamping the value.
+ */
+static __always_inline unsigned int kmsan_extra_bits(unsigned int depth,
+ bool uaf)
+{
+ return (depth << 1) | uaf;
+}
+
+static __always_inline bool kmsan_uaf_from_eb(unsigned int extra_bits)
+{
+ return extra_bits & 1;
+}
+
+static __always_inline unsigned int kmsan_depth_from_eb(unsigned int extra_bits)
+{
+ return extra_bits >> 1;
+}
+
+/*
+ * kmsan_internal_ functions are supposed to be very simple and not require the
+ * kmsan_in_runtime() checks.
+ */
+void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n);
+void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags,
+ unsigned int poison_flags);
+void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked);
+void kmsan_internal_set_shadow_origin(void *address, size_t size, int b,
+ u32 origin, bool checked);
+depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id);
+
+void kmsan_internal_task_create(struct task_struct *task);
+
+bool kmsan_metadata_is_contiguous(void *addr, size_t size);
+void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr,
+ int reason);
+
+struct page *kmsan_vmalloc_to_page_or_null(void *vaddr);
+void kmsan_setup_meta(struct page *page, struct page *shadow,
+ struct page *origin, int order);
+
+/*
+ * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are
+ * non-instrumented versions of is_module_address() and is_vmalloc_addr() that
+ * are safe to call from KMSAN runtime without recursion.
+ */
+static inline bool kmsan_internal_is_module_addr(void *vaddr)
+{
+ return ((u64)vaddr >= MODULES_VADDR) && ((u64)vaddr < MODULES_END);
+}
+
+static inline bool kmsan_internal_is_vmalloc_addr(void *addr)
+{
+ return ((u64)addr >= VMALLOC_START) && ((u64)addr < VMALLOC_END);
+}
+
+#endif /* __MM_KMSAN_KMSAN_H */
diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
new file mode 100644
index 000000000000..9a29ea2dbfb9
--- /dev/null
+++ b/mm/kmsan/kmsan_test.c
@@ -0,0 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test cases for KMSAN.
+ * For each test case checks the presence (or absence) of generated reports.
+ * Relies on 'console' tracepoint to capture reports as they appear in the
+ * kernel log.
+ *
+ * Copyright (C) 2021-2022, Google LLC.
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <kunit/test.h>
+#include "kmsan.h"
+
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kmsan.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <trace/events/printk.h>
+
+static DEFINE_PER_CPU(int, per_cpu_var);
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock;
+ bool available;
+ bool ignore; /* Stop console output collection. */
+ char header[256];
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Probe for console output: obtains observed lines of interest. */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+
+ if (observed.ignore)
+ return;
+ spin_lock_irqsave(&observed.lock, flags);
+
+ if (strnstr(buf, "BUG: KMSAN: ", len)) {
+ /*
+ * KMSAN report and related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.header, buf,
+ min(len + 1, sizeof(observed.header)));
+ WRITE_ONCE(observed.available, true);
+ observed.ignore = true;
+ }
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+static bool report_available(void)
+{
+ return READ_ONCE(observed.available);
+}
+
+/* Information we expect in a report. */
+struct expect_report {
+ const char *error_type; /* Error type. */
+ /*
+ * Kernel symbol from the error header, or NULL if no report is
+ * expected.
+ */
+ const char *symbol;
+};
+
+/* Check observed report matches information in @r. */
+static bool report_matches(const struct expect_report *r)
+{
+ typeof(observed.header) expected_header;
+ unsigned long flags;
+ bool ret = false;
+ const char *end;
+ char *cur;
+
+ /* Doubled-checked locking. */
+ if (!report_available() || !r->symbol)
+ return (!report_available() && !r->symbol);
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expected_header;
+ end = &expected_header[sizeof(expected_header) - 1];
+
+ cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type);
+
+ scnprintf(cur, end - cur, " in %s", r->symbol);
+ /* The exact offset won't match, remove it; also strip module name. */
+ cur = strchr(expected_header, '+');
+ if (cur)
+ *cur = '\0';
+
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.header, expected_header);
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ return ret;
+}
+
+/* ===== Test cases ===== */
+
+/* Prevent replacing branch with select in LLVM. */
+static noinline void check_true(char *arg)
+{
+ pr_info("%s is true\n", arg);
+}
+
+static noinline void check_false(char *arg)
+{
+ pr_info("%s is false\n", arg);
+}
+
+#define USE(x) \
+ do { \
+ if (x) \
+ check_true(#x); \
+ else \
+ check_false(#x); \
+ } while (0)
+
+#define EXPECTATION_ETYPE_FN(e, reason, fn) \
+ struct expect_report e = { \
+ .error_type = reason, \
+ .symbol = fn, \
+ }
+
+#define EXPECTATION_NO_REPORT(e) EXPECTATION_ETYPE_FN(e, NULL, NULL)
+#define EXPECTATION_UNINIT_VALUE_FN(e, fn) \
+ EXPECTATION_ETYPE_FN(e, "uninit-value", fn)
+#define EXPECTATION_UNINIT_VALUE(e) EXPECTATION_UNINIT_VALUE_FN(e, __func__)
+#define EXPECTATION_USE_AFTER_FREE(e) \
+ EXPECTATION_ETYPE_FN(e, "use-after-free", __func__)
+
+/* Test case: ensure that kmalloc() returns uninitialized memory. */
+static void test_uninit_kmalloc(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ int *ptr;
+
+ kunit_info(test, "uninitialized kmalloc test (UMR report)\n");
+ ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that kmalloc'ed memory becomes initialized after memset().
+ */
+static void test_init_kmalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int *ptr;
+
+ kunit_info(test, "initialized kmalloc test (no reports)\n");
+ ptr = kmalloc(sizeof(*ptr), GFP_KERNEL);
+ memset(ptr, 0, sizeof(*ptr));
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that kzalloc() returns initialized memory. */
+static void test_init_kzalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int *ptr;
+
+ kunit_info(test, "initialized kzalloc test (no reports)\n");
+ ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+ USE(*ptr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that local variables are uninitialized by default. */
+static void test_uninit_stack_var(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile int cond;
+
+ kunit_info(test, "uninitialized stack variable (UMR report)\n");
+ USE(cond);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that local variables with initializers are initialized. */
+static void test_init_stack_var(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ volatile int cond = 1;
+
+ kunit_info(test, "initialized stack variable (no reports)\n");
+ USE(cond);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static noinline void two_param_fn_2(int arg1, int arg2)
+{
+ USE(arg1);
+ USE(arg2);
+}
+
+static noinline void one_param_fn(int arg)
+{
+ two_param_fn_2(arg, arg);
+ USE(arg);
+}
+
+static noinline void two_param_fn(int arg1, int arg2)
+{
+ int init = 0;
+
+ one_param_fn(init);
+ USE(arg1);
+ USE(arg2);
+}
+
+static void test_params(struct kunit *test)
+{
+#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL
+ /*
+ * With eager param/retval checking enabled, KMSAN will report an error
+ * before the call to two_param_fn().
+ */
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_params");
+#else
+ EXPECTATION_UNINIT_VALUE_FN(expect, "two_param_fn");
+#endif
+ volatile int uninit, init = 1;
+
+ kunit_info(test,
+ "uninit passed through a function parameter (UMR report)\n");
+ two_param_fn(uninit, init);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static int signed_sum3(int a, int b, int c)
+{
+ return a + b + c;
+}
+
+/*
+ * Test case: ensure that uninitialized values are tracked through function
+ * arguments.
+ */
+static void test_uninit_multiple_params(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile char b = 3, c;
+ volatile int a;
+
+ kunit_info(test, "uninitialized local passed to fn (UMR report)\n");
+ USE(signed_sum3(a, b, c));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Helper function to make an array uninitialized. */
+static noinline void do_uninit_local_array(char *array, int start, int stop)
+{
+ volatile char uninit;
+
+ for (int i = start; i < stop; i++)
+ array[i] = uninit;
+}
+
+/*
+ * Test case: ensure kmsan_check_memory() reports an error when checking
+ * uninitialized memory.
+ */
+static void test_uninit_kmsan_check_memory(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_uninit_kmsan_check_memory");
+ volatile char local_array[8];
+
+ kunit_info(
+ test,
+ "kmsan_check_memory() called on uninit local (UMR report)\n");
+ do_uninit_local_array((char *)local_array, 5, 7);
+
+ kmsan_check_memory((char *)local_array, 8);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: check that a virtual memory range created with vmap() from
+ * initialized pages is still considered as initialized.
+ */
+static void test_init_kmsan_vmap_vunmap(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ const int npages = 2;
+ struct page **pages;
+ void *vbuf;
+
+ kunit_info(test, "pages initialized via vmap (no reports)\n");
+
+ pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
+ for (int i = 0; i < npages; i++)
+ pages[i] = alloc_page(GFP_KERNEL);
+ vbuf = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
+ memset(vbuf, 0xfe, npages * PAGE_SIZE);
+ for (int i = 0; i < npages; i++)
+ kmsan_check_memory(page_address(pages[i]), PAGE_SIZE);
+
+ if (vbuf)
+ vunmap(vbuf);
+ for (int i = 0; i < npages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memset() can initialize a buffer allocated via
+ * vmalloc().
+ */
+static void test_init_vmalloc(struct kunit *test)
+{
+ EXPECTATION_NO_REPORT(expect);
+ int npages = 8;
+ char *buf;
+
+ kunit_info(test, "vmalloc buffer can be initialized (no reports)\n");
+ buf = vmalloc(PAGE_SIZE * npages);
+ buf[0] = 1;
+ memset(buf, 0xfe, PAGE_SIZE * npages);
+ USE(buf[0]);
+ for (int i = 0; i < npages; i++)
+ kmsan_check_memory(&buf[PAGE_SIZE * i], PAGE_SIZE);
+ vfree(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Test case: ensure that use-after-free reporting works. */
+static void test_uaf(struct kunit *test)
+{
+ EXPECTATION_USE_AFTER_FREE(expect);
+ volatile int value;
+ volatile int *var;
+
+ kunit_info(test, "use-after-free in kmalloc-ed buffer (UMR report)\n");
+ var = kmalloc(80, GFP_KERNEL);
+ var[3] = 0xfeedface;
+ kfree((int *)var);
+ /* Copy the invalid value before checking it. */
+ value = var[3];
+ USE(value);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that uninitialized values are propagated through per-CPU
+ * memory.
+ */
+static void test_percpu_propagate(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE(expect);
+ volatile int uninit, check;
+
+ kunit_info(test,
+ "uninit local stored to per_cpu memory (UMR report)\n");
+
+ this_cpu_write(per_cpu_var, uninit);
+ check = this_cpu_read(per_cpu_var);
+ USE(check);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that passing uninitialized values to printk() leads to an
+ * error report.
+ */
+static void test_printk(struct kunit *test)
+{
+#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL
+ /*
+ * With eager param/retval checking enabled, KMSAN will report an error
+ * before the call to pr_info().
+ */
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_printk");
+#else
+ EXPECTATION_UNINIT_VALUE_FN(expect, "number");
+#endif
+ volatile int uninit;
+
+ kunit_info(test, "uninit local passed to pr_info() (UMR report)\n");
+ pr_info("%px contains %d\n", &uninit, uninit);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and `dst`.
+ */
+static void test_memcpy_aligned_to_aligned(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_aligned");
+ volatile int uninit_src;
+ volatile int dst = 0;
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to aligned dst (UMR report)\n");
+ memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)&dst, sizeof(dst));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and unaligned `dst`.
+ *
+ * Copying aligned 4-byte value to an unaligned one leads to touching two
+ * aligned 4-byte values. This test case checks that KMSAN correctly reports an
+ * error on the first of the two values.
+ */
+static void test_memcpy_aligned_to_unaligned(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_unaligned");
+ volatile int uninit_src;
+ volatile char dst[8] = { 0 };
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n");
+ memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)dst, 4);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test case: ensure that memcpy() correctly copies uninitialized values between
+ * aligned `src` and unaligned `dst`.
+ *
+ * Copying aligned 4-byte value to an unaligned one leads to touching two
+ * aligned 4-byte values. This test case checks that KMSAN correctly reports an
+ * error on the second of the two values.
+ */
+static void test_memcpy_aligned_to_unaligned2(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect,
+ "test_memcpy_aligned_to_unaligned2");
+ volatile int uninit_src;
+ volatile char dst[8] = { 0 };
+
+ kunit_info(
+ test,
+ "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n");
+ memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src));
+ kmsan_check_memory((void *)&dst[4], sizeof(uninit_src));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static noinline void fibonacci(int *array, int size, int start) {
+ if (start < 2 || (start == size))
+ return;
+ array[start] = array[start - 1] + array[start - 2];
+ fibonacci(array, size, start + 1);
+}
+
+static void test_long_origin_chain(struct kunit *test)
+{
+ EXPECTATION_UNINIT_VALUE_FN(expect,
+ "test_long_origin_chain");
+ /* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */
+ volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2];
+ int last = ARRAY_SIZE(accum) - 1;
+
+ kunit_info(
+ test,
+ "origin chain exceeding KMSAN_MAX_ORIGIN_DEPTH (UMR report)\n");
+ /*
+ * We do not set accum[1] to 0, so the uninitializedness will be carried
+ * over to accum[2..last].
+ */
+ accum[0] = 1;
+ fibonacci((int *)accum, ARRAY_SIZE(accum), 2);
+ kmsan_check_memory((void *)&accum[last], sizeof(int));
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+static struct kunit_case kmsan_test_cases[] = {
+ KUNIT_CASE(test_uninit_kmalloc),
+ KUNIT_CASE(test_init_kmalloc),
+ KUNIT_CASE(test_init_kzalloc),
+ KUNIT_CASE(test_uninit_stack_var),
+ KUNIT_CASE(test_init_stack_var),
+ KUNIT_CASE(test_params),
+ KUNIT_CASE(test_uninit_multiple_params),
+ KUNIT_CASE(test_uninit_kmsan_check_memory),
+ KUNIT_CASE(test_init_kmsan_vmap_vunmap),
+ KUNIT_CASE(test_init_vmalloc),
+ KUNIT_CASE(test_uaf),
+ KUNIT_CASE(test_percpu_propagate),
+ KUNIT_CASE(test_printk),
+ KUNIT_CASE(test_memcpy_aligned_to_aligned),
+ KUNIT_CASE(test_memcpy_aligned_to_unaligned),
+ KUNIT_CASE(test_memcpy_aligned_to_unaligned2),
+ KUNIT_CASE(test_long_origin_chain),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+static int test_init(struct kunit *test)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ observed.header[0] = '\0';
+ observed.ignore = false;
+ observed.available = false;
+ spin_unlock_irqrestore(&observed.lock, flags);
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+}
+
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ check_trace_callback_type_console(probe_console);
+ if (!strcmp(tp->name, "console"))
+ WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+ if (!strcmp(tp->name, "console"))
+ tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+static int kmsan_suite_init(struct kunit_suite *suite)
+{
+ /*
+ * Because we want to be able to build the test as a module, we need to
+ * iterate through all known tracepoints, since the static registration
+ * won't work here.
+ */
+ for_each_kernel_tracepoint(register_tracepoints, NULL);
+ return 0;
+}
+
+static void kmsan_suite_exit(struct kunit_suite *suite)
+{
+ for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ tracepoint_synchronize_unregister();
+}
+
+static struct kunit_suite kmsan_test_suite = {
+ .name = "kmsan",
+ .test_cases = kmsan_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kmsan_suite_init,
+ .suite_exit = kmsan_suite_exit,
+};
+kunit_test_suites(&kmsan_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Alexander Potapenko <glider@google.com>");
diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c
new file mode 100644
index 000000000000..02736ec757f2
--- /dev/null
+++ b/mm/kmsan/report.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN error reporting routines.
+ *
+ * Copyright (C) 2019-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <linux/console.h>
+#include <linux/moduleparam.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/uaccess.h>
+
+#include "kmsan.h"
+
+static DEFINE_RAW_SPINLOCK(kmsan_report_lock);
+#define DESCR_SIZE 128
+/* Protected by kmsan_report_lock */
+static char report_local_descr[DESCR_SIZE];
+int panic_on_kmsan __read_mostly;
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kmsan."
+module_param_named(panic, panic_on_kmsan, int, 0);
+
+/*
+ * Skip internal KMSAN frames.
+ */
+static int get_stack_skipnr(const unsigned long stack_entries[],
+ int num_entries)
+{
+ int len, skip;
+ char buf[64];
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ len = scnprintf(buf, sizeof(buf), "%ps",
+ (void *)stack_entries[skip]);
+
+ /* Never show __msan_* or kmsan_* functions. */
+ if ((strnstr(buf, "__msan_", len) == buf) ||
+ (strnstr(buf, "kmsan_", len) == buf))
+ continue;
+
+ /*
+ * No match for runtime functions -- @skip entries to skip to
+ * get to first frame of interest.
+ */
+ break;
+ }
+
+ return skip;
+}
+
+/*
+ * Currently the descriptions of locals generated by Clang look as follows:
+ * ----local_name@function_name
+ * We want to print only the name of the local, as other information in that
+ * description can be confusing.
+ * The meaningful part of the description is copied to a global buffer to avoid
+ * allocating memory.
+ */
+static char *pretty_descr(char *descr)
+{
+ int pos = 0, len = strlen(descr);
+
+ for (int i = 0; i < len; i++) {
+ if (descr[i] == '@')
+ break;
+ if (descr[i] == '-')
+ continue;
+ report_local_descr[pos] = descr[i];
+ if (pos + 1 == DESCR_SIZE)
+ break;
+ pos++;
+ }
+ report_local_descr[pos] = 0;
+ return report_local_descr;
+}
+
+void kmsan_print_origin(depot_stack_handle_t origin)
+{
+ unsigned long *entries = NULL, *chained_entries = NULL;
+ unsigned int nr_entries, chained_nr_entries, skipnr;
+ void *pc1 = NULL, *pc2 = NULL;
+ depot_stack_handle_t head;
+ unsigned long magic;
+ char *descr = NULL;
+ unsigned int depth;
+
+ if (!origin)
+ return;
+
+ while (true) {
+ nr_entries = stack_depot_fetch(origin, &entries);
+ depth = kmsan_depth_from_eb(stack_depot_get_extra_bits(origin));
+ magic = nr_entries ? entries[0] : 0;
+ if ((nr_entries == 4) && (magic == KMSAN_ALLOCA_MAGIC_ORIGIN)) {
+ descr = (char *)entries[1];
+ pc1 = (void *)entries[2];
+ pc2 = (void *)entries[3];
+ pr_err("Local variable %s created at:\n",
+ pretty_descr(descr));
+ if (pc1)
+ pr_err(" %pSb\n", pc1);
+ if (pc2)
+ pr_err(" %pSb\n", pc2);
+ break;
+ }
+ if ((nr_entries == 3) && (magic == KMSAN_CHAIN_MAGIC_ORIGIN)) {
+ /*
+ * Origin chains deeper than KMSAN_MAX_ORIGIN_DEPTH are
+ * not stored, so the output may be incomplete.
+ */
+ if (depth == KMSAN_MAX_ORIGIN_DEPTH)
+ pr_err("<Zero or more stacks not recorded to save memory>\n\n");
+ head = entries[1];
+ origin = entries[2];
+ pr_err("Uninit was stored to memory at:\n");
+ chained_nr_entries =
+ stack_depot_fetch(head, &chained_entries);
+ kmsan_internal_unpoison_memory(
+ chained_entries,
+ chained_nr_entries * sizeof(*chained_entries),
+ /*checked*/ false);
+ skipnr = get_stack_skipnr(chained_entries,
+ chained_nr_entries);
+ stack_trace_print(chained_entries + skipnr,
+ chained_nr_entries - skipnr, 0);
+ pr_err("\n");
+ continue;
+ }
+ pr_err("Uninit was created at:\n");
+ if (nr_entries) {
+ skipnr = get_stack_skipnr(entries, nr_entries);
+ stack_trace_print(entries + skipnr, nr_entries - skipnr,
+ 0);
+ } else {
+ pr_err("(stack is not available)\n");
+ }
+ break;
+ }
+}
+
+void kmsan_report(depot_stack_handle_t origin, void *address, int size,
+ int off_first, int off_last, const void *user_addr,
+ enum kmsan_bug_reason reason)
+{
+ unsigned long stack_entries[KMSAN_STACK_DEPTH];
+ int num_stack_entries, skipnr;
+ char *bug_type = NULL;
+ unsigned long ua_flags;
+ bool is_uaf;
+
+ if (!kmsan_enabled)
+ return;
+ if (!current->kmsan_ctx.allow_reporting)
+ return;
+ if (!origin)
+ return;
+
+ current->kmsan_ctx.allow_reporting = false;
+ ua_flags = user_access_save();
+ raw_spin_lock(&kmsan_report_lock);
+ pr_err("=====================================================\n");
+ is_uaf = kmsan_uaf_from_eb(stack_depot_get_extra_bits(origin));
+ switch (reason) {
+ case REASON_ANY:
+ bug_type = is_uaf ? "use-after-free" : "uninit-value";
+ break;
+ case REASON_COPY_TO_USER:
+ bug_type = is_uaf ? "kernel-infoleak-after-free" :
+ "kernel-infoleak";
+ break;
+ case REASON_SUBMIT_URB:
+ bug_type = is_uaf ? "kernel-usb-infoleak-after-free" :
+ "kernel-usb-infoleak";
+ break;
+ }
+
+ num_stack_entries =
+ stack_trace_save(stack_entries, KMSAN_STACK_DEPTH, 1);
+ skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+
+ pr_err("BUG: KMSAN: %s in %pSb\n", bug_type,
+ (void *)stack_entries[skipnr]);
+ stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
+ 0);
+ pr_err("\n");
+
+ kmsan_print_origin(origin);
+
+ if (size) {
+ pr_err("\n");
+ if (off_first == off_last)
+ pr_err("Byte %d of %d is uninitialized\n", off_first,
+ size);
+ else
+ pr_err("Bytes %d-%d of %d are uninitialized\n",
+ off_first, off_last, size);
+ }
+ if (address)
+ pr_err("Memory access of size %d starts at %px\n", size,
+ address);
+ if (user_addr && reason == REASON_COPY_TO_USER)
+ pr_err("Data copied to user address %px\n", user_addr);
+ pr_err("\n");
+ dump_stack_print_info(KERN_ERR);
+ pr_err("=====================================================\n");
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+ raw_spin_unlock(&kmsan_report_lock);
+ if (panic_on_kmsan)
+ panic("kmsan.panic set ...\n");
+ user_access_restore(ua_flags);
+ current->kmsan_ctx.allow_reporting = true;
+}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
new file mode 100644
index 000000000000..a787c04e9583
--- /dev/null
+++ b/mm/kmsan/shadow.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KMSAN shadow implementation.
+ *
+ * Copyright (C) 2017-2022 Google LLC
+ * Author: Alexander Potapenko <glider@google.com>
+ *
+ */
+
+#include <asm/kmsan.h>
+#include <asm/tlbflush.h>
+#include <linux/cacheflush.h>
+#include <linux/memblock.h>
+#include <linux/mm_types.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/stddef.h>
+
+#include "../internal.h"
+#include "kmsan.h"
+
+#define shadow_page_for(page) ((page)->kmsan_shadow)
+
+#define origin_page_for(page) ((page)->kmsan_origin)
+
+static void *shadow_ptr_for(struct page *page)
+{
+ return page_address(shadow_page_for(page));
+}
+
+static void *origin_ptr_for(struct page *page)
+{
+ return page_address(origin_page_for(page));
+}
+
+static bool page_has_metadata(struct page *page)
+{
+ return shadow_page_for(page) && origin_page_for(page);
+}
+
+static void set_no_shadow_origin_page(struct page *page)
+{
+ shadow_page_for(page) = NULL;
+ origin_page_for(page) = NULL;
+}
+
+/*
+ * Dummy load and store pages to be used when the real metadata is unavailable.
+ * There are separate pages for loads and stores, so that every load returns a
+ * zero, and every store doesn't affect other loads.
+ */
+static char dummy_load_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+static char dummy_store_page[PAGE_SIZE] __aligned(PAGE_SIZE);
+
+static unsigned long vmalloc_meta(void *addr, bool is_origin)
+{
+ unsigned long addr64 = (unsigned long)addr, off;
+
+ KMSAN_WARN_ON(is_origin && !IS_ALIGNED(addr64, KMSAN_ORIGIN_SIZE));
+ if (kmsan_internal_is_vmalloc_addr(addr)) {
+ off = addr64 - VMALLOC_START;
+ return off + (is_origin ? KMSAN_VMALLOC_ORIGIN_START :
+ KMSAN_VMALLOC_SHADOW_START);
+ }
+ if (kmsan_internal_is_module_addr(addr)) {
+ off = addr64 - MODULES_VADDR;
+ return off + (is_origin ? KMSAN_MODULES_ORIGIN_START :
+ KMSAN_MODULES_SHADOW_START);
+ }
+ return 0;
+}
+
+static struct page *virt_to_page_or_null(void *vaddr)
+{
+ if (kmsan_virt_addr_valid(vaddr))
+ return virt_to_page(vaddr);
+ else
+ return NULL;
+}
+
+struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *address, u64 size,
+ bool store)
+{
+ struct shadow_origin_ptr ret;
+ void *shadow;
+
+ /*
+ * Even if we redirect this memory access to the dummy page, it will
+ * go out of bounds.
+ */
+ KMSAN_WARN_ON(size > PAGE_SIZE);
+
+ if (!kmsan_enabled)
+ goto return_dummy;
+
+ KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(address, size));
+ shadow = kmsan_get_metadata(address, KMSAN_META_SHADOW);
+ if (!shadow)
+ goto return_dummy;
+
+ ret.shadow = shadow;
+ ret.origin = kmsan_get_metadata(address, KMSAN_META_ORIGIN);
+ return ret;
+
+return_dummy:
+ if (store) {
+ /* Ignore this store. */
+ ret.shadow = dummy_store_page;
+ ret.origin = dummy_store_page;
+ } else {
+ /* This load will return zero. */
+ ret.shadow = dummy_load_page;
+ ret.origin = dummy_load_page;
+ }
+ return ret;
+}
+
+/*
+ * Obtain the shadow or origin pointer for the given address, or NULL if there's
+ * none. The caller must check the return value for being non-NULL if needed.
+ * The return value of this function should not depend on whether we're in the
+ * runtime or not.
+ */
+void *kmsan_get_metadata(void *address, bool is_origin)
+{
+ u64 addr = (u64)address, pad, off;
+ struct page *page;
+ void *ret;
+
+ if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) {
+ pad = addr % KMSAN_ORIGIN_SIZE;
+ addr -= pad;
+ }
+ address = (void *)addr;
+ if (kmsan_internal_is_vmalloc_addr(address) ||
+ kmsan_internal_is_module_addr(address))
+ return (void *)vmalloc_meta(address, is_origin);
+
+ ret = arch_kmsan_get_meta_or_null(address, is_origin);
+ if (ret)
+ return ret;
+
+ page = virt_to_page_or_null(address);
+ if (!page)
+ return NULL;
+ if (!page_has_metadata(page))
+ return NULL;
+ off = addr % PAGE_SIZE;
+
+ return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off;
+}
+
+void kmsan_copy_page_meta(struct page *dst, struct page *src)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ if (!dst || !page_has_metadata(dst))
+ return;
+ if (!src || !page_has_metadata(src)) {
+ kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE,
+ /*checked*/ false);
+ return;
+ }
+
+ kmsan_enter_runtime();
+ __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE);
+ __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE);
+ kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(kmsan_copy_page_meta);
+
+void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags)
+{
+ bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled;
+ struct page *shadow, *origin;
+ depot_stack_handle_t handle;
+ int pages = 1 << order;
+
+ if (!page)
+ return;
+
+ shadow = shadow_page_for(page);
+ origin = origin_page_for(page);
+
+ if (initialized) {
+ __memset(page_address(shadow), 0, PAGE_SIZE * pages);
+ __memset(page_address(origin), 0, PAGE_SIZE * pages);
+ return;
+ }
+
+ /* Zero pages allocated by the runtime should also be initialized. */
+ if (kmsan_in_runtime())
+ return;
+
+ __memset(page_address(shadow), -1, PAGE_SIZE * pages);
+ kmsan_enter_runtime();
+ handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0);
+ kmsan_leave_runtime();
+ /*
+ * Addresses are page-aligned, pages are contiguous, so it's ok
+ * to just fill the origin pages with @handle.
+ */
+ for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++)
+ ((depot_stack_handle_t *)page_address(origin))[i] = handle;
+}
+
+void kmsan_free_page(struct page *page, unsigned int order)
+{
+ if (!kmsan_enabled || kmsan_in_runtime())
+ return;
+ kmsan_enter_runtime();
+ kmsan_internal_poison_memory(page_address(page),
+ PAGE_SIZE << compound_order(page),
+ GFP_KERNEL,
+ KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
+ kmsan_leave_runtime();
+}
+
+void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end,
+ pgprot_t prot, struct page **pages,
+ unsigned int page_shift)
+{
+ unsigned long shadow_start, origin_start, shadow_end, origin_end;
+ struct page **s_pages, **o_pages;
+ int nr, mapped;
+
+ if (!kmsan_enabled)
+ return;
+
+ shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW);
+ shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW);
+ if (!shadow_start)
+ return;
+
+ nr = (end - start) / PAGE_SIZE;
+ s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL);
+ o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL);
+ if (!s_pages || !o_pages)
+ goto ret;
+ for (int i = 0; i < nr; i++) {
+ s_pages[i] = shadow_page_for(pages[i]);
+ o_pages[i] = origin_page_for(pages[i]);
+ }
+ prot = __pgprot(pgprot_val(prot) | _PAGE_NX);
+ prot = PAGE_KERNEL;
+
+ origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN);
+ origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN);
+ kmsan_enter_runtime();
+ mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot,
+ s_pages, page_shift);
+ KMSAN_WARN_ON(mapped);
+ mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot,
+ o_pages, page_shift);
+ KMSAN_WARN_ON(mapped);
+ kmsan_leave_runtime();
+ flush_tlb_kernel_range(shadow_start, shadow_end);
+ flush_tlb_kernel_range(origin_start, origin_end);
+ flush_cache_vmap(shadow_start, shadow_end);
+ flush_cache_vmap(origin_start, origin_end);
+
+ret:
+ kfree(s_pages);
+ kfree(o_pages);
+}
+
+/* Allocate metadata for pages allocated at boot time. */
+void __init kmsan_init_alloc_meta_for_range(void *start, void *end)
+{
+ struct page *shadow_p, *origin_p;
+ void *shadow, *origin;
+ struct page *page;
+ u64 size;
+
+ start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE);
+ size = ALIGN((u64)end - (u64)start, PAGE_SIZE);
+ shadow = memblock_alloc(size, PAGE_SIZE);
+ origin = memblock_alloc(size, PAGE_SIZE);
+ for (u64 addr = 0; addr < size; addr += PAGE_SIZE) {
+ page = virt_to_page_or_null((char *)start + addr);
+ shadow_p = virt_to_page_or_null((char *)shadow + addr);
+ set_no_shadow_origin_page(shadow_p);
+ shadow_page_for(page) = shadow_p;
+ origin_p = virt_to_page_or_null((char *)origin + addr);
+ set_no_shadow_origin_page(origin_p);
+ origin_page_for(page) = origin_p;
+ }
+}
+
+void kmsan_setup_meta(struct page *page, struct page *shadow,
+ struct page *origin, int order)
+{
+ for (int i = 0; i < (1 << order); i++) {
+ set_no_shadow_origin_page(&shadow[i]);
+ set_no_shadow_origin_page(&origin[i]);
+ shadow_page_for(&page[i]) = &shadow[i];
+ origin_page_for(&page[i]) = &origin[i];
+ }
+}
diff --git a/mm/ksm.c b/mm/ksm.c
index 42ab153335a2..c19fcca9bc03 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -42,6 +42,7 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#include "mm_slot.h"
#ifdef CONFIG_NUMA
#define NUMA(x) (x)
@@ -82,7 +83,7 @@
* different KSM page copy of that content
*
* Internally, the regular nodes, "dups" and "chains" are represented
- * using the same struct stable_node structure.
+ * using the same struct ksm_stable_node structure.
*
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
@@ -112,17 +113,13 @@
*/
/**
- * struct mm_slot - ksm information per mm that is being scanned
- * @link: link to the mm_slots hash list
- * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
+ * struct ksm_mm_slot - ksm information per mm that is being scanned
+ * @slot: hash lookup from mm to mm_slot
* @rmap_list: head for this mm_slot's singly-linked list of rmap_items
- * @mm: the mm that this information is valid for
*/
-struct mm_slot {
- struct hlist_node link;
- struct list_head mm_list;
- struct rmap_item *rmap_list;
- struct mm_struct *mm;
+struct ksm_mm_slot {
+ struct mm_slot slot;
+ struct ksm_rmap_item *rmap_list;
};
/**
@@ -135,14 +132,14 @@ struct mm_slot {
* There is only the one ksm_scan instance of this cursor structure.
*/
struct ksm_scan {
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
unsigned long address;
- struct rmap_item **rmap_list;
+ struct ksm_rmap_item **rmap_list;
unsigned long seqnr;
};
/**
- * struct stable_node - node of the stable rbtree
+ * struct ksm_stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
* @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
* @hlist_dup: linked into the stable_node->hlist with a stable_node chain
@@ -153,7 +150,7 @@ struct ksm_scan {
* @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
* @nid: NUMA node id of stable tree in which linked (may not match kpfn)
*/
-struct stable_node {
+struct ksm_stable_node {
union {
struct rb_node node; /* when node of stable tree */
struct { /* when listed for migration */
@@ -182,7 +179,7 @@ struct stable_node {
};
/**
- * struct rmap_item - reverse mapping item for virtual addresses
+ * struct ksm_rmap_item - reverse mapping item for virtual addresses
* @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
* @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
* @nid: NUMA node id of unstable tree in which linked (may not match page)
@@ -193,8 +190,8 @@ struct stable_node {
* @head: pointer to stable_node heading this list in the stable tree
* @hlist: link into hlist of rmap_items hanging off that stable_node
*/
-struct rmap_item {
- struct rmap_item *rmap_list;
+struct ksm_rmap_item {
+ struct ksm_rmap_item *rmap_list;
union {
struct anon_vma *anon_vma; /* when stable */
#ifdef CONFIG_NUMA
@@ -207,7 +204,7 @@ struct rmap_item {
union {
struct rb_node node; /* when node of unstable tree */
struct { /* when listed from stable tree */
- struct stable_node *head;
+ struct ksm_stable_node *head;
struct hlist_node hlist;
};
};
@@ -230,8 +227,8 @@ static LIST_HEAD(migrate_nodes);
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
-static struct mm_slot ksm_mm_head = {
- .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
+static struct ksm_mm_slot ksm_mm_head = {
+ .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
};
static struct ksm_scan ksm_scan = {
.mm_slot = &ksm_mm_head,
@@ -298,21 +295,21 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
-#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
+#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
sizeof(struct __struct), __alignof__(struct __struct),\
(__flags), NULL)
static int __init ksm_slab_init(void)
{
- rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
+ rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0);
if (!rmap_item_cache)
goto out;
- stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+ stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0);
if (!stable_node_cache)
goto out_free1;
- mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
+ mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0);
if (!mm_slot_cache)
goto out_free2;
@@ -334,18 +331,18 @@ static void __init ksm_slab_free(void)
mm_slot_cache = NULL;
}
-static __always_inline bool is_stable_node_chain(struct stable_node *chain)
+static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
{
return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
}
-static __always_inline bool is_stable_node_dup(struct stable_node *dup)
+static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
{
return dup->head == STABLE_NODE_DUP_HEAD;
}
-static inline void stable_node_chain_add_dup(struct stable_node *dup,
- struct stable_node *chain)
+static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
+ struct ksm_stable_node *chain)
{
VM_BUG_ON(is_stable_node_dup(dup));
dup->head = STABLE_NODE_DUP_HEAD;
@@ -354,14 +351,14 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup,
ksm_stable_node_dups++;
}
-static inline void __stable_node_dup_del(struct stable_node *dup)
+static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
{
VM_BUG_ON(!is_stable_node_dup(dup));
hlist_del(&dup->hlist_dup);
ksm_stable_node_dups--;
}
-static inline void stable_node_dup_del(struct stable_node *dup)
+static inline void stable_node_dup_del(struct ksm_stable_node *dup)
{
VM_BUG_ON(is_stable_node_chain(dup));
if (is_stable_node_dup(dup))
@@ -373,9 +370,9 @@ static inline void stable_node_dup_del(struct stable_node *dup)
#endif
}
-static inline struct rmap_item *alloc_rmap_item(void)
+static inline struct ksm_rmap_item *alloc_rmap_item(void)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
__GFP_NORETRY | __GFP_NOWARN);
@@ -384,14 +381,15 @@ static inline struct rmap_item *alloc_rmap_item(void)
return rmap_item;
}
-static inline void free_rmap_item(struct rmap_item *rmap_item)
+static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
{
ksm_rmap_items--;
+ rmap_item->mm->ksm_rmap_items--;
rmap_item->mm = NULL; /* debug safety */
kmem_cache_free(rmap_item_cache, rmap_item);
}
-static inline struct stable_node *alloc_stable_node(void)
+static inline struct ksm_stable_node *alloc_stable_node(void)
{
/*
* The allocation can take too long with GFP_KERNEL when memory is under
@@ -401,43 +399,13 @@ static inline struct stable_node *alloc_stable_node(void)
return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);
}
-static inline void free_stable_node(struct stable_node *stable_node)
+static inline void free_stable_node(struct ksm_stable_node *stable_node)
{
VM_BUG_ON(stable_node->rmap_hlist_len &&
!is_stable_node_chain(stable_node));
kmem_cache_free(stable_node_cache, stable_node);
}
-static inline struct mm_slot *alloc_mm_slot(void)
-{
- if (!mm_slot_cache) /* initialization failed */
- return NULL;
- return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
-}
-
-static inline void free_mm_slot(struct mm_slot *mm_slot)
-{
- kmem_cache_free(mm_slot_cache, mm_slot);
-}
-
-static struct mm_slot *get_mm_slot(struct mm_struct *mm)
-{
- struct mm_slot *slot;
-
- hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm)
- if (slot->mm == mm)
- return slot;
-
- return NULL;
-}
-
-static void insert_to_mm_slots_hash(struct mm_struct *mm,
- struct mm_slot *mm_slot)
-{
- mm_slot->mm = mm;
- hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
-}
-
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
@@ -475,7 +443,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
cond_resched();
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
- if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+ if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
@@ -528,7 +496,7 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
return vma;
}
-static void break_cow(struct rmap_item *rmap_item)
+static void break_cow(struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
@@ -547,7 +515,7 @@ static void break_cow(struct rmap_item *rmap_item)
mmap_read_unlock(mm);
}
-static struct page *get_mergeable_page(struct rmap_item *rmap_item)
+static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
@@ -560,12 +528,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
goto out;
page = follow_page(vma, addr, FOLL_GET);
- if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+ if (IS_ERR_OR_NULL(page))
goto out;
+ if (is_zone_device_page(page))
+ goto out_putpage;
if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
+out_putpage:
put_page(page);
out:
page = NULL;
@@ -585,10 +556,10 @@ static inline int get_kpfn_nid(unsigned long kpfn)
return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
}
-static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
+static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
struct rb_root *root)
{
- struct stable_node *chain = alloc_stable_node();
+ struct ksm_stable_node *chain = alloc_stable_node();
VM_BUG_ON(is_stable_node_chain(dup));
if (likely(chain)) {
INIT_HLIST_HEAD(&chain->hlist);
@@ -618,7 +589,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
return chain;
}
-static inline void free_stable_node_chain(struct stable_node *chain,
+static inline void free_stable_node_chain(struct ksm_stable_node *chain,
struct rb_root *root)
{
rb_erase(&chain->node, root);
@@ -626,9 +597,9 @@ static inline void free_stable_node_chain(struct stable_node *chain,
ksm_stable_node_chains--;
}
-static void remove_node_from_stable_tree(struct stable_node *stable_node)
+static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
/* check it's not STABLE_NODE_CHAIN or negative */
BUG_ON(stable_node->rmap_hlist_len < 0);
@@ -690,7 +661,7 @@ enum get_ksm_page_flags {
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node,
+static struct page *get_ksm_page(struct ksm_stable_node *stable_node,
enum get_ksm_page_flags flags)
{
struct page *page;
@@ -769,10 +740,10 @@ stale:
* Removing rmap_item from stable or unstable tree.
* This function will clean the information from the stable/unstable tree.
*/
-static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
{
if (rmap_item->address & STABLE_FLAG) {
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
struct page *page;
stable_node = rmap_item->head;
@@ -819,10 +790,10 @@ out:
cond_resched(); /* we're called from many long loops */
}
-static void remove_trailing_rmap_items(struct rmap_item **rmap_list)
+static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
{
while (*rmap_list) {
- struct rmap_item *rmap_item = *rmap_list;
+ struct ksm_rmap_item *rmap_item = *rmap_list;
*rmap_list = rmap_item->rmap_list;
remove_rmap_item_from_tree(rmap_item);
free_rmap_item(rmap_item);
@@ -859,18 +830,18 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}
-static inline struct stable_node *folio_stable_node(struct folio *folio)
+static inline struct ksm_stable_node *folio_stable_node(struct folio *folio)
{
return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}
-static inline struct stable_node *page_stable_node(struct page *page)
+static inline struct ksm_stable_node *page_stable_node(struct page *page)
{
return folio_stable_node(page_folio(page));
}
static inline void set_page_stable_node(struct page *page,
- struct stable_node *stable_node)
+ struct ksm_stable_node *stable_node)
{
VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page);
page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
@@ -880,7 +851,7 @@ static inline void set_page_stable_node(struct page *page,
/*
* Only called through the sysfs control interface:
*/
-static int remove_stable_node(struct stable_node *stable_node)
+static int remove_stable_node(struct ksm_stable_node *stable_node)
{
struct page *page;
int err;
@@ -918,10 +889,10 @@ static int remove_stable_node(struct stable_node *stable_node)
return err;
}
-static int remove_stable_node_chain(struct stable_node *stable_node,
+static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
struct rb_root *root)
{
- struct stable_node *dup;
+ struct ksm_stable_node *dup;
struct hlist_node *hlist_safe;
if (!is_stable_node_chain(stable_node)) {
@@ -945,14 +916,14 @@ static int remove_stable_node_chain(struct stable_node *stable_node,
static int remove_all_stable_nodes(void)
{
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
int nid;
int err = 0;
for (nid = 0; nid < ksm_nr_node_ids; nid++) {
while (root_stable_tree[nid].rb_node) {
stable_node = rb_entry(root_stable_tree[nid].rb_node,
- struct stable_node, node);
+ struct ksm_stable_node, node);
if (remove_stable_node_chain(stable_node,
root_stable_tree + nid)) {
err = -EBUSY;
@@ -971,21 +942,25 @@ static int remove_all_stable_nodes(void)
static int unmerge_and_remove_all_rmap_items(void)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int err = 0;
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(ksm_mm_head.slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
spin_unlock(&ksm_mmlist_lock);
- for (mm_slot = ksm_scan.mm_slot;
- mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
- mm = mm_slot->mm;
+ for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
+ mm_slot = ksm_scan.mm_slot) {
+ VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);
+
+ mm = mm_slot->slot.mm;
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (ksm_test_exit(mm))
break;
if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
@@ -1000,14 +975,15 @@ static int unmerge_and_remove_all_rmap_items(void)
mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (ksm_test_exit(mm)) {
- hash_del(&mm_slot->link);
- list_del(&mm_slot->mm_list);
+ hash_del(&mm_slot->slot.hash);
+ list_del(&mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else
@@ -1095,6 +1071,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
goto out_unlock;
}
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pte_at(mm, pvmw.address, pvmw.pte, entry);
goto out_unlock;
@@ -1133,7 +1110,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
struct page *kpage, pte_t orig_pte)
{
struct mm_struct *mm = vma->vm_mm;
+ struct folio *folio;
pmd_t *pmd;
+ pmd_t pmde;
pte_t *ptep;
pte_t newpte;
spinlock_t *ptl;
@@ -1148,6 +1127,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pmd = mm_find_pmd(mm, addr);
if (!pmd)
goto out;
+ /*
+ * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
+ * without holding anon_vma lock for write. So when looking for a
+ * genuine pmde (in which to find pte), test present and !THP together.
+ */
+ pmde = *pmd;
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ goto out;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
@@ -1191,10 +1179,11 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
+ folio = page_folio(page);
page_remove_rmap(page, vma, false);
- if (!page_mapped(page))
- try_to_free_swap(page);
- put_page(page);
+ if (!folio_mapped(folio))
+ folio_free_swap(folio);
+ folio_put(folio);
pte_unmap_unlock(ptep, ptl);
err = 0;
@@ -1278,7 +1267,7 @@ out:
*
* This function returns 0 if the pages were merged, -EFAULT otherwise.
*/
-static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
+static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
struct page *page, struct page *kpage)
{
struct mm_struct *mm = rmap_item->mm;
@@ -1315,9 +1304,9 @@ out:
* Note that this function upgrades page to ksm page: if one of the pages
* is already a ksm page, try_to_merge_with_ksm_page should be used.
*/
-static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
+static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
struct page *page,
- struct rmap_item *tree_rmap_item,
+ struct ksm_rmap_item *tree_rmap_item,
struct page *tree_page)
{
int err;
@@ -1337,7 +1326,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
}
static __always_inline
-bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
+bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
{
VM_BUG_ON(stable_node->rmap_hlist_len < 0);
/*
@@ -1351,17 +1340,17 @@ bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
}
static __always_inline
-bool is_page_sharing_candidate(struct stable_node *stable_node)
+bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
{
return __is_page_sharing_candidate(stable_node, 0);
}
-static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
- struct stable_node **_stable_node,
+static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
+ struct ksm_stable_node **_stable_node,
struct rb_root *root,
bool prune_stale_stable_nodes)
{
- struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
+ struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
struct hlist_node *hlist_safe;
struct page *_tree_page, *tree_page = NULL;
int nr = 0;
@@ -1475,7 +1464,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
return tree_page;
}
-static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
+static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node,
struct rb_root *root)
{
if (!is_stable_node_chain(stable_node))
@@ -1502,12 +1491,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
* function and will be overwritten in all cases, the caller doesn't
* need to initialize it.
*/
-static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
- struct stable_node **_stable_node,
+static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
+ struct ksm_stable_node **_stable_node,
struct rb_root *root,
bool prune_stale_stable_nodes)
{
- struct stable_node *stable_node = *_stable_node;
+ struct ksm_stable_node *stable_node = *_stable_node;
if (!is_stable_node_chain(stable_node)) {
if (is_page_sharing_candidate(stable_node)) {
*_stable_node_dup = stable_node;
@@ -1524,18 +1513,18 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup,
prune_stale_stable_nodes);
}
-static __always_inline struct page *chain_prune(struct stable_node **s_n_d,
- struct stable_node **s_n,
+static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d,
+ struct ksm_stable_node **s_n,
struct rb_root *root)
{
return __stable_node_chain(s_n_d, s_n, root, true);
}
-static __always_inline struct page *chain(struct stable_node **s_n_d,
- struct stable_node *s_n,
+static __always_inline struct page *chain(struct ksm_stable_node **s_n_d,
+ struct ksm_stable_node *s_n,
struct rb_root *root)
{
- struct stable_node *old_stable_node = s_n;
+ struct ksm_stable_node *old_stable_node = s_n;
struct page *tree_page;
tree_page = __stable_node_chain(s_n_d, &s_n, root, false);
@@ -1559,8 +1548,8 @@ static struct page *stable_tree_search(struct page *page)
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
- struct stable_node *page_node;
+ struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ struct ksm_stable_node *page_node;
page_node = page_stable_node(page);
if (page_node && page_node->head != &migrate_nodes) {
@@ -1580,7 +1569,7 @@ again:
int ret;
cond_resched();
- stable_node = rb_entry(*new, struct stable_node, node);
+ stable_node = rb_entry(*new, struct ksm_stable_node, node);
stable_node_any = NULL;
tree_page = chain_prune(&stable_node_dup, &stable_node, root);
/*
@@ -1803,14 +1792,14 @@ chain_append:
* This function returns the stable tree node just allocated on success,
* NULL otherwise.
*/
-static struct stable_node *stable_tree_insert(struct page *kpage)
+static struct ksm_stable_node *stable_tree_insert(struct page *kpage)
{
int nid;
unsigned long kpfn;
struct rb_root *root;
struct rb_node **new;
struct rb_node *parent;
- struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+ struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any;
bool need_chain = false;
kpfn = page_to_pfn(kpage);
@@ -1825,7 +1814,7 @@ again:
int ret;
cond_resched();
- stable_node = rb_entry(*new, struct stable_node, node);
+ stable_node = rb_entry(*new, struct ksm_stable_node, node);
stable_node_any = NULL;
tree_page = chain(&stable_node_dup, stable_node, root);
if (!stable_node_dup) {
@@ -1894,7 +1883,7 @@ again:
rb_insert_color(&stable_node_dup->node, root);
} else {
if (!is_stable_node_chain(stable_node)) {
- struct stable_node *orig = stable_node;
+ struct ksm_stable_node *orig = stable_node;
/* chain is missing so create it */
stable_node = alloc_stable_node_chain(orig, root);
if (!stable_node) {
@@ -1923,7 +1912,7 @@ again:
* the same walking algorithm in an rbtree.
*/
static
-struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
+struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
struct page *page,
struct page **tree_pagep)
{
@@ -1937,12 +1926,12 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
new = &root->rb_node;
while (*new) {
- struct rmap_item *tree_rmap_item;
+ struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page;
int ret;
cond_resched();
- tree_rmap_item = rb_entry(*new, struct rmap_item, node);
+ tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node);
tree_page = get_mergeable_page(tree_rmap_item);
if (!tree_page)
return NULL;
@@ -1994,8 +1983,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
* rmap_items hanging off a given node of the stable tree, all sharing
* the same ksm page.
*/
-static void stable_tree_append(struct rmap_item *rmap_item,
- struct stable_node *stable_node,
+static void stable_tree_append(struct ksm_rmap_item *rmap_item,
+ struct ksm_stable_node *stable_node,
bool max_page_sharing_bypass)
{
/*
@@ -2037,12 +2026,12 @@ static void stable_tree_append(struct rmap_item *rmap_item,
* @page: the page that we are searching identical page to.
* @rmap_item: the reverse mapping into the virtual address of this page
*/
-static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
+static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
- struct rmap_item *tree_rmap_item;
+ struct ksm_rmap_item *tree_rmap_item;
struct page *tree_page = NULL;
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
struct page *kpage;
unsigned int checksum;
int err;
@@ -2198,11 +2187,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
}
}
-static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
- struct rmap_item **rmap_list,
+static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
+ struct ksm_rmap_item **rmap_list,
unsigned long addr)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
while (*rmap_list) {
rmap_item = *rmap_list;
@@ -2218,7 +2207,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
rmap_item = alloc_rmap_item();
if (rmap_item) {
/* It has already been zeroed */
- rmap_item->mm = mm_slot->mm;
+ rmap_item->mm = mm_slot->slot.mm;
+ rmap_item->mm->ksm_rmap_items++;
rmap_item->address = addr;
rmap_item->rmap_list = *rmap_list;
*rmap_list = rmap_item;
@@ -2226,19 +2216,21 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
return rmap_item;
}
-static struct rmap_item *scan_get_next_rmap_item(struct page **page)
+static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
+ struct ksm_mm_slot *mm_slot;
struct mm_slot *slot;
struct vm_area_struct *vma;
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
+ struct vma_iterator vmi;
int nid;
- if (list_empty(&ksm_mm_head.mm_list))
+ if (list_empty(&ksm_mm_head.slot.mm_node))
return NULL;
- slot = ksm_scan.mm_slot;
- if (slot == &ksm_mm_head) {
+ mm_slot = ksm_scan.mm_slot;
+ if (mm_slot == &ksm_mm_head) {
/*
* A number of pages can hang around indefinitely on per-cpu
* pagevecs, raised page count preventing write_protect_page
@@ -2258,7 +2250,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
* so prune them once before each full scan.
*/
if (!ksm_merge_across_nodes) {
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
struct page *page;
list_for_each_entry_safe(stable_node, next,
@@ -2275,28 +2267,31 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
root_unstable_tree[nid] = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
- slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
- ksm_scan.mm_slot = slot;
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
+ ksm_scan.mm_slot = mm_slot;
spin_unlock(&ksm_mmlist_lock);
/*
* Although we tested list_empty() above, a racing __ksm_exit
* of the last mm on the list may have removed it since then.
*/
- if (slot == &ksm_mm_head)
+ if (mm_slot == &ksm_mm_head)
return NULL;
next_mm:
ksm_scan.address = 0;
- ksm_scan.rmap_list = &slot->rmap_list;
+ ksm_scan.rmap_list = &mm_slot->rmap_list;
}
+ slot = &mm_slot->slot;
mm = slot->mm;
+ vma_iter_init(&vmi, mm, ksm_scan.address);
+
mmap_read_lock(mm);
if (ksm_test_exit(mm))
- vma = NULL;
- else
- vma = find_vma(mm, ksm_scan.address);
+ goto no_vmas;
- for (; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (!(vma->vm_flags & VM_MERGEABLE))
continue;
if (ksm_scan.address < vma->vm_start)
@@ -2308,15 +2303,17 @@ next_mm:
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
+ if (IS_ERR_OR_NULL(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
continue;
}
+ if (is_zone_device_page(*page))
+ goto next_page;
if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
- rmap_item = get_next_rmap_item(slot,
+ rmap_item = get_next_rmap_item(mm_slot,
ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
ksm_scan.rmap_list =
@@ -2327,6 +2324,7 @@ next_mm:
mmap_read_unlock(mm);
return rmap_item;
}
+next_page:
put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
@@ -2334,8 +2332,9 @@ next_mm:
}
if (ksm_test_exit(mm)) {
+no_vmas:
ksm_scan.address = 0;
- ksm_scan.rmap_list = &slot->rmap_list;
+ ksm_scan.rmap_list = &mm_slot->rmap_list;
}
/*
* Nuke all the rmap_items that are above this current rmap:
@@ -2344,8 +2343,9 @@ next_mm:
remove_trailing_rmap_items(ksm_scan.rmap_list);
spin_lock(&ksm_mmlist_lock);
- ksm_scan.mm_slot = list_entry(slot->mm_list.next,
- struct mm_slot, mm_list);
+ slot = list_entry(mm_slot->slot.mm_node.next,
+ struct mm_slot, mm_node);
+ ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (ksm_scan.address == 0) {
/*
* We've completed a full scan of all vmas, holding mmap_lock
@@ -2356,11 +2356,11 @@ next_mm:
* or when all VM_MERGEABLE areas have been unmapped (and
* mmap_lock then protects against race with MADV_MERGEABLE).
*/
- hash_del(&slot->link);
- list_del(&slot->mm_list);
+ hash_del(&mm_slot->slot.hash);
+ list_del(&mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- free_mm_slot(slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmap_read_unlock(mm);
mmdrop(mm);
@@ -2377,8 +2377,8 @@ next_mm:
}
/* Repeat until we've completed scanning the whole list */
- slot = ksm_scan.mm_slot;
- if (slot != &ksm_mm_head)
+ mm_slot = ksm_scan.mm_slot;
+ if (mm_slot != &ksm_mm_head)
goto next_mm;
ksm_scan.seqnr++;
@@ -2391,7 +2391,7 @@ next_mm:
*/
static void ksm_do_scan(unsigned int scan_npages)
{
- struct rmap_item *rmap_item;
+ struct ksm_rmap_item *rmap_item;
struct page *page;
while (scan_npages-- && likely(!freezing(current))) {
@@ -2406,7 +2406,7 @@ static void ksm_do_scan(unsigned int scan_npages)
static int ksmd_should_run(void)
{
- return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
+ return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node);
}
static int ksm_scan_thread(void *nothing)
@@ -2495,18 +2495,21 @@ EXPORT_SYMBOL_GPL(ksm_madvise);
int __ksm_enter(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
int needs_wakeup;
- mm_slot = alloc_mm_slot();
+ mm_slot = mm_slot_alloc(mm_slot_cache);
if (!mm_slot)
return -ENOMEM;
+ slot = &mm_slot->slot;
+
/* Check ksm_run too? Would need tighter locking */
- needs_wakeup = list_empty(&ksm_mm_head.mm_list);
+ needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);
spin_lock(&ksm_mmlist_lock);
- insert_to_mm_slots_hash(mm, mm_slot);
+ mm_slot_insert(mm_slots_hash, mm, slot);
/*
* When KSM_RUN_MERGE (or KSM_RUN_STOP),
* insert just behind the scanning cursor, to let the area settle
@@ -2518,9 +2521,9 @@ int __ksm_enter(struct mm_struct *mm)
* missed: then we might as well insert at the end of the list.
*/
if (ksm_run & KSM_RUN_UNMERGE)
- list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list);
+ list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
else
- list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
+ list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
set_bit(MMF_VM_MERGEABLE, &mm->flags);
@@ -2534,7 +2537,8 @@ int __ksm_enter(struct mm_struct *mm)
void __ksm_exit(struct mm_struct *mm)
{
- struct mm_slot *mm_slot;
+ struct ksm_mm_slot *mm_slot;
+ struct mm_slot *slot;
int easy_to_free = 0;
/*
@@ -2547,21 +2551,22 @@ void __ksm_exit(struct mm_struct *mm)
*/
spin_lock(&ksm_mmlist_lock);
- mm_slot = get_mm_slot(mm);
+ slot = mm_slot_lookup(mm_slots_hash, mm);
+ mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
if (mm_slot && ksm_scan.mm_slot != mm_slot) {
if (!mm_slot->rmap_list) {
- hash_del(&mm_slot->link);
- list_del(&mm_slot->mm_list);
+ hash_del(&slot->hash);
+ list_del(&slot->mm_node);
easy_to_free = 1;
} else {
- list_move(&mm_slot->mm_list,
- &ksm_scan.mm_slot->mm_list);
+ list_move(&slot->mm_node,
+ &ksm_scan.mm_slot->slot.mm_node);
}
}
spin_unlock(&ksm_mmlist_lock);
if (easy_to_free) {
- free_mm_slot(mm_slot);
+ mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else if (mm_slot) {
@@ -2612,8 +2617,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
- struct stable_node *stable_node;
- struct rmap_item *rmap_item;
+ struct ksm_stable_node *stable_node;
+ struct ksm_rmap_item *rmap_item;
int search_new_forks = 0;
VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);
@@ -2683,7 +2688,7 @@ again:
#ifdef CONFIG_MIGRATION
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
- struct stable_node *stable_node;
+ struct ksm_stable_node *stable_node;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
@@ -2716,7 +2721,7 @@ static void wait_while_offlining(void)
}
}
-static bool stable_node_dup_remove_range(struct stable_node *stable_node,
+static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
unsigned long start_pfn,
unsigned long end_pfn)
{
@@ -2732,12 +2737,12 @@ static bool stable_node_dup_remove_range(struct stable_node *stable_node,
return false;
}
-static bool stable_node_chain_remove_range(struct stable_node *stable_node,
+static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
unsigned long start_pfn,
unsigned long end_pfn,
struct rb_root *root)
{
- struct stable_node *dup;
+ struct ksm_stable_node *dup;
struct hlist_node *hlist_safe;
if (!is_stable_node_chain(stable_node)) {
@@ -2761,14 +2766,14 @@ static bool stable_node_chain_remove_range(struct stable_node *stable_node,
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
- struct stable_node *stable_node, *next;
+ struct ksm_stable_node *stable_node, *next;
struct rb_node *node;
int nid;
for (nid = 0; nid < ksm_nr_node_ids; nid++) {
node = rb_first(root_stable_tree + nid);
while (node) {
- stable_node = rb_entry(node, struct stable_node, node);
+ stable_node = rb_entry(node, struct ksm_stable_node, node);
if (stable_node_chain_remove_range(stable_node,
start_pfn, end_pfn,
root_stable_tree +
diff --git a/mm/madvise.c b/mm/madvise.c
index 5f0f0948a50e..c7105ec6d08c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_FREE:
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
+ case MADV_COLLAPSE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -451,8 +452,11 @@ regular_page:
continue;
}
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /*
+ * Do not interfere with other mappings of this page and
+ * non-LRU page.
+ */
+ if (!PageLRU(page) || page_mapcount(page) != 1)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -597,6 +601,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *orig_pte, *pte, ptent;
+ struct folio *folio;
struct page *page;
int nr_swap = 0;
unsigned long next;
@@ -641,56 +646,56 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
page = vm_normal_page(vma, addr, ptent);
if (!page || is_zone_device_page(page))
continue;
+ folio = page_folio(page);
/*
- * If pmd isn't transhuge but the page is THP and
+ * If pmd isn't transhuge but the folio is large and
* is owned by only this process, split it and
* deactivate all pages.
*/
- if (PageTransCompound(page)) {
- if (page_mapcount(page) != 1)
+ if (folio_test_large(folio)) {
+ if (folio_mapcount(folio) != 1)
goto out;
- get_page(page);
- if (!trylock_page(page)) {
- put_page(page);
+ folio_get(folio);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
goto out;
}
pte_unmap_unlock(orig_pte, ptl);
- if (split_huge_page(page)) {
- unlock_page(page);
- put_page(page);
+ if (split_folio(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
goto out;
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte--;
addr -= PAGE_SIZE;
continue;
}
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
-
- if (PageSwapCache(page) || PageDirty(page)) {
- if (!trylock_page(page))
+ if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
+ if (!folio_trylock(folio))
continue;
/*
- * If page is shared with others, we couldn't clear
- * PG_dirty of the page.
+ * If folio is shared with others, we mustn't clear
+ * the folio's dirty flag.
*/
- if (page_mapcount(page) != 1) {
- unlock_page(page);
+ if (folio_mapcount(folio) != 1) {
+ folio_unlock(folio);
continue;
}
- if (PageSwapCache(page) && !try_to_free_swap(page)) {
- unlock_page(page);
+ if (folio_test_swapcache(folio) &&
+ !folio_free_swap(folio)) {
+ folio_unlock(folio);
continue;
}
- ClearPageDirty(page);
- unlock_page(page);
+ folio_clear_dirty(folio);
+ folio_unlock(folio);
}
if (pte_young(ptent) || pte_dirty(ptent)) {
@@ -708,7 +713,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(mm, addr, pte, ptent);
tlb_remove_tlb_entry(tlb, pte, addr);
}
- mark_page_lazyfree(page);
+ mark_page_lazyfree(&folio->page);
}
out:
if (nr_swap) {
@@ -808,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
if (start & ~huge_page_mask(hstate_vma(vma)))
return false;
- *end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+ /*
+ * Madvise callers expect the length to be rounded up to PAGE_SIZE
+ * boundaries, and may be unaware that this VMA uses huge pages.
+ * Avoid unexpected data loss by rounding down the number of
+ * huge pages freed.
+ */
+ *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+
return true;
}
@@ -823,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
return -EINVAL;
+ if (start == end)
+ return 0;
+
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
@@ -1057,6 +1072,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
if (error)
goto out;
break;
+ case MADV_COLLAPSE:
+ return madvise_collapse(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1150,6 +1167,7 @@ madvise_behavior_valid(int behavior)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
+ case MADV_COLLAPSE:
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
@@ -1166,13 +1184,13 @@ madvise_behavior_valid(int behavior)
}
}
-static bool
-process_madvise_behavior_valid(int behavior)
+static bool process_madvise_behavior_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_WILLNEED:
+ case MADV_COLLAPSE:
return true;
default:
return false;
@@ -1238,7 +1256,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
if (start >= end)
break;
if (prev)
- vma = prev->vm_next;
+ vma = find_vma(mm, prev->vm_end);
else /* madvise_remove dropped mmap_lock */
vma = find_vma(mm, start);
}
@@ -1339,6 +1357,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
* transparent huge pages so the existing pages will not be
* coalesced into THP and new pages will not be allocated as THP.
+ * MADV_COLLAPSE - synchronously coalesce pages into new THP.
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
diff --git a/mm/memblock.c b/mm/memblock.c
index b5d3026979fc..511d4783dcf1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2000,7 +2000,7 @@ static void __init free_unused_memmap(void)
* presume that there are no holes in the memory map inside
* a pageblock
*/
- start = round_down(start, pageblock_nr_pages);
+ start = pageblock_start_pfn(start);
/*
* If we had a previous bank, and there is a space
@@ -2014,12 +2014,12 @@ static void __init free_unused_memmap(void)
* presume that there are no holes in the memory map inside
* a pageblock
*/
- prev_end = ALIGN(end, pageblock_nr_pages);
+ prev_end = pageblock_align(end);
}
#ifdef CONFIG_SPARSEMEM
if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) {
- prev_end = ALIGN(end, pageblock_nr_pages);
+ prev_end = pageblock_align(end);
free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
}
#endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..2d8549ae1b30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -88,13 +88,6 @@ static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem __ro_after_init;
-/* Whether the swap controller is active */
-#ifdef CONFIG_MEMCG_SWAP
-static bool cgroup_memory_noswap __ro_after_init;
-#else
-#define cgroup_memory_noswap 1
-#endif
-
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -102,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}
#define THRESHOLDS_EVENTS_TARGET 128
@@ -597,25 +590,18 @@ static u64 flush_next_time;
*/
static void memcg_stats_lock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_disable();
-#else
- VM_BUG_ON(!irqs_disabled());
-#endif
+ preempt_disable_nested();
+ VM_WARN_ON_IRQS_ENABLED();
}
static void __memcg_stats_lock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_disable();
-#endif
+ preempt_disable_nested();
}
static void memcg_stats_unlock(void)
{
-#ifdef CONFIG_PREEMPT_RT
- preempt_enable();
-#endif
+ preempt_enable_nested();
}
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
@@ -669,6 +655,81 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
}
+/* Subset of vm_event_item to report for memcg event stats */
+static const unsigned int memcg_vm_event_stat[] = {
+ PGPGIN,
+ PGPGOUT,
+ PGSCAN_KSWAPD,
+ PGSCAN_DIRECT,
+ PGSTEAL_KSWAPD,
+ PGSTEAL_DIRECT,
+ PGFAULT,
+ PGMAJFAULT,
+ PGREFILL,
+ PGACTIVATE,
+ PGDEACTIVATE,
+ PGLAZYFREE,
+ PGLAZYFREED,
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ ZSWPIN,
+ ZSWPOUT,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ THP_FAULT_ALLOC,
+ THP_COLLAPSE_ALLOC,
+#endif
+};
+
+#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
+static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
+
+static void init_memcg_events(void)
+{
+ int i;
+
+ for (i = 0; i < NR_MEMCG_EVENTS; ++i)
+ mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
+}
+
+static inline int memcg_events_index(enum vm_event_item idx)
+{
+ return mem_cgroup_events_index[idx] - 1;
+}
+
+struct memcg_vmstats_percpu {
+ /* Local (CPU and cgroup) page state & events */
+ long state[MEMCG_NR_STAT];
+ unsigned long events[NR_MEMCG_EVENTS];
+
+ /* Delta calculation for lockless upward propagation */
+ long state_prev[MEMCG_NR_STAT];
+ unsigned long events_prev[NR_MEMCG_EVENTS];
+
+ /* Cgroup1: threshold notifications & softlimit tree updates */
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+struct memcg_vmstats {
+ /* Aggregated (CPU and subtree) page state & events */
+ long state[MEMCG_NR_STAT];
+ unsigned long events[NR_MEMCG_EVENTS];
+
+ /* Pending child counts during tree propagation */
+ long state_pending[MEMCG_NR_STAT];
+ unsigned long events_pending[NR_MEMCG_EVENTS];
+};
+
+unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = READ_ONCE(memcg->vmstats->state[idx]);
+#ifdef CONFIG_SMP
+ if (x < 0)
+ x = 0;
+#endif
+ return x;
+}
+
/**
* __mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
@@ -715,7 +776,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
* interrupt context while other caller need to have disabled interrupt.
*/
__memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
switch (idx) {
case NR_ANON_MAPPED:
case NR_FILE_MAPPED:
@@ -725,7 +786,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
WARN_ON_ONCE(!in_task());
break;
default:
- WARN_ON_ONCE(!irqs_disabled());
+ VM_WARN_ON_IRQS_ENABLED();
}
}
@@ -816,27 +877,37 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
unsigned long count)
{
- if (mem_cgroup_disabled())
+ int index = memcg_events_index(idx);
+
+ if (mem_cgroup_disabled() || index < 0)
return;
memcg_stats_lock();
- __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+ __this_cpu_add(memcg->vmstats_percpu->events[index], count);
memcg_rstat_updated(memcg, count);
memcg_stats_unlock();
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
- return READ_ONCE(memcg->vmstats.events[event]);
+ int index = memcg_events_index(event);
+
+ if (index < 0)
+ return 0;
+ return READ_ONCE(memcg->vmstats->events[index]);
}
static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
long x = 0;
int cpu;
+ int index = memcg_events_index(event);
+
+ if (index < 0)
+ return 0;
for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
+ x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
return x;
}
@@ -1143,7 +1214,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
} while ((memcg = parent_mem_cgroup(memcg)));
/*
- * When cgruop1 non-hierarchy mode is used,
+ * When cgroup1 non-hierarchy mode is used,
* parent_mem_cgroup() does not walk all the way up to the
* cgroup root (root_mem_cgroup). So we have to handle
* dead_memcg from cgroup root separately.
@@ -1401,6 +1472,7 @@ static const struct memory_stat memory_stats[] = {
{ "kernel", MEMCG_KMEM },
{ "kernel_stack", NR_KERNEL_STACK_KB },
{ "pagetables", NR_PAGETABLE },
+ { "sec_pagetables", NR_SECONDARY_PAGETABLE },
{ "percpu", MEMCG_PERCPU_B },
{ "sock", MEMCG_SOCK },
{ "vmalloc", MEMCG_VMALLOC },
@@ -1467,29 +1539,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-/* Subset of vm_event_item to report for memcg event stats */
-static const unsigned int memcg_vm_event_stat[] = {
- PGSCAN_KSWAPD,
- PGSCAN_DIRECT,
- PGSTEAL_KSWAPD,
- PGSTEAL_DIRECT,
- PGFAULT,
- PGMAJFAULT,
- PGREFILL,
- PGACTIVATE,
- PGDEACTIVATE,
- PGLAZYFREE,
- PGLAZYFREED,
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
- ZSWPIN,
- ZSWPOUT,
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- THP_FAULT_ALLOC,
- THP_COLLAPSE_ALLOC,
-#endif
-};
-
static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
{
struct seq_buf s;
@@ -1530,10 +1579,15 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT));
- for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++)
+ for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
+ if (memcg_vm_event_stat[i] == PGPGIN ||
+ memcg_vm_event_stat[i] == PGPGOUT)
+ continue;
+
seq_buf_printf(&s, "%s %lu\n",
vm_event_name(memcg_vm_event_stat[i]),
memcg_events(memcg, memcg_vm_event_stat[i]));
+ }
/* The above should easily fit into one page */
WARN_ON_ONCE(seq_buf_has_overflowed(&s));
@@ -1607,17 +1661,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
unsigned long max = READ_ONCE(memcg->memory.max);
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
- if (mem_cgroup_swappiness(memcg))
- max += min(READ_ONCE(memcg->swap.max),
- (unsigned long)total_swap_pages);
- } else { /* v1 */
+ if (do_memsw_account()) {
if (mem_cgroup_swappiness(memcg)) {
/* Calculate swap excess capacity from memsw limit */
unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
max += min(swap, (unsigned long)total_swap_pages);
}
+ } else {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
}
return max;
}
@@ -2789,6 +2843,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
folio->memcg_data = (unsigned long)memcg;
}
@@ -3362,7 +3417,7 @@ void split_page_memcg(struct page *head, unsigned int nr)
css_get_many(&memcg->css, nr - 1);
}
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_SWAP
/**
* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
* @entry: swap entry to be moved
@@ -3975,6 +4030,8 @@ static const unsigned int memcg1_stats[] = {
NR_FILE_MAPPED,
NR_FILE_DIRTY,
NR_WRITEBACK,
+ WORKINGSET_REFAULT_ANON,
+ WORKINGSET_REFAULT_FILE,
MEMCG_SWAP,
};
@@ -3988,6 +4045,8 @@ static const char *const memcg1_stat_names[] = {
"mapped_file",
"dirty",
"writeback",
+ "workingset_refault_anon",
+ "workingset_refault_file",
"swap",
};
@@ -4016,7 +4075,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+ nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -4047,7 +4107,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)nr * PAGE_SIZE);
+ (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -5110,8 +5170,8 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
struct mem_cgroup *memcg;
cgrp = cgroup_get_from_id(ino);
- if (!cgrp)
- return ERR_PTR(-ENOENT);
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
if (css)
@@ -5164,12 +5224,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
+ kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
}
@@ -5192,6 +5254,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
}
+ memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL);
+ if (!memcg->vmstats)
+ goto fail;
+
memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_percpu)
@@ -5228,6 +5294,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
@@ -5262,6 +5329,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
+ init_memcg_events();
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->kmem, NULL);
@@ -5410,9 +5478,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
* below us. We're in a per-cpu loop here and this is
* a global counter, so the first cycle will get them.
*/
- delta = memcg->vmstats.state_pending[i];
+ delta = memcg->vmstats->state_pending[i];
if (delta)
- memcg->vmstats.state_pending[i] = 0;
+ memcg->vmstats->state_pending[i] = 0;
/* Add CPU changes on this level since the last flush */
v = READ_ONCE(statc->state[i]);
@@ -5425,15 +5493,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
continue;
/* Aggregate counts on this level and propagate upwards */
- memcg->vmstats.state[i] += delta;
+ memcg->vmstats->state[i] += delta;
if (parent)
- parent->vmstats.state_pending[i] += delta;
+ parent->vmstats->state_pending[i] += delta;
}
- for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
- delta = memcg->vmstats.events_pending[i];
+ for (i = 0; i < NR_MEMCG_EVENTS; i++) {
+ delta = memcg->vmstats->events_pending[i];
if (delta)
- memcg->vmstats.events_pending[i] = 0;
+ memcg->vmstats->events_pending[i] = 0;
v = READ_ONCE(statc->events[i]);
if (v != statc->events_prev[i]) {
@@ -5444,9 +5512,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (!delta)
continue;
- memcg->vmstats.events[i] += delta;
+ memcg->vmstats->events[i] += delta;
if (parent)
- parent->vmstats.events_pending[i] += delta;
+ parent->vmstats->events_pending[i] += delta;
}
for_each_node_state(nid, N_MEMORY) {
@@ -5561,7 +5629,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return NULL;
/*
- * Because lookup_swap_cache() updates some statistics counter,
+ * Because swap_cache_get_folio() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), swp_offset(ent));
@@ -5871,7 +5939,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
mmap_read_lock(mm);
- walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
+ walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL);
mmap_read_unlock(mm);
precharge = mc.precharge;
@@ -6169,9 +6237,7 @@ retry:
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
- NULL);
-
+ walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL);
mmap_read_unlock(mc.mm);
atomic_dec(&mc.from->moving_account);
}
@@ -6196,6 +6262,30 @@ static void mem_cgroup_move_task(void)
}
#endif
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+ /* find the first leader if there is any */
+ cgroup_taskset_for_each_leader(task, css, tset)
+ break;
+
+ if (!task)
+ return;
+
+ task_lock(task);
+ if (task->mm && READ_ONCE(task->mm->owner) == task)
+ lru_gen_migrate_mm(task->mm);
+ task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
if (value == PAGE_COUNTER_MAX)
@@ -6601,6 +6691,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+ .attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
.dfl_cftypes = memory_files,
@@ -6813,21 +6904,20 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
}
/**
- * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
- * @page: page to charge
+ * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
+ * @folio: folio to charge.
* @mm: mm context of the victim
* @gfp: reclaim mode
- * @entry: swap entry for which the page is allocated
+ * @entry: swap entry for which the folio is allocated
*
- * This function charges a page allocated for swapin. Please call this before
- * adding the page to the swapcache.
+ * This function charges a folio allocated for swapin. Please call this before
+ * adding the folio to the swapcache.
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry)
{
- struct folio *folio = page_folio(page);
struct mem_cgroup *memcg;
unsigned short id;
int ret;
@@ -7200,7 +7290,7 @@ static int __init mem_cgroup_init(void)
}
subsys_initcall(mem_cgroup_init);
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
while (!refcount_inc_not_zero(&memcg->id.ref)) {
@@ -7238,7 +7328,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
if (mem_cgroup_disabled())
return;
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (!do_memsw_account())
return;
memcg = folio_memcg(folio);
@@ -7267,7 +7357,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
- if (!cgroup_memory_noswap && memcg != swap_memcg) {
+ if (memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
page_counter_charge(&swap_memcg->memsw, nr_entries);
page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -7303,7 +7393,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
struct mem_cgroup *memcg;
unsigned short oldid;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (do_memsw_account())
return 0;
memcg = folio_memcg(folio);
@@ -7319,7 +7409,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
- if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
+ if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -7347,15 +7437,18 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
+ if (mem_cgroup_disabled())
+ return;
+
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->swap, nr_pages);
- else
+ if (!mem_cgroup_is_root(memcg)) {
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
+ else
+ page_counter_uncharge(&memcg->swap, nr_pages);
}
mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
mem_cgroup_id_put_many(memcg, nr_pages);
@@ -7367,7 +7460,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
long nr_swap_pages = get_nr_swap_pages();
- if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (mem_cgroup_disabled() || do_memsw_account())
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
@@ -7376,18 +7469,18 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
return nr_swap_pages;
}
-bool mem_cgroup_swap_full(struct page *page)
+bool mem_cgroup_swap_full(struct folio *folio)
{
struct mem_cgroup *memcg;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (vm_swap_full())
return true;
- if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (do_memsw_account())
return false;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
if (!memcg)
return false;
@@ -7404,10 +7497,9 @@ bool mem_cgroup_swap_full(struct page *page)
static int __init setup_swap_account(char *s)
{
- if (!strcmp(s, "1"))
- cgroup_memory_noswap = false;
- else if (!strcmp(s, "0"))
- cgroup_memory_noswap = true;
+ pr_warn_once("The swapaccount= commandline option is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
return 1;
}
__setup("swapaccount=", setup_swap_account);
@@ -7676,20 +7768,9 @@ static struct cftype zswap_files[] = {
};
#endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
-/*
- * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
- * instead of a core_initcall(), this could mean cgroup_memory_noswap still
- * remains set to false even when memcg is disabled via "cgroup_disable=memory"
- * boot parameter. This may result in premature OOPS inside
- * mem_cgroup_get_nr_swap_pages() function in corner cases.
- */
static int __init mem_cgroup_swap_init(void)
{
- /* No memory control -> no swap control */
if (mem_cgroup_disabled())
- cgroup_memory_noswap = true;
-
- if (cgroup_memory_noswap)
return 0;
WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
@@ -7699,6 +7780,6 @@ static int __init mem_cgroup_swap_init(void)
#endif
return 0;
}
-core_initcall(mem_cgroup_swap_init);
+subsys_initcall(mem_cgroup_swap_init);
-#endif /* CONFIG_MEMCG_SWAP */
+#endif /* CONFIG_SWAP */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 14439806b5ef..bead6bccc7f2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -277,7 +277,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
* to SIG_IGN, but hopefully no one will do that?
*/
ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
- addr_lsb, t); /* synchronous? */
+ addr_lsb, t);
if (ret < 0)
pr_info("Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
@@ -345,13 +345,17 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* not much we can do. We just print a message and ignore otherwise.
*/
+#define FSDAX_INVALID_PGOFF ULONG_MAX
+
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*
- * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
- * In other cases, such as anonymous and file-backend page, the address to be
- * killed can be caculated by @p itself.
+ * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
+ * filesystem with a memory failure handler has claimed the
+ * memory_failure event. In all other cases, page->index and
+ * page->mapping are sufficient for mapping the page back to its
+ * corresponding user virtual address.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
@@ -367,11 +371,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
tk->addr = page_address_in_vma(p, vma);
if (is_zone_device_page(p)) {
- /*
- * Since page->mapping is not used for fsdax, we need
- * calculate the address based on the vma.
- */
- if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+ if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
} else
@@ -413,7 +413,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
{
struct to_kill *tk, *next;
- list_for_each_entry_safe (tk, next, to_kill, nd) {
+ list_for_each_entry_safe(tk, next, to_kill, nd) {
if (forcekill) {
/*
* In case something went wrong with munmapping
@@ -437,6 +437,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
+ list_del(&tk->nd);
put_task_struct(tk->tsk);
kfree(tk);
}
@@ -520,14 +521,15 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
anon_vma_interval_tree_foreach(vmac, &av->rb_root,
pgoff, pgoff) {
vma = vmac->vma;
+ if (vma->vm_mm != t->mm)
+ continue;
if (!page_mapped_in_vma(page, vma))
continue;
- if (vma->vm_mm == t->mm)
- add_to_kill(t, page, 0, vma, to_kill);
+ add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
- page_unlock_anon_vma_read(av);
+ anon_vma_unlock_read(av);
}
/*
@@ -559,7 +561,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, 0, vma, to_kill);
+ add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+ to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -632,7 +635,7 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
swp_entry_t swp = pte_to_swp_entry(pte);
if (is_hwpoison_entry(swp))
- pfn = hwpoison_entry_to_pfn(swp);
+ pfn = swp_offset_pfn(swp);
}
if (!pfn || pfn != poisoned_pfn)
@@ -743,6 +746,9 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
};
priv.tk.tsk = p;
+ if (!p->mm)
+ return -EFAULT;
+
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
(void *)&priv);
@@ -1074,6 +1080,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
+ bool extra_pins = false;
if (!PageHuge(hpage))
return MF_DELAYED;
@@ -1081,6 +1088,8 @@ static int me_huge_page(struct page_state *ps, struct page *p)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+ /* The page is kept in page cache. */
+ extra_pins = true;
unlock_page(hpage);
} else {
unlock_page(hpage);
@@ -1098,7 +1107,7 @@ static int me_huge_page(struct page_state *ps, struct page *p)
}
}
- if (has_extra_refcount(ps, p, false))
+ if (has_extra_refcount(ps, p, extra_pins))
res = MF_FAILED;
return res;
@@ -1243,9 +1252,9 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
return ret;
/*
- * This check prevents from calling get_hwpoison_unless_zero()
- * for any unsupported type of page in order to reduce the risk of
- * unexpected races caused by taking a page refcount.
+ * This check prevents from calling get_page_unless_zero() for any
+ * unsupported type of page in order to reduce the risk of unexpected
+ * races caused by taking a page refcount.
*/
if (!HWPoisonHandlable(head, flags))
return -EBUSY;
@@ -1396,14 +1405,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
- int kill = 1, forcekill;
+ int forcekill;
bool mlocked = PageMlocked(hpage);
/*
* Here we are interested only in user-mapped pages, so skip any
* other types of pages.
*/
- if (PageReserved(p) || PageSlab(p))
+ if (PageReserved(p) || PageSlab(p) || PageTable(p))
return true;
if (!(PageLRU(hpage) || PageHuge(p)))
return true;
@@ -1437,7 +1446,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
- kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
@@ -1448,12 +1456,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
- *
- * Error handling: We ignore errors here because
- * there's nothing that can be done.
*/
- if (kill)
- collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
if (PageHuge(hpage) && !PageAnon(hpage)) {
/*
@@ -1495,7 +1499,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
+ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) ||
+ !unmap_success;
kill_procs(&tokill, forcekill, !unmap_success, pfn, flags);
return unmap_success;
@@ -1524,20 +1529,18 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
-static int try_to_split_thp_page(struct page *page, const char *msg)
+static int try_to_split_thp_page(struct page *page)
{
+ int ret;
+
lock_page(page);
- if (unlikely(split_huge_page(page))) {
- unsigned long pfn = page_to_pfn(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
- unlock_page(page);
- pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ if (unlikely(ret))
put_page(page);
- return -EBUSY;
- }
- unlock_page(page);
- return 0;
+ return ret;
}
static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
@@ -1862,8 +1865,10 @@ retry:
if (hwpoison_filter(p)) {
hugetlb_clear_page_hwpoison(head);
- res = -EOPNOTSUPP;
- goto out;
+ unlock_page(head);
+ if (res == 1)
+ put_page(head);
+ return -EOPNOTSUPP;
}
/*
@@ -1928,7 +1933,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* Call driver's implementation to handle the memory failure, otherwise
* fall back to generic handler.
*/
- if (pgmap->ops->memory_failure) {
+ if (pgmap_has_memory_failure(pgmap)) {
rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
/*
* Fall back to generic handler too if operation is not
@@ -2026,7 +2031,7 @@ try_again:
/*
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
- * prep_new_page() will be the gate keeper.
+ * check_new_page() will be the gate keeper.
* 2) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
@@ -2079,7 +2084,7 @@ try_again:
* page is a valid handlable page.
*/
SetPageHasHWPoisoned(hpage);
- if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ if (try_to_split_thp_page(p) < 0) {
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
res = -EBUSY;
goto unlock_mutex;
@@ -2129,7 +2134,7 @@ try_again:
page_flags = p->flags;
if (hwpoison_filter(p)) {
- TestClearPageHWPoison(p);
+ ClearPageHWPoison(p);
unlock_page(p);
put_page(p);
res = -EOPNOTSUPP;
@@ -2354,7 +2359,7 @@ int unpoison_memory(unsigned long pfn)
goto unlock_mutex;
}
- if (PageSlab(page) || PageTable(page))
+ if (PageSlab(page) || PageTable(page) || PageReserved(page))
goto unlock_mutex;
ret = get_hwpoison_page(p, MF_UNPOISON);
@@ -2378,13 +2383,14 @@ int unpoison_memory(unsigned long pfn)
count = free_raw_hwp_pages(page, false);
if (count == 0) {
ret = -EBUSY;
+ put_page(page);
goto unlock_mutex;
}
}
freeit = !!TestClearPageHWPoison(p);
put_page(page);
- if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
+ if (freeit) {
put_page(page);
ret = 0;
}
@@ -2404,24 +2410,26 @@ EXPORT_SYMBOL(unpoison_memory);
static bool isolate_page(struct page *page, struct list_head *pagelist)
{
bool isolated = false;
- bool lru = PageLRU(page);
if (PageHuge(page)) {
isolated = !isolate_hugetlb(page, pagelist);
} else {
+ bool lru = !__PageMovable(page);
+
if (lru)
isolated = !isolate_lru_page(page);
else
- isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+ isolated = !isolate_movable_page(page,
+ ISOLATE_UNEVICTABLE);
- if (isolated)
+ if (isolated) {
list_add(&page->lru, pagelist);
+ if (lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+ }
}
- if (isolated && lru)
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
-
/*
* If we succeed to isolate the page, we grabbed another refcount on
* the page, so we can safely drop the one we got from get_any_pages().
@@ -2434,11 +2442,11 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
}
/*
- * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
* If the page is a non-dirty unmapped page-cache page, it simply invalidates.
* If the page is mapped, it migrates the contents over.
*/
-static int __soft_offline_page(struct page *page)
+static int soft_offline_in_use_page(struct page *page)
{
long ret = 0;
unsigned long pfn = page_to_pfn(page);
@@ -2451,6 +2459,14 @@ static int __soft_offline_page(struct page *page)
.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
};
+ if (!huge && PageTransHuge(hpage)) {
+ if (try_to_split_thp_page(page)) {
+ pr_info("soft offline: %#lx: thp split failed\n", pfn);
+ return -EBUSY;
+ }
+ hpage = page;
+ }
+
lock_page(page);
if (!PageHuge(page))
wait_on_page_writeback(page);
@@ -2500,26 +2516,6 @@ static int __soft_offline_page(struct page *page)
return ret;
}
-static int soft_offline_in_use_page(struct page *page)
-{
- struct page *hpage = compound_head(page);
-
- if (!PageHuge(page) && PageTransHuge(hpage))
- if (try_to_split_thp_page(page, "soft offline") < 0)
- return -EBUSY;
- return __soft_offline_page(page);
-}
-
-static int soft_offline_free_page(struct page *page)
-{
- int rc = 0;
-
- if (!page_handle_poison(page, true, false))
- rc = -EBUSY;
-
- return rc;
-}
-
static void put_ref_page(struct page *page)
{
if (page)
@@ -2587,8 +2583,6 @@ retry:
if (hwpoison_filter(page)) {
if (ret > 0)
put_page(page);
- else
- put_ref_page(ref_page);
mutex_unlock(&mf_mutex);
return -EOPNOTSUPP;
@@ -2597,7 +2591,7 @@ retry:
if (ret > 0) {
ret = soft_offline_in_use_page(page);
} else if (ret == 0) {
- if (soft_offline_free_page(page) && try_again) {
+ if (!page_handle_poison(page, true, false) && try_again) {
try_again = false;
flags &= ~MF_COUNT_INCREASED;
goto retry;
@@ -2611,7 +2605,7 @@ retry:
void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
- int i;
+ int i, total = 0;
/*
* A further optimization is to have per section refcounted
@@ -2624,8 +2618,10 @@ void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
for (i = 0; i < nr_pages; i++) {
if (PageHWPoison(&memmap[i])) {
- num_poisoned_pages_dec();
+ total++;
ClearPageHWPoison(&memmap[i]);
}
}
+ if (total)
+ num_poisoned_pages_sub(total);
}
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
new file mode 100644
index 000000000000..fa8c9d07f9ce
--- /dev/null
+++ b/mm/memory-tiers.c
@@ -0,0 +1,732 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/slab.h>
+#include <linux/lockdep.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/memory.h>
+#include <linux/memory-tiers.h>
+
+#include "internal.h"
+
+struct memory_tier {
+ /* hierarchy of memory tiers */
+ struct list_head list;
+ /* list of all memory types part of this tier */
+ struct list_head memory_types;
+ /*
+ * start value of abstract distance. memory tier maps
+ * an abstract distance range,
+ * adistance_start .. adistance_start + MEMTIER_CHUNK_SIZE
+ */
+ int adistance_start;
+ struct device dev;
+ /* All the nodes that are part of all the lower memory tiers. */
+ nodemask_t lower_tier_mask;
+};
+
+struct demotion_nodes {
+ nodemask_t preferred;
+};
+
+struct node_memory_type_map {
+ struct memory_dev_type *memtype;
+ int map_count;
+};
+
+static DEFINE_MUTEX(memory_tier_lock);
+static LIST_HEAD(memory_tiers);
+static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
+static struct memory_dev_type *default_dram_type;
+
+static struct bus_type memory_tier_subsys = {
+ .name = "memory_tiering",
+ .dev_name = "memory_tier",
+};
+
+#ifdef CONFIG_MIGRATION
+static int top_tier_adistance;
+/*
+ * node_demotion[] examples:
+ *
+ * Example 1:
+ *
+ * Node 0 & 1 are CPU + DRAM nodes, node 2 & 3 are PMEM nodes.
+ *
+ * node distances:
+ * node 0 1 2 3
+ * 0 10 20 30 40
+ * 1 20 10 40 30
+ * 2 30 40 10 40
+ * 3 40 30 40 10
+ *
+ * memory_tiers0 = 0-1
+ * memory_tiers1 = 2-3
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 3
+ * node_demotion[2].preferred = <empty>
+ * node_demotion[3].preferred = <empty>
+ *
+ * Example 2:
+ *
+ * Node 0 & 1 are CPU + DRAM nodes, node 2 is memory-only DRAM node.
+ *
+ * node distances:
+ * node 0 1 2
+ * 0 10 20 30
+ * 1 20 10 30
+ * 2 30 30 10
+ *
+ * memory_tiers0 = 0-2
+ *
+ * node_demotion[0].preferred = <empty>
+ * node_demotion[1].preferred = <empty>
+ * node_demotion[2].preferred = <empty>
+ *
+ * Example 3:
+ *
+ * Node 0 is CPU + DRAM nodes, Node 1 is HBM node, node 2 is PMEM node.
+ *
+ * node distances:
+ * node 0 1 2
+ * 0 10 20 30
+ * 1 20 10 40
+ * 2 30 40 10
+ *
+ * memory_tiers0 = 1
+ * memory_tiers1 = 0
+ * memory_tiers2 = 2
+ *
+ * node_demotion[0].preferred = 2
+ * node_demotion[1].preferred = 0
+ * node_demotion[2].preferred = <empty>
+ *
+ */
+static struct demotion_nodes *node_demotion __read_mostly;
+#endif /* CONFIG_MIGRATION */
+
+static inline struct memory_tier *to_memory_tier(struct device *device)
+{
+ return container_of(device, struct memory_tier, dev);
+}
+
+static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memtier)
+{
+ nodemask_t nodes = NODE_MASK_NONE;
+ struct memory_dev_type *memtype;
+
+ list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling)
+ nodes_or(nodes, nodes, memtype->nodes);
+
+ return nodes;
+}
+
+static void memory_tier_device_release(struct device *dev)
+{
+ struct memory_tier *tier = to_memory_tier(dev);
+ /*
+ * synchronize_rcu in clear_node_memory_tier makes sure
+ * we don't have rcu access to this memory tier.
+ */
+ kfree(tier);
+}
+
+static ssize_t nodelist_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int ret;
+ nodemask_t nmask;
+
+ mutex_lock(&memory_tier_lock);
+ nmask = get_memtier_nodemask(to_memory_tier(dev));
+ ret = sysfs_emit(buf, "%*pbl\n", nodemask_pr_args(&nmask));
+ mutex_unlock(&memory_tier_lock);
+ return ret;
+}
+static DEVICE_ATTR_RO(nodelist);
+
+static struct attribute *memtier_dev_attrs[] = {
+ &dev_attr_nodelist.attr,
+ NULL
+};
+
+static const struct attribute_group memtier_dev_group = {
+ .attrs = memtier_dev_attrs,
+};
+
+static const struct attribute_group *memtier_dev_groups[] = {
+ &memtier_dev_group,
+ NULL
+};
+
+static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memtype)
+{
+ int ret;
+ bool found_slot = false;
+ struct memory_tier *memtier, *new_memtier;
+ int adistance = memtype->adistance;
+ unsigned int memtier_adistance_chunk_size = MEMTIER_CHUNK_SIZE;
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ adistance = round_down(adistance, memtier_adistance_chunk_size);
+ /*
+ * If the memtype is already part of a memory tier,
+ * just return that.
+ */
+ if (!list_empty(&memtype->tier_sibiling)) {
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ if (adistance == memtier->adistance_start)
+ return memtier;
+ }
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ if (adistance == memtier->adistance_start) {
+ goto link_memtype;
+ } else if (adistance < memtier->adistance_start) {
+ found_slot = true;
+ break;
+ }
+ }
+
+ new_memtier = kzalloc(sizeof(struct memory_tier), GFP_KERNEL);
+ if (!new_memtier)
+ return ERR_PTR(-ENOMEM);
+
+ new_memtier->adistance_start = adistance;
+ INIT_LIST_HEAD(&new_memtier->list);
+ INIT_LIST_HEAD(&new_memtier->memory_types);
+ if (found_slot)
+ list_add_tail(&new_memtier->list, &memtier->list);
+ else
+ list_add_tail(&new_memtier->list, &memory_tiers);
+
+ new_memtier->dev.id = adistance >> MEMTIER_CHUNK_BITS;
+ new_memtier->dev.bus = &memory_tier_subsys;
+ new_memtier->dev.release = memory_tier_device_release;
+ new_memtier->dev.groups = memtier_dev_groups;
+
+ ret = device_register(&new_memtier->dev);
+ if (ret) {
+ list_del(&memtier->list);
+ put_device(&memtier->dev);
+ return ERR_PTR(ret);
+ }
+ memtier = new_memtier;
+
+link_memtype:
+ list_add(&memtype->tier_sibiling, &memtier->memory_types);
+ return memtier;
+}
+
+static struct memory_tier *__node_get_memory_tier(int node)
+{
+ pg_data_t *pgdat;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return NULL;
+ /*
+ * Since we hold memory_tier_lock, we can avoid
+ * RCU read locks when accessing the details. No
+ * parallel updates are possible here.
+ */
+ return rcu_dereference_check(pgdat->memtier,
+ lockdep_is_held(&memory_tier_lock));
+}
+
+#ifdef CONFIG_MIGRATION
+bool node_is_toptier(int node)
+{
+ bool toptier;
+ pg_data_t *pgdat;
+ struct memory_tier *memtier;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return false;
+
+ rcu_read_lock();
+ memtier = rcu_dereference(pgdat->memtier);
+ if (!memtier) {
+ toptier = true;
+ goto out;
+ }
+ if (memtier->adistance_start <= top_tier_adistance)
+ toptier = true;
+ else
+ toptier = false;
+out:
+ rcu_read_unlock();
+ return toptier;
+}
+
+void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets)
+{
+ struct memory_tier *memtier;
+
+ /*
+ * pg_data_t.memtier updates includes a synchronize_rcu()
+ * which ensures that we either find NULL or a valid memtier
+ * in NODE_DATA. protect the access via rcu_read_lock();
+ */
+ rcu_read_lock();
+ memtier = rcu_dereference(pgdat->memtier);
+ if (memtier)
+ *targets = memtier->lower_tier_mask;
+ else
+ *targets = NODE_MASK_NONE;
+ rcu_read_unlock();
+}
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+ struct demotion_nodes *nd;
+ int target;
+
+ if (!node_demotion)
+ return NUMA_NO_NODE;
+
+ nd = &node_demotion[node];
+
+ /*
+ * node_demotion[] is updated without excluding this
+ * function from running.
+ *
+ * Make sure to use RCU over entire code blocks if
+ * node_demotion[] reads need to be consistent.
+ */
+ rcu_read_lock();
+ /*
+ * If there are multiple target nodes, just select one
+ * target node randomly.
+ *
+ * In addition, we can also use round-robin to select
+ * target node, but we should introduce another variable
+ * for node_demotion[] to record last selected target node,
+ * that may cause cache ping-pong due to the changing of
+ * last target node. Or introducing per-cpu data to avoid
+ * caching issue, which seems more complicated. So selecting
+ * target node randomly seems better until now.
+ */
+ target = node_random(&nd->preferred);
+ rcu_read_unlock();
+
+ return target;
+}
+
+static void disable_all_demotion_targets(void)
+{
+ struct memory_tier *memtier;
+ int node;
+
+ for_each_node_state(node, N_MEMORY) {
+ node_demotion[node].preferred = NODE_MASK_NONE;
+ /*
+ * We are holding memory_tier_lock, it is safe
+ * to access pgda->memtier.
+ */
+ memtier = __node_get_memory_tier(node);
+ if (memtier)
+ memtier->lower_tier_mask = NODE_MASK_NONE;
+ }
+ /*
+ * Ensure that the "disable" is visible across the system.
+ * Readers will see either a combination of before+disable
+ * state or disable+after. They will never see before and
+ * after state together.
+ */
+ synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for all memory
+ * nodes. Failing here is OK. It might just indicate
+ * being at the end of a chain.
+ */
+static void establish_demotion_targets(void)
+{
+ struct memory_tier *memtier;
+ struct demotion_nodes *nd;
+ int target = NUMA_NO_NODE, node;
+ int distance, best_distance;
+ nodemask_t tier_nodes, lower_tier;
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ if (!node_demotion || !IS_ENABLED(CONFIG_MIGRATION))
+ return;
+
+ disable_all_demotion_targets();
+
+ for_each_node_state(node, N_MEMORY) {
+ best_distance = -1;
+ nd = &node_demotion[node];
+
+ memtier = __node_get_memory_tier(node);
+ if (!memtier || list_is_last(&memtier->list, &memory_tiers))
+ continue;
+ /*
+ * Get the lower memtier to find the demotion node list.
+ */
+ memtier = list_next_entry(memtier, list);
+ tier_nodes = get_memtier_nodemask(memtier);
+ /*
+ * find_next_best_node, use 'used' nodemask as a skip list.
+ * Add all memory nodes except the selected memory tier
+ * nodelist to skip list so that we find the best node from the
+ * memtier nodelist.
+ */
+ nodes_andnot(tier_nodes, node_states[N_MEMORY], tier_nodes);
+
+ /*
+ * Find all the nodes in the memory tier node list of same best distance.
+ * add them to the preferred mask. We randomly select between nodes
+ * in the preferred mask when allocating pages during demotion.
+ */
+ do {
+ target = find_next_best_node(node, &tier_nodes);
+ if (target == NUMA_NO_NODE)
+ break;
+
+ distance = node_distance(node, target);
+ if (distance == best_distance || best_distance == -1) {
+ best_distance = distance;
+ node_set(target, nd->preferred);
+ } else {
+ break;
+ }
+ } while (1);
+ }
+ /*
+ * Promotion is allowed from a memory tier to higher
+ * memory tier only if the memory tier doesn't include
+ * compute. We want to skip promotion from a memory tier,
+ * if any node that is part of the memory tier have CPUs.
+ * Once we detect such a memory tier, we consider that tier
+ * as top tiper from which promotion is not allowed.
+ */
+ list_for_each_entry_reverse(memtier, &memory_tiers, list) {
+ tier_nodes = get_memtier_nodemask(memtier);
+ nodes_and(tier_nodes, node_states[N_CPU], tier_nodes);
+ if (!nodes_empty(tier_nodes)) {
+ /*
+ * abstract distance below the max value of this memtier
+ * is considered toptier.
+ */
+ top_tier_adistance = memtier->adistance_start +
+ MEMTIER_CHUNK_SIZE - 1;
+ break;
+ }
+ }
+ /*
+ * Now build the lower_tier mask for each node collecting node mask from
+ * all memory tier below it. This allows us to fallback demotion page
+ * allocation to a set of nodes that is closer the above selected
+ * perferred node.
+ */
+ lower_tier = node_states[N_MEMORY];
+ list_for_each_entry(memtier, &memory_tiers, list) {
+ /*
+ * Keep removing current tier from lower_tier nodes,
+ * This will remove all nodes in current and above
+ * memory tier from the lower_tier mask.
+ */
+ tier_nodes = get_memtier_nodemask(memtier);
+ nodes_andnot(lower_tier, lower_tier, tier_nodes);
+ memtier->lower_tier_mask = lower_tier;
+ }
+}
+
+#else
+static inline void disable_all_demotion_targets(void) {}
+static inline void establish_demotion_targets(void) {}
+#endif /* CONFIG_MIGRATION */
+
+static inline void __init_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+ if (!node_memory_types[node].memtype)
+ node_memory_types[node].memtype = memtype;
+ /*
+ * for each device getting added in the same NUMA node
+ * with this specific memtype, bump the map count. We
+ * Only take memtype device reference once, so that
+ * changing a node memtype can be done by droping the
+ * only reference count taken here.
+ */
+
+ if (node_memory_types[node].memtype == memtype) {
+ if (!node_memory_types[node].map_count++)
+ kref_get(&memtype->kref);
+ }
+}
+
+static struct memory_tier *set_node_memory_tier(int node)
+{
+ struct memory_tier *memtier;
+ struct memory_dev_type *memtype;
+ pg_data_t *pgdat = NODE_DATA(node);
+
+
+ lockdep_assert_held_once(&memory_tier_lock);
+
+ if (!node_state(node, N_MEMORY))
+ return ERR_PTR(-EINVAL);
+
+ __init_node_memory_type(node, default_dram_type);
+
+ memtype = node_memory_types[node].memtype;
+ node_set(node, memtype->nodes);
+ memtier = find_create_memory_tier(memtype);
+ if (!IS_ERR(memtier))
+ rcu_assign_pointer(pgdat->memtier, memtier);
+ return memtier;
+}
+
+static void destroy_memory_tier(struct memory_tier *memtier)
+{
+ list_del(&memtier->list);
+ device_unregister(&memtier->dev);
+}
+
+static bool clear_node_memory_tier(int node)
+{
+ bool cleared = false;
+ pg_data_t *pgdat;
+ struct memory_tier *memtier;
+
+ pgdat = NODE_DATA(node);
+ if (!pgdat)
+ return false;
+
+ /*
+ * Make sure that anybody looking at NODE_DATA who finds
+ * a valid memtier finds memory_dev_types with nodes still
+ * linked to the memtier. We achieve this by waiting for
+ * rcu read section to finish using synchronize_rcu.
+ * This also enables us to free the destroyed memory tier
+ * with kfree instead of kfree_rcu
+ */
+ memtier = __node_get_memory_tier(node);
+ if (memtier) {
+ struct memory_dev_type *memtype;
+
+ rcu_assign_pointer(pgdat->memtier, NULL);
+ synchronize_rcu();
+ memtype = node_memory_types[node].memtype;
+ node_clear(node, memtype->nodes);
+ if (nodes_empty(memtype->nodes)) {
+ list_del_init(&memtype->tier_sibiling);
+ if (list_empty(&memtier->memory_types))
+ destroy_memory_tier(memtier);
+ }
+ cleared = true;
+ }
+ return cleared;
+}
+
+static void release_memtype(struct kref *kref)
+{
+ struct memory_dev_type *memtype;
+
+ memtype = container_of(kref, struct memory_dev_type, kref);
+ kfree(memtype);
+}
+
+struct memory_dev_type *alloc_memory_type(int adistance)
+{
+ struct memory_dev_type *memtype;
+
+ memtype = kmalloc(sizeof(*memtype), GFP_KERNEL);
+ if (!memtype)
+ return ERR_PTR(-ENOMEM);
+
+ memtype->adistance = adistance;
+ INIT_LIST_HEAD(&memtype->tier_sibiling);
+ memtype->nodes = NODE_MASK_NONE;
+ kref_init(&memtype->kref);
+ return memtype;
+}
+EXPORT_SYMBOL_GPL(alloc_memory_type);
+
+void destroy_memory_type(struct memory_dev_type *memtype)
+{
+ kref_put(&memtype->kref, release_memtype);
+}
+EXPORT_SYMBOL_GPL(destroy_memory_type);
+
+void init_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+
+ mutex_lock(&memory_tier_lock);
+ __init_node_memory_type(node, memtype);
+ mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(init_node_memory_type);
+
+void clear_node_memory_type(int node, struct memory_dev_type *memtype)
+{
+ mutex_lock(&memory_tier_lock);
+ if (node_memory_types[node].memtype == memtype)
+ node_memory_types[node].map_count--;
+ /*
+ * If we umapped all the attached devices to this node,
+ * clear the node memory type.
+ */
+ if (!node_memory_types[node].map_count) {
+ node_memory_types[node].memtype = NULL;
+ kref_put(&memtype->kref, release_memtype);
+ }
+ mutex_unlock(&memory_tier_lock);
+}
+EXPORT_SYMBOL_GPL(clear_node_memory_type);
+
+static int __meminit memtier_hotplug_callback(struct notifier_block *self,
+ unsigned long action, void *_arg)
+{
+ struct memory_tier *memtier;
+ struct memory_notify *arg = _arg;
+
+ /*
+ * Only update the node migration order when a node is
+ * changing status, like online->offline.
+ */
+ if (arg->status_change_nid < 0)
+ return notifier_from_errno(0);
+
+ switch (action) {
+ case MEM_OFFLINE:
+ mutex_lock(&memory_tier_lock);
+ if (clear_node_memory_tier(arg->status_change_nid))
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+ break;
+ case MEM_ONLINE:
+ mutex_lock(&memory_tier_lock);
+ memtier = set_node_memory_tier(arg->status_change_nid);
+ if (!IS_ERR(memtier))
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+ break;
+ }
+
+ return notifier_from_errno(0);
+}
+
+static int __init memory_tier_init(void)
+{
+ int ret, node;
+ struct memory_tier *memtier;
+
+ ret = subsys_virtual_register(&memory_tier_subsys, NULL);
+ if (ret)
+ panic("%s() failed to register memory tier subsystem\n", __func__);
+
+#ifdef CONFIG_MIGRATION
+ node_demotion = kcalloc(nr_node_ids, sizeof(struct demotion_nodes),
+ GFP_KERNEL);
+ WARN_ON(!node_demotion);
+#endif
+ mutex_lock(&memory_tier_lock);
+ /*
+ * For now we can have 4 faster memory tiers with smaller adistance
+ * than default DRAM tier.
+ */
+ default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
+ if (!default_dram_type)
+ panic("%s() failed to allocate default DRAM tier\n", __func__);
+
+ /*
+ * Look at all the existing N_MEMORY nodes and add them to
+ * default memory tier or to a tier if we already have memory
+ * types assigned.
+ */
+ for_each_node_state(node, N_MEMORY) {
+ memtier = set_node_memory_tier(node);
+ if (IS_ERR(memtier))
+ /*
+ * Continue with memtiers we are able to setup
+ */
+ break;
+ }
+ establish_demotion_targets();
+ mutex_unlock(&memory_tier_lock);
+
+ hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO);
+ return 0;
+}
+subsys_initcall(memory_tier_init);
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_MIGRATION
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ numa_demotion_enabled ? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+
+ ret = kstrtobool(buf, &numa_demotion_enabled);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+ __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+ numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+ &numa_demotion_enabled_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+ .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+ int err;
+ struct kobject *numa_kobj;
+
+ numa_kobj = kobject_create_and_add("numa", mm_kobj);
+ if (!numa_kobj) {
+ pr_err("failed to create numa kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(numa_kobj, &numa_attr_group);
+ if (err) {
+ pr_err("failed to register numa group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(numa_kobj);
+ return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif /* CONFIG_SYSFS */
+#endif
diff --git a/mm/memory.c b/mm/memory.c
index 4ba73f5aa8bb..f88c351aecd4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,7 @@
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
+#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -66,6 +67,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
@@ -74,6 +76,7 @@
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
+#include <linux/sched/sysctl.h>
#include <trace/events/kmem.h>
@@ -125,18 +128,6 @@ int randomize_va_space __read_mostly =
2;
#endif
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
- /*
- * Those arches which don't have hw access flag feature need to
- * implement their own helper. By default, "true" means pagefault
- * will be hit on old pte.
- */
- return true;
-}
-#endif
-
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -402,12 +393,21 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
-void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long floor, unsigned long ceiling)
+void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
+ struct vm_area_struct *vma, unsigned long floor,
+ unsigned long ceiling)
{
- while (vma) {
- struct vm_area_struct *next = vma->vm_next;
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
+
+ do {
unsigned long addr = vma->vm_start;
+ struct vm_area_struct *next;
+
+ /*
+ * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
+ * be 0. This will underflow and is okay.
+ */
+ next = mas_find(&mas, ceiling - 1);
/*
* Hide vma from rmap and truncate_pagecache before freeing
@@ -426,7 +426,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
&& !is_vm_hugetlb_page(next)) {
vma = next;
- next = vma->vm_next;
+ next = mas_find(&mas, ceiling - 1);
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
@@ -434,7 +434,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
floor, next ? next->vm_start : ceiling);
}
vma = next;
- }
+ } while (vma);
}
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
@@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte,
struct zap_details *details, pte_t pteval)
{
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (zap_drop_file_uffd_wp(details))
return;
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+#endif
}
static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1685,10 +1687,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file) {
zap_flags_t zap_flags = details ?
details->zap_flags : 0;
- i_mmap_lock_write(vma->vm_file->f_mapping);
__unmap_hugepage_range_final(tlb, vma, start, end,
NULL, zap_flags);
- i_mmap_unlock_write(vma->vm_file->f_mapping);
}
} else
unmap_page_range(tlb, vma, start, end, details);
@@ -1698,6 +1698,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlb: address of the caller's struct mmu_gather
+ * @mt: the maple tree
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
@@ -1713,7 +1714,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-void unmap_vmas(struct mmu_gather *tlb,
+void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr)
{
@@ -1723,12 +1724,14 @@ void unmap_vmas(struct mmu_gather *tlb,
/* Careful - we need to zap private pages too! */
.even_cows = true,
};
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
start_addr, end_addr);
mmu_notifier_invalidate_range_start(&range);
- for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
+ do {
unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
+ } while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
}
@@ -1743,8 +1746,11 @@ void unmap_vmas(struct mmu_gather *tlb,
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size)
{
+ struct maple_tree *mt = &vma->vm_mm->mm_mt;
+ unsigned long end = start + size;
struct mmu_notifier_range range;
struct mmu_gather tlb;
+ MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
lru_add_drain();
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
@@ -1752,8 +1758,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
- for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+ do {
unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ } while ((vma = mas_find(&mas, end - 1)) != NULL);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
}
@@ -2870,7 +2877,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
@@ -3128,6 +3135,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
delayacct_wpcopy_end();
return 0;
}
+ kmsan_copy_page_meta(new_page, old_page);
}
if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
@@ -3362,6 +3370,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
@@ -3408,48 +3417,47 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(vmf->page)) {
- struct page *page = vmf->page;
-
+ folio = page_folio(vmf->page);
+ if (folio_test_anon(folio)) {
/*
* If the page is exclusive to this process we must reuse the
* page without further checks.
*/
- if (PageAnonExclusive(page))
+ if (PageAnonExclusive(vmf->page))
goto reuse;
/*
- * We have to verify under page lock: these early checks are
- * just an optimization to avoid locking the page and freeing
+ * We have to verify under folio lock: these early checks are
+ * just an optimization to avoid locking the folio and freeing
* the swapcache if there is little hope that we can reuse.
*
- * PageKsm() doesn't necessarily raise the page refcount.
+ * KSM doesn't necessarily raise the folio refcount.
*/
- if (PageKsm(page) || page_count(page) > 3)
+ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
goto copy;
- if (!PageLRU(page))
+ if (!folio_test_lru(folio))
/*
* Note: We cannot easily detect+handle references from
- * remote LRU pagevecs or references to PageLRU() pages.
+ * remote LRU pagevecs or references to LRU folios.
*/
lru_add_drain();
- if (page_count(page) > 1 + PageSwapCache(page))
+ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
goto copy;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto copy;
- if (PageSwapCache(page))
- try_to_free_swap(page);
- if (PageKsm(page) || page_count(page) != 1) {
- unlock_page(page);
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
+ folio_unlock(folio);
goto copy;
}
/*
- * Ok, we've got the only page reference from our mapping
- * and the page is locked, it's dark out, and we're wearing
+ * Ok, we've got the only folio reference from our mapping
+ * and the folio is locked, it's dark out, and we're wearing
* sunglasses. Hit it.
*/
- page_move_anon_rmap(page, vma);
- unlock_page(page);
+ page_move_anon_rmap(vmf->page, vma);
+ folio_unlock(folio);
reuse:
if (unlikely(unshare)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3612,11 +3620,11 @@ EXPORT_SYMBOL(unmap_mapping_range);
*/
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct vm_area_struct *vma = vmf->vma;
struct mmu_notifier_range range;
- if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+ if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
return VM_FAULT_RETRY;
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
vma->vm_mm, vmf->address & PAGE_MASK,
@@ -3626,23 +3634,23 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
- restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+ restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- unlock_page(page);
+ folio_unlock(folio);
mmu_notifier_invalidate_range_end(&range);
return 0;
}
-static inline bool should_try_to_free_swap(struct page *page,
+static inline bool should_try_to_free_swap(struct folio *folio,
struct vm_area_struct *vma,
unsigned int fault_flags)
{
- if (!PageSwapCache(page))
+ if (!folio_test_swapcache(folio))
return false;
- if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) ||
- PageMlocked(page))
+ if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) ||
+ folio_test_mlocked(folio))
return true;
/*
* If we want to map a page that's in the swapcache writable, we
@@ -3650,8 +3658,8 @@ static inline bool should_try_to_free_swap(struct page *page,
* user. Try freeing the swapcache to get rid of the swapcache
* reference only in case it's likely that we'll be the exlusive user.
*/
- return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) &&
- page_count(page) == 2;
+ return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) &&
+ folio_ref_count(folio) == 2;
}
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
@@ -3718,7 +3726,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *swapcache;
+ struct folio *swapcache, *folio = NULL;
+ struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
bool exclusive = false;
@@ -3741,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
vmf->page = pfn_swap_entry_to_page(entry);
- ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+
+ /*
+ * Get a page reference while we know the page can't be
+ * freed.
+ */
+ get_page(vmf->page);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ vmf->page->pgmap->ops->migrate_to_ram(vmf);
+ put_page(vmf->page);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else if (is_swapin_error_entry(entry)) {
@@ -3760,21 +3783,25 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!si))
goto out;
- page = lookup_swap_cache(entry, vma, vmf->address);
- swapcache = page;
+ folio = swap_cache_get_folio(entry, vma, vmf->address);
+ if (folio)
+ page = folio_file_page(folio, swp_offset(entry));
+ swapcache = folio;
- if (!page) {
+ if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
/* skip swapcache */
- page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address);
- if (page) {
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
-
- if (mem_cgroup_swapin_charge_page(page,
- vma->vm_mm, GFP_KERNEL, entry)) {
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
+ vma, vmf->address, false);
+ page = &folio->page;
+ if (folio) {
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ if (mem_cgroup_swapin_charge_folio(folio,
+ vma->vm_mm, GFP_KERNEL,
+ entry)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -3782,23 +3809,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
- workingset_refault(page_folio(page),
- shadow);
+ workingset_refault(folio, shadow);
- lru_cache_add(page);
+ folio_add_lru(folio);
/* To provide entry to swap_readpage() */
- set_page_private(page, entry.val);
+ folio_set_swap_entry(folio, entry);
swap_readpage(page, true, NULL);
- set_page_private(page, 0);
+ folio->private = NULL;
}
} else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
vmf);
- swapcache = page;
+ if (page)
+ folio = page_folio(page);
+ swapcache = folio;
}
- if (!page) {
+ if (!folio) {
/*
* Back out if somebody else faulted in this pte
* while we released the pte lock.
@@ -3823,7 +3851,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
+ locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags);
if (!locked) {
ret |= VM_FAULT_RETRY;
@@ -3832,13 +3860,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (swapcache) {
/*
- * Make sure try_to_free_swap or swapoff did not release the
+ * Make sure folio_free_swap() or swapoff did not release the
* swapcache from under us. The page pin, and pte_same test
* below, are not enough to exclude that. Even if it is still
* swapcache, we need to check that the page's swap has not
* changed.
*/
- if (unlikely(!PageSwapCache(page) ||
+ if (unlikely(!folio_test_swapcache(folio) ||
page_private(page) != entry.val))
goto out_page;
@@ -3850,9 +3878,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page = ksm_might_need_to_copy(page, vma, vmf->address);
if (unlikely(!page)) {
ret = VM_FAULT_OOM;
- page = swapcache;
goto out_page;
}
+ folio = page_folio(page);
/*
* If we want to map a page that's in the swapcache writable, we
@@ -3860,8 +3888,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* owner. Try removing the extra reference from the local LRU
* pagevecs if required.
*/
- if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache &&
- !PageKsm(page) && !PageLRU(page))
+ if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
+ !folio_test_ksm(folio) && !folio_test_lru(folio))
lru_add_drain();
}
@@ -3875,7 +3903,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
goto out_nomap;
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
ret = VM_FAULT_SIGBUS;
goto out_nomap;
}
@@ -3888,26 +3916,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* check after taking the PT lock and making sure that nobody
* concurrently faulted in this page and set PG_anon_exclusive.
*/
- BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
- BUG_ON(PageAnon(page) && PageAnonExclusive(page));
+ BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
+ BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
/*
* Check under PT lock (to protect against concurrent fork() sharing
* the swap entry concurrently) for certainly exclusive pages.
*/
- if (!PageKsm(page)) {
+ if (!folio_test_ksm(folio)) {
/*
* Note that pte_swp_exclusive() == false for architectures
* without __HAVE_ARCH_PTE_SWP_EXCLUSIVE.
*/
exclusive = pte_swp_exclusive(vmf->orig_pte);
- if (page != swapcache) {
+ if (folio != swapcache) {
/*
* We have a fresh page that is not exposed to the
* swapcache -> certainly exclusive.
*/
exclusive = true;
- } else if (exclusive && PageWriteback(page) &&
+ } else if (exclusive && folio_test_writeback(folio) &&
data_race(si->flags & SWP_STABLE_WRITES)) {
/*
* This is tricky: not all swap backends support
@@ -3937,8 +3965,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* yet.
*/
swap_free(entry);
- if (should_try_to_free_swap(page, vma, vmf->flags))
- try_to_free_swap(page);
+ if (should_try_to_free_swap(folio, vma, vmf->flags))
+ folio_free_swap(folio);
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
@@ -3950,7 +3978,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* exposing them to the swapcache or because the swap entry indicates
* exclusivity.
*/
- if (!PageKsm(page) && (exclusive || page_count(page) == 1)) {
+ if (!folio_test_ksm(folio) &&
+ (exclusive || folio_ref_count(folio) == 1)) {
if (vmf->flags & FAULT_FLAG_WRITE) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
@@ -3968,19 +3997,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->orig_pte = pte;
/* ksm created a completely new copy */
- if (unlikely(page != swapcache && swapcache)) {
+ if (unlikely(folio != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address);
- lru_cache_add_inactive_or_unevictable(page, vma);
+ folio_add_lru_vma(folio, vma);
} else {
page_add_anon_rmap(page, vma, vmf->address, rmap_flags);
}
- VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page)));
+ VM_BUG_ON(!folio_test_anon(folio) ||
+ (pte_write(pte) && !PageAnonExclusive(page)));
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
- unlock_page(page);
- if (page != swapcache && swapcache) {
+ folio_unlock(folio);
+ if (folio != swapcache && swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
@@ -3989,8 +4019,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* so that the swap count won't change under a
* parallel locked swapcache.
*/
- unlock_page(swapcache);
- put_page(swapcache);
+ folio_unlock(swapcache);
+ folio_put(swapcache);
}
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -4011,12 +4041,12 @@ out:
out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
- unlock_page(page);
+ folio_unlock(folio);
out_release:
- put_page(page);
- if (page != swapcache && swapcache) {
- unlock_page(swapcache);
- put_page(swapcache);
+ folio_put(folio);
+ if (folio != swapcache && swapcache) {
+ folio_unlock(swapcache);
+ folio_put(swapcache);
}
if (si)
put_swap_device(si);
@@ -4104,7 +4134,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
if (!pte_none(*vmf->pte)) {
- update_mmu_cache(vma, vmf->address, vmf->pte);
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}
@@ -4386,14 +4416,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- ret = 0;
+
/* Re-check under ptl */
- if (likely(!vmf_pte_changed(vmf)))
+ if (likely(!vmf_pte_changed(vmf))) {
do_set_pte(vmf, page, vmf->address);
- else
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ ret = 0;
+ } else {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
ret = VM_FAULT_NOPAGE;
+ }
- update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
@@ -4725,8 +4761,16 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
- last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
+ /*
+ * For memory tiering mode, cpupid of slow memory page is used
+ * to record page access time. So use default value.
+ */
+ if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(page_nid))
+ last_cpupid = (-1 & LAST_CPUPID_MASK);
+ else
+ last_cpupid = page_cpupid_last(page);
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
if (target_nid == NUMA_NO_NODE) {
@@ -4985,7 +5029,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return VM_FAULT_OOM;
retry_pud:
if (pud_none(*vmf.pud) &&
- hugepage_vma_check(vma, vm_flags, false, true)) {
+ hugepage_vma_check(vma, vm_flags, false, true, true)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5019,7 +5063,7 @@ retry_pud:
goto retry_pud;
if (pmd_none(*vmf.pmd) &&
- hugepage_vma_check(vma, vm_flags, false, true)) {
+ hugepage_vma_check(vma, vm_flags, false, true, true)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5114,6 +5158,27 @@ static inline void mm_account_fault(struct pt_regs *regs,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
}
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+ /* the LRU algorithm doesn't apply to sequential or random reads */
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+ current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
/*
* By the time we get here, we already hold the mm semaphore
*
@@ -5145,11 +5210,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
+ lru_gen_enter_fault(vma);
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
+ lru_gen_exit_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
@@ -5637,11 +5706,11 @@ static void clear_gigantic_page(struct page *page,
unsigned int pages_per_huge_page)
{
int i;
- struct page *p = page;
+ struct page *p;
might_sleep();
- for (i = 0; i < pages_per_huge_page;
- i++, p = mem_map_next(p, page, i)) {
+ for (i = 0; i < pages_per_huge_page; i++) {
+ p = nth_page(page, i);
cond_resched();
clear_user_highpage(p, addr + i * PAGE_SIZE);
}
@@ -5677,13 +5746,12 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src,
struct page *dst_base = dst;
struct page *src_base = src;
- for (i = 0; i < pages_per_huge_page; ) {
+ for (i = 0; i < pages_per_huge_page; i++) {
+ dst = nth_page(dst_base, i);
+ src = nth_page(src_base, i);
+
cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
}
}
@@ -5730,10 +5798,10 @@ long copy_huge_page_from_user(struct page *dst_page,
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
- struct page *subpage = dst_page;
+ struct page *subpage;
- for (i = 0; i < pages_per_huge_page;
- i++, subpage = mem_map_next(subpage, dst_page, i)) {
+ for (i = 0; i < pages_per_huge_page; i++) {
+ subpage = nth_page(dst_page, i);
if (allow_pagefault)
page_kaddr = kmap(subpage);
else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fad6d1f2262a..fd40f7e9f176 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1085,8 +1085,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
* of the physical memory space for vmemmaps. That space is pageblock
* aligned.
*/
- if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(pfn, pageblock_nr_pages) ||
+ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) ||
!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
@@ -1806,8 +1805,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* of the physical memory space for vmemmaps. That space is pageblock
* aligned.
*/
- if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(start_pfn, pageblock_nr_pages) ||
+ if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) ||
!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
@@ -1940,8 +1938,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
node_states_clear_node(node, &arg);
if (arg.status_change_nid >= 0) {
- kswapd_stop(node);
kcompactd_stop(node);
+ kswapd_stop(node);
}
writeback_set_ratelimit();
@@ -1969,11 +1967,10 @@ failed_removal:
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
- int ret = !is_memblock_offlined(mem);
int *nid = arg;
*nid = mem->nid;
- if (unlikely(ret)) {
+ if (unlikely(mem->state != MEM_OFFLINE)) {
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b73d3248d976..61aa9aedb728 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -381,9 +381,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for_each_vma(vmi, vma)
mpol_rebind_policy(vma->vm_policy, new);
mmap_write_unlock(mm);
}
@@ -654,7 +655,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->vma;
+ struct vm_area_struct *next, *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
@@ -669,9 +670,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
/* hole at head side of range */
return -EFAULT;
}
+ next = find_vma(vma->vm_mm, vma->vm_end);
if (!(flags & MPOL_MF_DISCONTIG_OK) &&
((vma->vm_end < qp->end) &&
- (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ (!next || vma->vm_end < next->vm_start)))
/* hole at middle or tail of range */
return -EFAULT;
@@ -785,26 +787,29 @@ static int vma_replace_policy(struct vm_area_struct *vma,
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
- unsigned long vmstart;
- unsigned long vmend;
- vma = find_vma(mm, start);
- VM_BUG_ON(!vma);
+ prev = mas_prev(&mas, 0);
+ if (unlikely(!prev))
+ mas_set(&mas, start);
+
+ vma = mas_find(&mas, end - 1);
+ if (WARN_ON(!vma))
+ return 0;
- prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
- for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
- vmstart = max(start, vma->vm_start);
- vmend = min(end, vma->vm_end);
+ for (; vma; vma = mas_next(&mas, end - 1)) {
+ unsigned long vmstart = max(start, vma->vm_start);
+ unsigned long vmend = min(end, vma->vm_end);
if (mpol_equal(vma_policy(vma), new_pol))
- continue;
+ goto next;
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
@@ -813,6 +818,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
new_pol, vma->vm_userfaultfd_ctx,
anon_vma_name(vma));
if (prev) {
+ /* vma_merge() invalidated the mas */
+ mas_pause(&mas);
vma = prev;
goto replace;
}
@@ -820,19 +827,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
err = split_vma(vma->vm_mm, vma, vmstart, 1);
if (err)
goto out;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
if (vma->vm_end != vmend) {
err = split_vma(vma->vm_mm, vma, vmend, 0);
if (err)
goto out;
+ /* split_vma() invalidated the mas */
+ mas_pause(&mas);
}
- replace:
+replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
+next:
+ prev = vma;
}
- out:
+out:
return err;
}
@@ -853,12 +866,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
+ task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
+ task_unlock(current);
mpol_put(new);
goto out;
}
- task_lock(current);
+
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -1047,6 +1062,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
int flags)
{
nodemask_t nmask;
+ struct vm_area_struct *vma;
LIST_HEAD(pagelist);
int err = 0;
struct migration_target_control mtc = {
@@ -1062,8 +1078,9 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
* need migration. Between passing in the full user address
* space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
*/
+ vma = find_vma(mm, 0);
VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
- queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
@@ -1193,14 +1210,13 @@ static struct page *new_page(struct page *page, unsigned long start)
struct folio *dst, *src = page_folio(page);
struct vm_area_struct *vma;
unsigned long address;
+ VMA_ITERATOR(vmi, current->mm, start);
gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
- vma = find_vma(current->mm, start);
- while (vma) {
+ for_each_vma(vmi, vma) {
address = page_address_in_vma(page, vma);
if (address != -EFAULT)
break;
- vma = vma->vm_next;
}
if (folio_test_hugetlb(src))
@@ -1259,7 +1275,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (mode == MPOL_DEFAULT)
flags &= ~MPOL_MF_STRICT;
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
@@ -1478,6 +1494,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
unsigned long vmend;
unsigned long end;
int err = -ENOENT;
+ VMA_ITERATOR(vmi, mm, start);
start = untagged_addr(start);
if (start & ~PAGE_MASK)
@@ -1495,7 +1512,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
if (home_node >= MAX_NUMNODES || !node_online(home_node))
return -EINVAL;
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ len = PAGE_ALIGN(len);
end = start + len;
if (end < start)
@@ -1503,9 +1520,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le
if (end == start)
return 0;
mmap_write_lock(mm);
- vma = find_vma(mm, start);
- for (; vma && vma->vm_start < end; vma = vma->vm_next) {
-
+ for_each_vma_range(vmi, vma, end) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
new = mpol_dup(vma_policy(vma));
@@ -1803,7 +1818,7 @@ bool vma_policy_mof(struct vm_area_struct *vma)
return pol->flags & MPOL_F_MOF;
}
-static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
diff --git a/mm/memremap.c b/mm/memremap.c
index 58b20c3c300b..08cbf54fe037 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap)
int i;
percpu_ref_kill(&pgmap->ref);
- for (i = 0; i < pgmap->nr_range; i++)
- percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ for (i = 0; i < pgmap->nr_range; i++)
+ percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
+
wait_for_completion(&pgmap->done);
for (i = 0; i < pgmap->nr_range; i++)
@@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
- percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
+ if (pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ pgmap->type != MEMORY_DEVICE_COHERENT)
+ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
@@ -330,6 +335,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
WARN(1, "File system DAX not supported\n");
return ERR_PTR(-EINVAL);
}
+ params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
break;
@@ -454,7 +460,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
/* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
- if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
+ if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
@@ -502,11 +508,28 @@ void free_zone_device_page(struct page *page)
page->mapping = NULL;
page->pgmap->ops->page_free(page);
+ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE &&
+ page->pgmap->type != MEMORY_DEVICE_COHERENT)
+ /*
+ * Reset the page count to 1 to prepare for handing out the page
+ * again.
+ */
+ set_page_count(page, 1);
+ else
+ put_dev_pagemap(page->pgmap);
+}
+
+void zone_device_page_init(struct page *page)
+{
/*
- * Reset the page count to 1 to prepare for handing out the page again.
+ * Drivers shouldn't be allocating pages after calling
+ * memunmap_pages().
*/
+ WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref));
set_page_count(page, 1);
+ lock_page(page);
}
+EXPORT_SYMBOL_GPL(zone_device_page_init);
#ifdef CONFIG_FS_DAX
bool __put_devmap_managed_page_refs(struct page *page, int refs)
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a1597c92261..dff333593a8a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -50,6 +50,7 @@
#include <linux/memory.h>
#include <linux/random.h>
#include <linux/sched/sysctl.h>
+#include <linux/memory-tiers.h>
#include <asm/tlbflush.h>
@@ -198,7 +199,7 @@ static bool remove_migration_pte(struct folio *folio,
#endif
folio_get(folio);
- pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
+ pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
if (pte_swp_soft_dirty(*pvmw.pte))
pte = pte_mksoft_dirty(pte);
@@ -206,6 +207,10 @@ static bool remove_migration_pte(struct folio *folio,
* Recheck VMA as permissions can change since migration started
*/
entry = pte_to_swp_entry(*pvmw.pte);
+ if (!is_migration_entry_young(entry))
+ pte = pte_mkold(pte);
+ if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+ pte = pte_mkdirty(pte);
if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
@@ -560,6 +565,18 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
* future migrations of this same page.
*/
cpupid = page_cpupid_xchg_last(&folio->page, -1);
+ /*
+ * For memory tiering mode, when migrate between slow and fast
+ * memory node, reset cpupid, because that is used to record
+ * page access time in slow memory node.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
+ bool f_toptier = node_is_toptier(page_to_nid(&folio->page));
+ bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page));
+
+ if (f_toptier != t_toptier)
+ cpupid = -1;
+ }
page_cpupid_xchg_last(&newfolio->page, cpupid);
folio_migrate_ksm(newfolio, folio);
@@ -608,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
* Migration functions
***********************************************************/
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode, int extra_count)
+{
+ int rc;
+
+ BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
+
+ rc = folio_migrate_mapping(mapping, dst, src, extra_count);
+
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ folio_migrate_copy(dst, src);
+ else
+ folio_migrate_flags(dst, src);
+ return MIGRATEPAGE_SUCCESS;
+}
+
/**
* migrate_folio() - Simple folio migration.
* @mapping: The address_space containing the folio.
@@ -623,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy);
int migrate_folio(struct address_space *mapping, struct folio *dst,
struct folio *src, enum migrate_mode mode)
{
- int rc;
-
- BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
-
- rc = folio_migrate_mapping(mapping, dst, src, 0);
-
- if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
-
- if (mode != MIGRATE_SYNC_NO_COPY)
- folio_migrate_copy(dst, src);
- else
- folio_migrate_flags(dst, src);
- return MIGRATEPAGE_SUCCESS;
+ return migrate_folio_extra(mapping, dst, src, mode, 0);
}
EXPORT_SYMBOL(migrate_folio);
@@ -976,17 +999,15 @@ out:
return rc;
}
-static int __unmap_and_move(struct page *page, struct page *newpage,
+static int __unmap_and_move(struct folio *src, struct folio *dst,
int force, enum migrate_mode mode)
{
- struct folio *folio = page_folio(page);
- struct folio *dst = page_folio(newpage);
int rc = -EAGAIN;
bool page_was_mapped = false;
struct anon_vma *anon_vma = NULL;
- bool is_lru = !__PageMovable(page);
+ bool is_lru = !__PageMovable(&src->page);
- if (!trylock_page(page)) {
+ if (!folio_trylock(src)) {
if (!force || mode == MIGRATE_ASYNC)
goto out;
@@ -1006,10 +1027,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (current->flags & PF_MEMALLOC)
goto out;
- lock_page(page);
+ folio_lock(src);
}
- if (PageWriteback(page)) {
+ if (folio_test_writeback(src)) {
/*
* Only in the case of a full synchronous migration is it
* necessary to wait for PageWriteback. In the async case,
@@ -1026,39 +1047,39 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
if (!force)
goto out_unlock;
- wait_on_page_writeback(page);
+ folio_wait_writeback(src);
}
/*
- * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
- * we cannot notice that anon_vma is freed while we migrates a page.
+ * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
+ * we cannot notice that anon_vma is freed while we migrate a page.
* This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*
- * Only page_get_anon_vma() understands the subtleties of
+ * Only folio_get_anon_vma() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
* But if we cannot get anon_vma, then we won't need it anyway,
* because that implies that the anon page is no longer mapped
* (and cannot be remapped so long as we hold the page lock).
*/
- if (PageAnon(page) && !PageKsm(page))
- anon_vma = page_get_anon_vma(page);
+ if (folio_test_anon(src) && !folio_test_ksm(src))
+ anon_vma = folio_get_anon_vma(src);
/*
* Block others from accessing the new page when we get around to
* establishing additional references. We are usually the only one
- * holding a reference to newpage at this point. We used to have a BUG
- * here if trylock_page(newpage) fails, but would like to allow for
- * cases where there might be a race with the previous use of newpage.
+ * holding a reference to dst at this point. We used to have a BUG
+ * here if folio_trylock(dst) fails, but would like to allow for
+ * cases where there might be a race with the previous use of dst.
* This is much like races on refcount of oldpage: just don't BUG().
*/
- if (unlikely(!trylock_page(newpage)))
+ if (unlikely(!folio_trylock(dst)))
goto out_unlock;
if (unlikely(!is_lru)) {
- rc = move_to_new_folio(dst, folio, mode);
+ rc = move_to_new_folio(dst, src, mode);
goto out_unlock_both;
}
@@ -1066,7 +1087,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* Corner case handling:
* 1. When a new swap-cache page is read into, it is added to the LRU
* and treated as swapcache but it has no rmap yet.
- * Calling try_to_unmap() against a page->mapping==NULL page will
+ * Calling try_to_unmap() against a src->mapping==NULL page will
* trigger a BUG. So handle it here.
* 2. An orphaned page (see truncate_cleanup_page) might have
* fs-private metadata. The page can be picked up due to memory
@@ -1074,57 +1095,56 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* invisible to the vm, so the page can not be migrated. So try to
* free the metadata, so the page can be freed.
*/
- if (!page->mapping) {
- VM_BUG_ON_PAGE(PageAnon(page), page);
- if (page_has_private(page)) {
- try_to_free_buffers(folio);
+ if (!src->mapping) {
+ if (folio_test_private(src)) {
+ try_to_free_buffers(src);
goto out_unlock_both;
}
- } else if (page_mapped(page)) {
+ } else if (folio_mapped(src)) {
/* Establish migration ptes */
- VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
- page);
- try_to_migrate(folio, 0);
+ VM_BUG_ON_FOLIO(folio_test_anon(src) &&
+ !folio_test_ksm(src) && !anon_vma, src);
+ try_to_migrate(src, 0);
page_was_mapped = true;
}
- if (!page_mapped(page))
- rc = move_to_new_folio(dst, folio, mode);
+ if (!folio_mapped(src))
+ rc = move_to_new_folio(dst, src, mode);
/*
- * When successful, push newpage to LRU immediately: so that if it
+ * When successful, push dst to LRU immediately: so that if it
* turns out to be an mlocked page, remove_migration_ptes() will
- * automatically build up the correct newpage->mlock_count for it.
+ * automatically build up the correct dst->mlock_count for it.
*
* We would like to do something similar for the old page, when
* unsuccessful, and other cases when a page has been temporarily
* isolated from the unevictable LRU: but this case is the easiest.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- lru_cache_add(newpage);
+ folio_add_lru(dst);
if (page_was_mapped)
lru_add_drain();
}
if (page_was_mapped)
- remove_migration_ptes(folio,
- rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
+ remove_migration_ptes(src,
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
out_unlock_both:
- unlock_page(newpage);
+ folio_unlock(dst);
out_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
put_anon_vma(anon_vma);
- unlock_page(page);
+ folio_unlock(src);
out:
/*
- * If migration is successful, decrease refcount of the newpage,
+ * If migration is successful, decrease refcount of dst,
* which will not free the page because new page owner increased
* refcounter.
*/
if (rc == MIGRATEPAGE_SUCCESS)
- put_page(newpage);
+ folio_put(dst);
return rc;
}
@@ -1140,6 +1160,7 @@ static int unmap_and_move(new_page_t get_new_page,
enum migrate_reason reason,
struct list_head *ret)
{
+ struct folio *dst, *src = page_folio(page);
int rc = MIGRATEPAGE_SUCCESS;
struct page *newpage = NULL;
@@ -1157,9 +1178,10 @@ static int unmap_and_move(new_page_t get_new_page,
newpage = get_new_page(page, private);
if (!newpage)
return -ENOMEM;
+ dst = page_folio(newpage);
newpage->private = 0;
- rc = __unmap_and_move(page, newpage, force, mode);
+ rc = __unmap_and_move(src, dst, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
@@ -1244,12 +1266,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* tables or check whether the hugepage is pmd-based or not before
* kicking migration.
*/
- if (!hugepage_migration_supported(page_hstate(hpage))) {
- list_move_tail(&hpage->lru, ret);
+ if (!hugepage_migration_supported(page_hstate(hpage)))
return -ENOSYS;
- }
- if (page_count(hpage) == 1) {
+ if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
putback_active_hugepage(hpage);
return MIGRATEPAGE_SUCCESS;
@@ -1260,7 +1280,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
return -ENOMEM;
dst = page_folio(new_hpage);
- if (!trylock_page(hpage)) {
+ if (!folio_trylock(src)) {
if (!force)
goto out;
switch (mode) {
@@ -1270,29 +1290,29 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
default:
goto out;
}
- lock_page(hpage);
+ folio_lock(src);
}
/*
* Check for pages which are in the process of being freed. Without
- * page_mapping() set, hugetlbfs specific move page routine will not
+ * folio_mapping() set, hugetlbfs specific move page routine will not
* be called and we could leak usage counts for subpools.
*/
- if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
+ if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) {
rc = -EBUSY;
goto out_unlock;
}
- if (PageAnon(hpage))
- anon_vma = page_get_anon_vma(hpage);
+ if (folio_test_anon(src))
+ anon_vma = folio_get_anon_vma(src);
- if (unlikely(!trylock_page(new_hpage)))
+ if (unlikely(!folio_trylock(dst)))
goto put_anon;
- if (page_mapped(hpage)) {
+ if (folio_mapped(src)) {
enum ttu_flags ttu = 0;
- if (!PageAnon(hpage)) {
+ if (!folio_test_anon(src)) {
/*
* In shared mappings, try_to_unmap could potentially
* call huge_pmd_unshare. Because of this, take
@@ -1313,7 +1333,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
i_mmap_unlock_write(mapping);
}
- if (!page_mapped(hpage))
+ if (!folio_mapped(src))
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
@@ -1321,7 +1341,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
unlock_put_anon:
- unlock_page(new_hpage);
+ folio_unlock(dst);
put_anon:
if (anon_vma)
@@ -1333,12 +1353,12 @@ put_anon:
}
out_unlock:
- unlock_page(hpage);
+ folio_unlock(src);
out:
if (rc == MIGRATEPAGE_SUCCESS)
putback_active_hugepage(hpage);
else if (rc != -EAGAIN)
- list_move_tail(&hpage->lru, ret);
+ list_move_tail(&src->lru, ret);
/*
* If migration was not successful and there's a freeing callback, use
@@ -1353,16 +1373,15 @@ out:
return rc;
}
-static inline int try_split_thp(struct page *page, struct page **page2,
- struct list_head *from)
+static inline int try_split_thp(struct page *page, struct list_head *split_pages)
{
- int rc = 0;
+ int rc;
lock_page(page);
- rc = split_huge_page_to_list(page, from);
+ rc = split_huge_page_to_list(page, split_pages);
unlock_page(page);
if (!rc)
- list_safe_reset_next(page, *page2, lru);
+ list_move_tail(&page->lru, split_pages);
return rc;
}
@@ -1400,6 +1419,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
int thp_retry = 1;
int nr_failed = 0;
int nr_failed_pages = 0;
+ int nr_retry_pages = 0;
int nr_succeeded = 0;
int nr_thp_succeeded = 0;
int nr_thp_failed = 0;
@@ -1420,9 +1440,9 @@ thp_subpage_migration:
for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
thp_retry = 0;
+ nr_retry_pages = 0;
list_for_each_entry_safe(page, page2, from, lru) {
-retry:
/*
* THP statistics is based on the source huge page.
* Capture required information that might get lost
@@ -1447,6 +1467,7 @@ retry:
* page will be put back
* -EAGAIN: stay on the from list
* -ENOMEM: stay on the from list
+ * -ENOSYS: stay on the from list
* Other errno: put on ret_pages list then splice to
* from list
*/
@@ -1457,18 +1478,17 @@ retry:
* retry on the same page with the THP split
* to base pages.
*
- * Head page is retried immediately and tail
- * pages are added to the tail of the list so
- * we encounter them after the rest of the list
- * is processed.
+ * Sub-pages are put in thp_split_pages, and
+ * we will migrate them after the rest of the
+ * list is processed.
*/
case -ENOSYS:
/* THP migration is unsupported */
if (is_thp) {
nr_thp_failed++;
- if (!try_split_thp(page, &page2, &thp_split_pages)) {
+ if (!try_split_thp(page, &thp_split_pages)) {
nr_thp_split++;
- goto retry;
+ break;
}
/* Hugetlb migration is unsupported */
} else if (!no_subpage_counting) {
@@ -1476,24 +1496,25 @@ retry:
}
nr_failed_pages += nr_subpages;
+ list_move_tail(&page->lru, &ret_pages);
break;
case -ENOMEM:
/*
* When memory is low, don't bother to try to migrate
* other pages, just exit.
- * THP NUMA faulting doesn't split THP to retry.
*/
- if (is_thp && !nosplit) {
+ if (is_thp) {
nr_thp_failed++;
- if (!try_split_thp(page, &page2, &thp_split_pages)) {
+ /* THP NUMA faulting doesn't split THP to retry. */
+ if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
nr_thp_split++;
- goto retry;
+ break;
}
} else if (!no_subpage_counting) {
nr_failed++;
}
- nr_failed_pages += nr_subpages;
+ nr_failed_pages += nr_subpages + nr_retry_pages;
/*
* There might be some subpages of fail-to-migrate THPs
* left in thp_split_pages list. Move them back to migration
@@ -1501,13 +1522,15 @@ retry:
* the caller otherwise the page refcnt will be leaked.
*/
list_splice_init(&thp_split_pages, from);
+ /* nr_failed isn't updated for not used */
nr_thp_failed += thp_retry;
goto out;
case -EAGAIN:
if (is_thp)
thp_retry++;
- else
+ else if (!no_subpage_counting)
retry++;
+ nr_retry_pages += nr_subpages;
break;
case MIGRATEPAGE_SUCCESS:
nr_succeeded += nr_subpages;
@@ -1533,6 +1556,7 @@ retry:
}
nr_failed += retry;
nr_thp_failed += thp_retry;
+ nr_failed_pages += nr_retry_pages;
/*
* Try to migrate subpages of fail-to-migrate THPs, no nr_failed
* counting in this round, since all subpages of a THP is counted
@@ -1558,6 +1582,13 @@ out:
*/
list_splice(&ret_pages, from);
+ /*
+ * Return 0 in case all subpages of fail-to-migrate THPs are
+ * migrated successfully.
+ */
+ if (list_empty(from))
+ rc = 0;
+
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
@@ -1672,9 +1703,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
goto out;
err = -ENOENT;
- if (!page || is_zone_device_page(page))
+ if (!page)
goto out;
+ if (is_zone_device_page(page))
+ goto out_putpage;
+
err = 0;
if (page_to_nid(page) == node)
goto out_putpage;
@@ -1735,7 +1769,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node,
* well.
*/
if (err > 0)
- err += nr_pages - i - 1;
+ err += nr_pages - i;
return err;
}
return store_status(status, start, node, i - start);
@@ -1821,8 +1855,12 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
err = move_pages_and_store_status(mm, current_node, &pagelist,
status, start, i, nr_pages);
- if (err)
+ if (err) {
+ /* We have accounted for page i */
+ if (err > 0)
+ err--;
goto out;
+ }
current_node = NUMA_NO_NODE;
}
out_flush:
@@ -1848,6 +1886,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
for (i = 0; i < nr_pages; i++) {
unsigned long addr = (unsigned long)(*pages);
+ unsigned int foll_flags = FOLL_DUMP;
struct vm_area_struct *vma;
struct page *page;
int err = -EFAULT;
@@ -1856,19 +1895,26 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
if (!vma)
goto set_status;
+ /* Not all huge page follow APIs support 'FOLL_GET' */
+ if (!is_vm_hugetlb_page(vma))
+ foll_flags |= FOLL_GET;
+
/* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+ page = follow_page(vma, addr, foll_flags);
err = PTR_ERR(page);
if (IS_ERR(page))
goto set_status;
- if (page && !is_zone_device_page(page)) {
+ err = -ENOENT;
+ if (!page)
+ goto set_status;
+
+ if (!is_zone_device_page(page))
err = page_to_nid(page);
+
+ if (foll_flags & FOLL_GET)
put_page(page);
- } else {
- err = -ENOENT;
- }
set_status:
*status = err;
@@ -2170,456 +2216,4 @@ out:
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
-
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets. Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node. The
- * CPUs are placed in the node with the "fast" memory. The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- * Socket A: 0, 1, 2
- * Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1. When Node 1 fills up, it should be migrated to
- * Node 2. The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- * 0 -> 1 -> 2 -> stop
- * 3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
- * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
- * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
- * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
- * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
- * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
- *
- * Moreover some systems may have multiple slow memory nodes.
- * Suppose a system has one socket with 3 memory nodes, node 0
- * is fast memory type, and node 1/2 both are slow memory
- * type, and the distance between fast memory node and slow
- * memory node is same. So the migration path should be:
- *
- * 0 -> 1/2 -> stop
- *
- * This is represented in the node_demotion[] like this:
- * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
- * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate
- * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
- */
-
-/*
- * Writes to this array occur without locking. Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-#define DEFAULT_DEMOTION_TARGET_NODES 15
-
-#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
-#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
-#else
-#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
-#endif
-
-struct demotion_nodes {
- unsigned short nr;
- short nodes[DEMOTION_TARGET_NODES];
-};
-
-static struct demotion_nodes *node_demotion __read_mostly;
-
-/**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node to lookup the next node
- *
- * Return: node id for next memory node in the demotion path hierarchy
- * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
- * @node online or guarantee that it *continues* to be the next demotion
- * target.
- */
-int next_demotion_node(int node)
-{
- struct demotion_nodes *nd;
- unsigned short target_nr, index;
- int target;
-
- if (!node_demotion)
- return NUMA_NO_NODE;
-
- nd = &node_demotion[node];
-
- /*
- * node_demotion[] is updated without excluding this
- * function from running. RCU doesn't provide any
- * compiler barriers, so the READ_ONCE() is required
- * to avoid compiler reordering or read merging.
- *
- * Make sure to use RCU over entire code blocks if
- * node_demotion[] reads need to be consistent.
- */
- rcu_read_lock();
- target_nr = READ_ONCE(nd->nr);
-
- switch (target_nr) {
- case 0:
- target = NUMA_NO_NODE;
- goto out;
- case 1:
- index = 0;
- break;
- default:
- /*
- * If there are multiple target nodes, just select one
- * target node randomly.
- *
- * In addition, we can also use round-robin to select
- * target node, but we should introduce another variable
- * for node_demotion[] to record last selected target node,
- * that may cause cache ping-pong due to the changing of
- * last target node. Or introducing per-cpu data to avoid
- * caching issue, which seems more complicated. So selecting
- * target node randomly seems better until now.
- */
- index = get_random_int() % target_nr;
- break;
- }
-
- target = READ_ONCE(nd->nodes[index]);
-
-out:
- rcu_read_unlock();
- return target;
-}
-
-/* Disable reclaim-based migration. */
-static void __disable_all_migrate_targets(void)
-{
- int node, i;
-
- if (!node_demotion)
- return;
-
- for_each_online_node(node) {
- node_demotion[node].nr = 0;
- for (i = 0; i < DEMOTION_TARGET_NODES; i++)
- node_demotion[node].nodes[i] = NUMA_NO_NODE;
- }
-}
-
-static void disable_all_migrate_targets(void)
-{
- __disable_all_migrate_targets();
-
- /*
- * Ensure that the "disable" is visible across the system.
- * Readers will see either a combination of before+disable
- * state or disable+after. They will never see before and
- * after state together.
- *
- * The before+after state together might have cycles and
- * could cause readers to do things like loop until this
- * function finishes. This ensures they can only see a
- * single "bad" read and would, for instance, only loop
- * once.
- */
- synchronize_rcu();
-}
-
-/*
- * Find an automatic demotion target for 'node'.
- * Failing here is OK. It might just indicate
- * being at the end of a chain.
- */
-static int establish_migrate_target(int node, nodemask_t *used,
- int best_distance)
-{
- int migration_target, index, val;
- struct demotion_nodes *nd;
-
- if (!node_demotion)
- return NUMA_NO_NODE;
-
- nd = &node_demotion[node];
-
- migration_target = find_next_best_node(node, used);
- if (migration_target == NUMA_NO_NODE)
- return NUMA_NO_NODE;
-
- /*
- * If the node has been set a migration target node before,
- * which means it's the best distance between them. Still
- * check if this node can be demoted to other target nodes
- * if they have a same best distance.
- */
- if (best_distance != -1) {
- val = node_distance(node, migration_target);
- if (val > best_distance)
- goto out_clear;
- }
-
- index = nd->nr;
- if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
- "Exceeds maximum demotion target nodes\n"))
- goto out_clear;
-
- nd->nodes[index] = migration_target;
- nd->nr++;
-
- return migration_target;
-out_clear:
- node_clear(migration_target, *used);
- return NUMA_NO_NODE;
-}
-
-/*
- * When memory fills up on a node, memory contents can be
- * automatically migrated to another node instead of
- * discarded at reclaim.
- *
- * Establish a "migration path" which will start at nodes
- * with CPUs and will follow the priorities used to build the
- * page allocator zonelists.
- *
- * The difference here is that cycles must be avoided. If
- * node0 migrates to node1, then neither node1, nor anything
- * node1 migrates to can migrate to node0. Also one node can
- * be migrated to multiple nodes if the target nodes all have
- * a same best-distance against the source node.
- *
- * This function can run simultaneously with readers of
- * node_demotion[]. However, it can not run simultaneously
- * with itself. Exclusion is provided by memory hotplug events
- * being single-threaded.
- */
-static void __set_migration_target_nodes(void)
-{
- nodemask_t next_pass;
- nodemask_t this_pass;
- nodemask_t used_targets = NODE_MASK_NONE;
- int node, best_distance;
-
- /*
- * Avoid any oddities like cycles that could occur
- * from changes in the topology. This will leave
- * a momentary gap when migration is disabled.
- */
- disable_all_migrate_targets();
-
- /*
- * Allocations go close to CPUs, first. Assume that
- * the migration path starts at the nodes with CPUs.
- */
- next_pass = node_states[N_CPU];
-again:
- this_pass = next_pass;
- next_pass = NODE_MASK_NONE;
- /*
- * To avoid cycles in the migration "graph", ensure
- * that migration sources are not future targets by
- * setting them in 'used_targets'. Do this only
- * once per pass so that multiple source nodes can
- * share a target node.
- *
- * 'used_targets' will become unavailable in future
- * passes. This limits some opportunities for
- * multiple source nodes to share a destination.
- */
- nodes_or(used_targets, used_targets, this_pass);
-
- for_each_node_mask(node, this_pass) {
- best_distance = -1;
-
- /*
- * Try to set up the migration path for the node, and the target
- * migration nodes can be multiple, so doing a loop to find all
- * the target nodes if they all have a best node distance.
- */
- do {
- int target_node =
- establish_migrate_target(node, &used_targets,
- best_distance);
-
- if (target_node == NUMA_NO_NODE)
- break;
-
- if (best_distance == -1)
- best_distance = node_distance(node, target_node);
-
- /*
- * Visit targets from this pass in the next pass.
- * Eventually, every node will have been part of
- * a pass, and will become set in 'used_targets'.
- */
- node_set(target_node, next_pass);
- } while (1);
- }
- /*
- * 'next_pass' contains nodes which became migration
- * targets in this pass. Make additional passes until
- * no more migrations targets are available.
- */
- if (!nodes_empty(next_pass))
- goto again;
-}
-
-/*
- * For callers that do not hold get_online_mems() already.
- */
-void set_migration_target_nodes(void)
-{
- get_online_mems();
- __set_migration_target_nodes();
- put_online_mems();
-}
-
-/*
- * This leaves migrate-on-reclaim transiently disabled between
- * the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
- * whether reclaim-based migration is enabled or not, which
- * ensures that the user can turn reclaim-based migration at
- * any time without needing to recalculate migration targets.
- *
- * These callbacks already hold get_online_mems(). That is why
- * __set_migration_target_nodes() can be used as opposed to
- * set_migration_target_nodes().
- */
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
- unsigned long action, void *_arg)
-{
- struct memory_notify *arg = _arg;
-
- /*
- * Only update the node migration order when a node is
- * changing status, like online->offline. This avoids
- * the overhead of synchronize_rcu() in most cases.
- */
- if (arg->status_change_nid < 0)
- return notifier_from_errno(0);
-
- switch (action) {
- case MEM_GOING_OFFLINE:
- /*
- * Make sure there are not transient states where
- * an offline node is a migration target. This
- * will leave migration disabled until the offline
- * completes and the MEM_OFFLINE case below runs.
- */
- disable_all_migrate_targets();
- break;
- case MEM_OFFLINE:
- case MEM_ONLINE:
- /*
- * Recalculate the target nodes once the node
- * reaches its final state (online or offline).
- */
- __set_migration_target_nodes();
- break;
- case MEM_CANCEL_OFFLINE:
- /*
- * MEM_GOING_OFFLINE disabled all the migration
- * targets. Reenable them.
- */
- __set_migration_target_nodes();
- break;
- case MEM_GOING_ONLINE:
- case MEM_CANCEL_ONLINE:
- break;
- }
-
- return notifier_from_errno(0);
-}
-#endif
-
-void __init migrate_on_reclaim_init(void)
-{
- node_demotion = kcalloc(nr_node_ids,
- sizeof(struct demotion_nodes),
- GFP_KERNEL);
- WARN_ON(!node_demotion);
-#ifdef CONFIG_MEMORY_HOTPLUG
- hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
-#endif
- /*
- * At this point, all numa nodes with memory/CPus have their state
- * properly set, so we can build the demotion order now.
- * Let us hold the cpu_hotplug lock just, as we could possibily have
- * CPU hotplug events during boot.
- */
- cpus_read_lock();
- set_migration_target_nodes();
- cpus_read_unlock();
-}
-
-bool numa_demotion_enabled = false;
-
-#ifdef CONFIG_SYSFS
-static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sysfs_emit(buf, "%s\n",
- numa_demotion_enabled ? "true" : "false");
-}
-
-static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- ssize_t ret;
-
- ret = kstrtobool(buf, &numa_demotion_enabled);
- if (ret)
- return ret;
-
- return count;
-}
-
-static struct kobj_attribute numa_demotion_enabled_attr =
- __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
- numa_demotion_enabled_store);
-
-static struct attribute *numa_attrs[] = {
- &numa_demotion_enabled_attr.attr,
- NULL,
-};
-
-static const struct attribute_group numa_attr_group = {
- .attrs = numa_attrs,
-};
-
-static int __init numa_init_sysfs(void)
-{
- int err;
- struct kobject *numa_kobj;
-
- numa_kobj = kobject_create_and_add("numa", mm_kobj);
- if (!numa_kobj) {
- pr_err("failed to create numa kobject\n");
- return -ENOMEM;
- }
- err = sysfs_create_group(numa_kobj, &numa_attr_group);
- if (err) {
- pr_err("failed to register numa group\n");
- goto delete_obj;
- }
- return 0;
-
-delete_obj:
- kobject_put(numa_kobj);
- return err;
-}
-subsys_initcall(numa_init_sysfs);
-#endif /* CONFIG_SYSFS */
#endif /* CONFIG_NUMA */
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d65476..6fa682eef7a0 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -185,18 +186,25 @@ again:
get_page(page);
/*
- * Optimize for the common case where page is only mapped once
- * in one process. If we can lock the page, then we can safely
- * set up a special migration page table entry now.
+ * We rely on trylock_page() to avoid deadlock between
+ * concurrent migrations where each is waiting on the others
+ * page lock. If we can't immediately lock the page we fail this
+ * migration as it is only best effort anyway.
+ *
+ * If we can lock the page it's safe to set up a migration entry
+ * now. In the common case where the page is mapped once in a
+ * single process setting up the migration entry now is an
+ * optimisation to avoid walking the rmap later with
+ * try_to_migrate().
*/
if (trylock_page(page)) {
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +214,15 @@ again:
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
@@ -221,6 +233,12 @@ again:
else
entry = make_readable_migration_entry(
page_to_pfn(page));
+ if (pte_present(pte)) {
+ if (pte_young(pte))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pte))
+ entry = make_migration_entry_dirty(entry);
+ }
swp_pte = swp_entry_to_pte(entry);
if (pte_present(pte)) {
if (pte_soft_dirty(pte))
@@ -254,13 +272,14 @@ next:
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
@@ -306,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* folio_migrate_mapping(), except that here we allow migration of a
* ZONE_DEVICE page.
*/
-static bool migrate_vma_check_page(struct page *page)
+static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
{
/*
* One extra ref because caller holds an extra reference, either from
* isolate_lru_page() for a regular page, or migrate_vma_collect() for
* a device page.
*/
- int extra = 1;
+ int extra = 1 + (page == fault_page);
/*
* FIXME support THP (transparent huge page), it is bit more complex to
@@ -338,26 +357,20 @@ static bool migrate_vma_check_page(struct page *page)
}
/*
- * migrate_vma_unmap() - replace page mapping with special migration pte entry
- * @migrate: migrate struct containing all migration information
- *
- * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
- * special migration pte entry and check if it has been pinned. Pinned pages are
- * restored because we cannot migrate them.
- *
- * This is the last step before we call the device driver callback to allocate
- * destination memory and copy contents of original page over to new page.
+ * Unmaps pages for migration. Returns number of unmapped pages.
*/
-static void migrate_vma_unmap(struct migrate_vma *migrate)
+static unsigned long migrate_device_unmap(unsigned long *src_pfns,
+ unsigned long npages,
+ struct page *fault_page)
{
- const unsigned long npages = migrate->npages;
unsigned long i, restore = 0;
bool allow_drain = true;
+ unsigned long unmapped = 0;
lru_add_drain();
for (i = 0; i < npages; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
if (!page)
@@ -372,8 +385,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
}
if (isolate_lru_page(page)) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
@@ -386,34 +398,55 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
if (folio_mapped(folio))
try_to_migrate(folio, 0);
- if (page_mapped(page) || !migrate_vma_check_page(page)) {
+ if (page_mapped(page) ||
+ !migrate_vma_check_page(page, fault_page)) {
if (!is_zone_device_page(page)) {
get_page(page);
putback_lru_page(page);
}
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
- migrate->cpages--;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
+
+ unmapped++;
}
for (i = 0; i < npages && restore; i++) {
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
- if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
+ if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE))
continue;
folio = page_folio(page);
remove_migration_ptes(folio, folio, false);
- migrate->src[i] = 0;
+ src_pfns[i] = 0;
folio_unlock(folio);
folio_put(folio);
restore--;
}
+
+ return unmapped;
+}
+
+/*
+ * migrate_vma_unmap() - replace page mapping with special migration pte entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * Isolate pages from the LRU and replace mappings (CPU page table pte) with a
+ * special migration pte entry and check if it has been pinned. Pinned pages are
+ * restored because we cannot migrate them.
+ *
+ * This is the last step before we call the device driver callback to allocate
+ * destination memory and copy contents of original page over to new page.
+ */
+static void migrate_vma_unmap(struct migrate_vma *migrate)
+{
+ migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages,
+ migrate->fault_page);
}
/**
@@ -498,6 +531,8 @@ int migrate_vma_setup(struct migrate_vma *args)
return -EINVAL;
if (!args->src || !args->dst)
return -EINVAL;
+ if (args->fault_page && !is_device_private_page(args->fault_page))
+ return -EINVAL;
memset(args->src, 0, sizeof(*args->src) * nr_pages);
args->cpages = 0;
@@ -658,42 +693,38 @@ abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
-/**
- * migrate_vma_pages() - migrate meta-data from src page to dst page
- * @migrate: migrate struct containing all migration information
- *
- * This migrates struct page meta-data from source struct page to destination
- * struct page. This effectively finishes the migration from source page to the
- * destination page.
- */
-void migrate_vma_pages(struct migrate_vma *migrate)
+static void __migrate_device_pages(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages,
+ struct migrate_vma *migrate)
{
- const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
struct mmu_notifier_range range;
- unsigned long addr, i;
+ unsigned long i;
bool notified = false;
- for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ for (i = 0; i < npages; i++) {
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
int r;
if (!newpage) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
if (!page) {
+ unsigned long addr;
+
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
+ continue;
+
/*
* The only time there is no vma is when called from
* migrate_device_coherent_page(). However this isn't
* called if the page could not be unmapped.
*/
- VM_BUG_ON(!migrate->vma);
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ VM_BUG_ON(!migrate);
+ addr = migrate->start + i*PAGE_SIZE;
if (!notified) {
notified = true;
@@ -704,7 +735,7 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
- &migrate->src[i]);
+ &src_pfns[i]);
continue;
}
@@ -717,21 +748,26 @@ void migrate_vma_pages(struct migrate_vma *migrate)
* device private or coherent memory.
*/
if (mapping) {
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
} else if (is_zone_device_page(newpage)) {
/*
* Other types of ZONE_DEVICE page are not supported.
*/
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
- r = migrate_folio(mapping, page_folio(newpage),
- page_folio(page), MIGRATE_SYNC_NO_COPY);
+ if (migrate && migrate->fault_page == page)
+ r = migrate_folio_extra(mapping, page_folio(newpage),
+ page_folio(page),
+ MIGRATE_SYNC_NO_COPY, 1);
+ else
+ r = migrate_folio(mapping, page_folio(newpage),
+ page_folio(page), MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
- migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
}
/*
@@ -742,28 +778,56 @@ void migrate_vma_pages(struct migrate_vma *migrate)
if (notified)
mmu_notifier_invalidate_range_only_end(&range);
}
-EXPORT_SYMBOL(migrate_vma_pages);
/**
- * migrate_vma_finalize() - restore CPU page table entry
+ * migrate_device_pages() - migrate meta-data from src page to dst page
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
+ *
+ * Equivalent to migrate_vma_pages(). This is called to migrate struct page
+ * meta-data from source struct page to destination.
+ */
+void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
+ unsigned long npages)
+{
+ __migrate_device_pages(src_pfns, dst_pfns, npages, NULL);
+}
+EXPORT_SYMBOL(migrate_device_pages);
+
+/**
+ * migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
- * This replaces the special migration pte entry with either a mapping to the
- * new page if migration was successful for that page, or to the original page
- * otherwise.
+ * This migrates struct page meta-data from source struct page to destination
+ * struct page. This effectively finishes the migration from source page to the
+ * destination page.
+ */
+void migrate_vma_pages(struct migrate_vma *migrate)
+{
+ __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate);
+}
+EXPORT_SYMBOL(migrate_vma_pages);
+
+/*
+ * migrate_device_finalize() - complete page migration
+ * @src_pfns: src_pfns returned from migrate_device_range()
+ * @dst_pfns: array of pfns allocated by the driver to migrate memory to
+ * @npages: number of pages in the range
*
- * This also unlocks the pages and puts them back on the lru, or drops the extra
- * refcount, for device pages.
+ * Completes migration of the page by removing special migration entries.
+ * Drivers must ensure copying of page data is complete and visible to the CPU
+ * before calling this.
*/
-void migrate_vma_finalize(struct migrate_vma *migrate)
+void migrate_device_finalize(unsigned long *src_pfns,
+ unsigned long *dst_pfns, unsigned long npages)
{
- const unsigned long npages = migrate->npages;
unsigned long i;
for (i = 0; i < npages; i++) {
struct folio *dst, *src;
- struct page *newpage = migrate_pfn_to_page(migrate->dst[i]);
- struct page *page = migrate_pfn_to_page(migrate->src[i]);
+ struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
+ struct page *page = migrate_pfn_to_page(src_pfns[i]);
if (!page) {
if (newpage) {
@@ -773,7 +837,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
continue;
}
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
if (newpage) {
unlock_page(newpage);
put_page(newpage);
@@ -800,8 +864,72 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
}
+EXPORT_SYMBOL(migrate_device_finalize);
+
+/**
+ * migrate_vma_finalize() - restore CPU page table entry
+ * @migrate: migrate struct containing all migration information
+ *
+ * This replaces the special migration pte entry with either a mapping to the
+ * new page if migration was successful for that page, or to the original page
+ * otherwise.
+ *
+ * This also unlocks the pages and puts them back on the lru, or drops the extra
+ * refcount, for device pages.
+ */
+void migrate_vma_finalize(struct migrate_vma *migrate)
+{
+ migrate_device_finalize(migrate->src, migrate->dst, migrate->npages);
+}
EXPORT_SYMBOL(migrate_vma_finalize);
+/**
+ * migrate_device_range() - migrate device private pfns to normal memory.
+ * @src_pfns: array large enough to hold migrating source device private pfns.
+ * @start: starting pfn in the range to migrate.
+ * @npages: number of pages to migrate.
+ *
+ * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that
+ * instead of looking up pages based on virtual address mappings a range of
+ * device pfns that should be migrated to system memory is used instead.
+ *
+ * This is useful when a driver needs to free device memory but doesn't know the
+ * virtual mappings of every page that may be in device memory. For example this
+ * is often the case when a driver is being unloaded or unbound from a device.
+ *
+ * Like migrate_vma_setup() this function will take a reference and lock any
+ * migrating pages that aren't free before unmapping them. Drivers may then
+ * allocate destination pages and start copying data from the device to CPU
+ * memory before calling migrate_device_pages().
+ */
+int migrate_device_range(unsigned long *src_pfns, unsigned long start,
+ unsigned long npages)
+{
+ unsigned long i, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (!get_page_unless_zero(page)) {
+ src_pfns[i] = 0;
+ continue;
+ }
+
+ if (!trylock_page(page)) {
+ src_pfns[i] = 0;
+ put_page(page);
+ continue;
+ }
+
+ src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+ }
+
+ migrate_device_unmap(src_pfns, npages, NULL);
+
+ return 0;
+}
+EXPORT_SYMBOL(migrate_device_range);
+
/*
* Migrate a device coherent page back to normal memory. The caller should have
* a reference on page which will be copied to the new page if migration is
@@ -810,25 +938,19 @@ EXPORT_SYMBOL(migrate_vma_finalize);
int migrate_device_coherent_page(struct page *page)
{
unsigned long src_pfn, dst_pfn = 0;
- struct migrate_vma args;
struct page *dpage;
WARN_ON_ONCE(PageCompound(page));
lock_page(page);
src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
- args.src = &src_pfn;
- args.dst = &dst_pfn;
- args.cpages = 1;
- args.npages = 1;
- args.vma = NULL;
/*
* We don't have a VMA and don't need to walk the page tables to find
* the source page. So call migrate_vma_unmap() directly to unmap the
* page as migrate_vma_setup() will fail if args.vma == NULL.
*/
- migrate_vma_unmap(&args);
+ migrate_device_unmap(&src_pfn, 1, NULL);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
return -EBUSY;
@@ -838,10 +960,10 @@ int migrate_device_coherent_page(struct page *page)
dst_pfn = migrate_pfn(page_to_pfn(dpage));
}
- migrate_vma_pages(&args);
+ migrate_device_pages(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
copy_highpage(dpage, page);
- migrate_vma_finalize(&args);
+ migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
return 0;
diff --git a/mm/mlock.c b/mm/mlock.c
index b14e929084cc..7032f6dd0ce1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
unsigned long nstart, end, tmp;
struct vm_area_struct *vma, *prev;
int error;
+ MA_STATE(mas, &current->mm->mm_mt, start, start);
VM_BUG_ON(offset_in_page(start));
VM_BUG_ON(len != PAGE_ALIGN(len));
@@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
return -EINVAL;
if (end == start)
return 0;
- vma = find_vma(current->mm, start);
- if (!vma || vma->vm_start > start)
+ vma = mas_walk(&mas);
+ if (!vma)
return -ENOMEM;
- prev = vma->vm_prev;
if (start > vma->vm_start)
prev = vma;
+ else
+ prev = mas_prev(&mas, 0);
for (nstart = start ; ; ) {
vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
@@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
if (nstart >= end)
break;
- vma = prev->vm_next;
+ vma = find_vma(prev->vm_mm, prev->vm_end);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
@@ -526,24 +528,21 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long count = 0;
+ unsigned long end;
+ VMA_ITERATOR(vmi, mm, start);
- if (mm == NULL)
- mm = current->mm;
+ /* Don't overflow past ULONG_MAX */
+ if (unlikely(ULONG_MAX - len < start))
+ end = ULONG_MAX;
+ else
+ end = start + len;
- vma = find_vma(mm, start);
- if (vma == NULL)
- return 0;
-
- for (; vma ; vma = vma->vm_next) {
- if (start >= vma->vm_end)
- continue;
- if (start + len <= vma->vm_start)
- break;
+ for_each_vma_range(vmi, vma, end) {
if (vma->vm_flags & VM_LOCKED) {
if (start > vma->vm_start)
count -= (start - vma->vm_start);
- if (start + len < vma->vm_end) {
- count += start + len - vma->vm_start;
+ if (end < vma->vm_end) {
+ count += end - vma->vm_start;
break;
}
count += vma->vm_end - vma->vm_start;
@@ -659,6 +658,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
*/
static int apply_mlockall_flags(int flags)
{
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
@@ -679,7 +679,7 @@ static int apply_mlockall_flags(int flags)
to_add |= VM_LOCKONFAULT;
}
- for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
vm_flags_t newflags;
newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
@@ -687,6 +687,7 @@ static int apply_mlockall_flags(int flags)
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
+ mas_pause(&mas);
cond_resched();
}
out:
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 9ddaf0e1b0ab..0d7b2bd2454a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_REFS_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
diff --git a/mm/mm_slot.h b/mm/mm_slot.h
new file mode 100644
index 000000000000..83f18ed1c4bd
--- /dev/null
+++ b/mm/mm_slot.h
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _LINUX_MM_SLOT_H
+#define _LINUX_MM_SLOT_H
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+
+/*
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: link to the mm_slots hash list
+ * @mm_node: link into the mm_slots list
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+#define mm_slot_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+static inline void *mm_slot_alloc(struct kmem_cache *cache)
+{
+ if (!cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(cache, GFP_KERNEL);
+}
+
+static inline void mm_slot_free(struct kmem_cache *cache, void *objp)
+{
+ kmem_cache_free(cache, objp);
+}
+
+#define mm_slot_lookup(_hashtable, _mm) \
+({ \
+ struct mm_slot *tmp_slot, *mm_slot = NULL; \
+ \
+ hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \
+ if (_mm == tmp_slot->mm) { \
+ mm_slot = tmp_slot; \
+ break; \
+ } \
+ \
+ mm_slot; \
+})
+
+#define mm_slot_insert(_hashtable, _mm, _mm_slot) \
+({ \
+ _mm_slot->mm = _mm; \
+ hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \
+})
+
+#endif /* _LINUX_MM_SLOT_H */
diff --git a/mm/mmap.c b/mm/mmap.c
index 9d780f415be3..c3c5c1d6103d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -14,7 +14,6 @@
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
-#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -39,7 +38,6 @@
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
-#include <linux/rbtree_augmented.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
@@ -77,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
- unsigned long start, unsigned long end);
+ struct vm_area_struct *next, unsigned long start,
+ unsigned long end);
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
@@ -132,12 +131,10 @@ void unlink_file_vma(struct vm_area_struct *vma)
}
/*
- * Close a vm structure and free it, returning the next.
+ * Close a vm structure and free it.
*/
-static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+static void remove_vma(struct vm_area_struct *vma)
{
- struct vm_area_struct *next = vma->vm_next;
-
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
@@ -145,20 +142,41 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
vm_area_free(vma);
- return next;
}
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
- struct list_head *uf);
+/*
+ * check_brk_limits() - Use platform specific check of range & verify mlock
+ * limits.
+ * @addr: The address to check
+ * @len: The size of increase.
+ *
+ * Return: 0 on success.
+ */
+static int check_brk_limits(unsigned long addr, unsigned long len)
+{
+ unsigned long mapped_addr;
+
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
+
+ return mlock_future_check(current->mm, current->mm->def_flags, len);
+}
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request, unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *next;
+ struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -200,35 +218,51 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_lock to read.
+ * do_brk_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
+ /* Search one past newbrk */
+ mas_set(&mas, newbrk);
+ brkvma = mas_find(&mas, oldbrk);
+ BUG_ON(brkvma == NULL);
+ if (brkvma->vm_start >= oldbrk)
+ goto out; /* mapping intersects with an existing non-brk vma. */
/*
- * mm->brk must to be protected by write mmap_lock so update it
- * before downgrading mmap_lock. When __do_munmap() fails,
- * mm->brk will be restored from origbrk.
+ * mm->brk must be protected by write mmap_lock.
+ * do_brk_munmap() may downgrade the lock, so update it
+ * before calling do_brk_munmap().
*/
mm->brk = brk;
- ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
- if (ret < 0) {
- mm->brk = origbrk;
- goto out;
- } else if (ret == 1) {
+ ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+ if (ret == 1) {
downgraded = true;
- }
- goto success;
+ goto success;
+ } else if (!ret)
+ goto success;
+
+ mm->brk = origbrk;
+ goto out;
}
- /* Check against existing mmap mappings. */
- next = find_vma(mm, oldbrk);
+ if (check_brk_limits(oldbrk, newbrk - oldbrk))
+ goto out;
+
+ /*
+ * Only check if the next VMA is within the stack_guard_gap of the
+ * expansion area
+ */
+ mas_set(&mas, oldbrk);
+ next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
+ brkvma = mas_prev(&mas, mm->start_brk);
/* Ok, looks good - let it rip. */
- if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
+
mm->brk = brk;
success:
@@ -247,104 +281,45 @@ out:
return origbrk;
}
-static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
-{
- unsigned long gap, prev_end;
-
- /*
- * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
- * allow two stack_guard_gaps between them here, and when choosing
- * an unmapped area; whereas when expanding we only require one.
- * That's a little inconsistent, but keeps the code here simpler.
- */
- gap = vm_start_gap(vma);
- if (vma->vm_prev) {
- prev_end = vm_end_gap(vma->vm_prev);
- if (gap > prev_end)
- gap -= prev_end;
- else
- gap = 0;
- }
- return gap;
-}
-
-#ifdef CONFIG_DEBUG_VM_RB
-static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
-{
- unsigned long max = vma_compute_gap(vma), subtree_gap;
- if (vma->vm_rb.rb_left) {
- subtree_gap = rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- if (vma->vm_rb.rb_right) {
- subtree_gap = rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb)->rb_subtree_gap;
- if (subtree_gap > max)
- max = subtree_gap;
- }
- return max;
-}
-
-static int browse_rb(struct mm_struct *mm)
-{
- struct rb_root *root = &mm->mm_rb;
- int i = 0, j, bug = 0;
- struct rb_node *nd, *pn = NULL;
- unsigned long prev = 0, pend = 0;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- if (vma->vm_start < prev) {
- pr_emerg("vm_start %lx < prev %lx\n",
- vma->vm_start, prev);
- bug = 1;
- }
- if (vma->vm_start < pend) {
- pr_emerg("vm_start %lx < pend %lx\n",
- vma->vm_start, pend);
- bug = 1;
- }
- if (vma->vm_start > vma->vm_end) {
- pr_emerg("vm_start %lx > vm_end %lx\n",
- vma->vm_start, vma->vm_end);
- bug = 1;
- }
- spin_lock(&mm->page_table_lock);
- if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- pr_emerg("free gap %lx, correct %lx\n",
- vma->rb_subtree_gap,
- vma_compute_subtree_gap(vma));
- bug = 1;
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+extern void mt_validate(struct maple_tree *mt);
+extern void mt_dump(const struct maple_tree *mt);
+
+/* Validate the maple tree */
+static void validate_mm_mt(struct mm_struct *mm)
+{
+ struct maple_tree *mt = &mm->mm_mt;
+ struct vm_area_struct *vma_mt;
+
+ MA_STATE(mas, mt, 0, 0);
+
+ mt_validate(&mm->mm_mt);
+ mas_for_each(&mas, vma_mt, ULONG_MAX) {
+ if ((vma_mt->vm_start != mas.index) ||
+ (vma_mt->vm_end - 1 != mas.last)) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma_mt);
+ pr_emerg("mt piv: %p %lu - %lu\n", vma_mt,
+ mas.index, mas.last);
+ pr_emerg("mt vma: %p %lu - %lu\n", vma_mt,
+ vma_mt->vm_start, vma_mt->vm_end);
+
+ mt_dump(mas.tree);
+ if (vma_mt->vm_end != mas.last + 1) {
+ pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n",
+ mm, vma_mt->vm_start, vma_mt->vm_end,
+ mas.index, mas.last);
+ mt_dump(mas.tree);
+ }
+ VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm);
+ if (vma_mt->vm_start != mas.index) {
+ pr_err("vma: %p vma_mt %p %lu - %lu doesn't match\n",
+ mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end);
+ mt_dump(mas.tree);
+ }
+ VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm);
}
- spin_unlock(&mm->page_table_lock);
- i++;
- pn = nd;
- prev = vma->vm_start;
- pend = vma->vm_end;
- }
- j = 0;
- for (nd = pn; nd; nd = rb_prev(nd))
- j++;
- if (i != j) {
- pr_emerg("backwards %d, forwards %d\n", j, i);
- bug = 1;
- }
- return bug ? -1 : i;
-}
-
-static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
-{
- struct rb_node *nd;
-
- for (nd = rb_first(root); nd; nd = rb_next(nd)) {
- struct vm_area_struct *vma;
- vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- VM_BUG_ON_VMA(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
- vma);
}
}
@@ -352,10 +327,13 @@ static void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- unsigned long highest_address = 0;
- struct vm_area_struct *vma = mm->mmap;
+ struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
- while (vma) {
+ validate_mm_mt(mm);
+
+ mas_for_each(&mas, vma, ULONG_MAX) {
+#ifdef CONFIG_DEBUG_VM_RB
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
@@ -365,93 +343,20 @@ static void validate_mm(struct mm_struct *mm)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
-
- highest_address = vm_end_gap(vma);
- vma = vma->vm_next;
+#endif
i++;
}
if (i != mm->map_count) {
- pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
- bug = 1;
- }
- if (highest_address != mm->highest_vm_end) {
- pr_emerg("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
- bug = 1;
- }
- i = browse_rb(mm);
- if (i != mm->map_count) {
- if (i != -1)
- pr_emerg("map_count %d rb %d\n", mm->map_count, i);
+ pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i);
bug = 1;
}
VM_BUG_ON_MM(bug, mm);
}
-#else
-#define validate_mm_rb(root, ignore) do { } while (0)
-#define validate_mm(mm) do { } while (0)
-#endif
-
-RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
- struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_gap)
-
-/*
- * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
- * vma->vm_prev->vm_end values changed, without modifying the vma's position
- * in the rbtree.
- */
-static void vma_gap_update(struct vm_area_struct *vma)
-{
- /*
- * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
- * a callback function that does exactly what we want.
- */
- vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
-}
-
-static inline void vma_rb_insert(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- /* All rb_subtree_gap values must be consistent prior to insertion */
- validate_mm_rb(root, NULL);
-
- rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
-{
- /*
- * Note rb_erase_augmented is a fairly large inline function,
- * so make sure we instantiate it only once with our desired
- * augmented rbtree callbacks.
- */
- rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-}
-
-static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- struct rb_root *root,
- struct vm_area_struct *ignore)
-{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of
- *
- * a. the "next" vma being erased if next->vm_start was reduced in
- * __vma_adjust() -> __vma_unlink()
- * b. the vma being erased in detach_vmas_to_be_unmapped() ->
- * vma_rb_erase()
- */
- validate_mm_rb(root, ignore);
- __vma_rb_erase(vma, root);
-}
-
-static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
- struct rb_root *root)
-{
- vma_rb_erase_ignore(vma, root, vma);
-}
+#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
+#define validate_mm_mt(root) do { } while (0)
+#define validate_mm(mm) do { } while (0)
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
/*
* vma has some anon_vma assigned, and is already inserted on that
@@ -485,208 +390,220 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
-static int find_vma_links(struct mm_struct *mm, unsigned long addr,
- unsigned long end, struct vm_area_struct **pprev,
- struct rb_node ***rb_link, struct rb_node **rb_parent)
+static unsigned long count_vma_pages_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long end)
{
- struct rb_node **__rb_link, *__rb_parent, *rb_prev;
+ VMA_ITERATOR(vmi, mm, addr);
+ struct vm_area_struct *vma;
+ unsigned long nr_pages = 0;
- mmap_assert_locked(mm);
- __rb_link = &mm->mm_rb.rb_node;
- rb_prev = __rb_parent = NULL;
+ for_each_vma_range(vmi, vma, end) {
+ unsigned long vm_start = max(addr, vma->vm_start);
+ unsigned long vm_end = min(end, vma->vm_end);
- while (*__rb_link) {
- struct vm_area_struct *vma_tmp;
+ nr_pages += PHYS_PFN(vm_end - vm_start);
+ }
- __rb_parent = *__rb_link;
- vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
+ return nr_pages;
+}
- if (vma_tmp->vm_end > addr) {
- /* Fail if an existing vma overlaps the area */
- if (vma_tmp->vm_start < end)
- return -ENOMEM;
- __rb_link = &__rb_parent->rb_left;
- } else {
- rb_prev = __rb_parent;
- __rb_link = &__rb_parent->rb_right;
- }
- }
+static void __vma_link_file(struct vm_area_struct *vma,
+ struct address_space *mapping)
+{
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(mapping);
- *pprev = NULL;
- if (rb_prev)
- *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
- *rb_link = __rb_link;
- *rb_parent = __rb_parent;
- return 0;
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
}
/*
- * vma_next() - Get the next VMA.
- * @mm: The mm_struct.
- * @vma: The current vma.
+ * vma_mas_store() - Store a VMA in the maple tree.
+ * @vma: The vm_area_struct
+ * @mas: The maple state
*
- * If @vma is NULL, return the first vma in the mm.
+ * Efficient way to store a VMA in the maple tree when the @mas has already
+ * walked to the correct location.
*
- * Returns: The next VMA after @vma.
+ * Note: the end address is inclusive in the maple tree.
*/
-static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
- struct vm_area_struct *vma)
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
{
- if (!vma)
- return mm->mmap;
-
- return vma->vm_next;
+ trace_vma_store(mas->tree, vma);
+ mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(mas, vma);
}
/*
- * munmap_vma_range() - munmap VMAs that overlap a range.
- * @mm: The mm struct
- * @start: The start of the range.
- * @len: The length of the range.
- * @pprev: pointer to the pointer that will be set to previous vm_area_struct
- * @rb_link: the rb_node
- * @rb_parent: the parent rb_node
- *
- * Find all the vm_area_struct that overlap from @start to
- * @end and munmap them. Set @pprev to the previous vm_area_struct.
+ * vma_mas_remove() - Remove a VMA from the maple tree.
+ * @vma: The vm_area_struct
+ * @mas: The maple state
*
- * Returns: -ENOMEM on munmap failure or 0 on success.
+ * Efficient way to remove a VMA from the maple tree when the @mas has already
+ * been established and points to the correct location.
+ * Note: the end address is inclusive in the maple tree.
*/
-static inline int
-munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
- struct vm_area_struct **pprev, struct rb_node ***link,
- struct rb_node **parent, struct list_head *uf)
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
{
-
- while (find_vma_links(mm, start, start + len, pprev, link, parent))
- if (do_munmap(mm, start, len, uf))
- return -ENOMEM;
-
- return 0;
+ trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1);
+ mas->index = vma->vm_start;
+ mas->last = vma->vm_end - 1;
+ mas_store_prealloc(mas, NULL);
}
-static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
+
+/*
+ * vma_mas_szero() - Set a given range to zero. Used when modifying a
+ * vm_area_struct start or end.
+ *
+ * @mm: The struct_mm
+ * @start: The start address to zero
+ * @end: The end address to zero.
+ */
+static inline void vma_mas_szero(struct ma_state *mas, unsigned long start,
+ unsigned long end)
{
- unsigned long nr_pages = 0;
- struct vm_area_struct *vma;
+ trace_vma_mas_szero(mas->tree, start, end - 1);
+ mas_set_range(mas, start, end - 1);
+ mas_store_prealloc(mas, NULL);
+}
- /* Find first overlapping mapping */
- vma = find_vma_intersection(mm, addr, end);
- if (!vma)
- return 0;
+static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct address_space *mapping = NULL;
- nr_pages = (min(end, vma->vm_end) -
- max(addr, vma->vm_start)) >> PAGE_SHIFT;
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
- /* Iterate over the rest of the overlaps */
- for (vma = vma->vm_next; vma; vma = vma->vm_next) {
- unsigned long overlap_len;
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
+ }
- if (vma->vm_start > end)
- break;
+ vma_mas_store(vma, &mas);
- overlap_len = min(end, vma->vm_end) - vma->vm_start;
- nr_pages += overlap_len >> PAGE_SHIFT;
+ if (mapping) {
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
}
- return nr_pages;
+ mm->map_count++;
+ validate_mm(mm);
+ return 0;
}
-void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- struct rb_node **rb_link, struct rb_node *rb_parent)
+/*
+ * vma_expand - Expand an existing VMA
+ *
+ * @mas: The maple state
+ * @vma: The vma to expand
+ * @start: The start of the vma
+ * @end: The exclusive end of the vma
+ * @pgoff: The page offset of vma
+ * @next: The current of next vma.
+ *
+ * Expand @vma to @start and @end. Can expand off the start and end. Will
+ * expand over @next if it's different from @vma and @end == @next->vm_end.
+ * Checking if the @vma can expand and merge with @next needs to be handled by
+ * the caller.
+ *
+ * Returns: 0 on success
+ */
+inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff,
+ struct vm_area_struct *next)
{
- /* Update tracking information for the gap following the new vma. */
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ struct address_space *mapping = NULL;
+ struct rb_root_cached *root = NULL;
+ struct anon_vma *anon_vma = vma->anon_vma;
+ struct file *file = vma->vm_file;
+ bool remove_next = false;
- /*
- * vma->vm_prev wasn't known when we followed the rbtree to find the
- * correct insertion point for that vma. As a result, we could not
- * update the vma vm_rb parents rb_subtree_gap values on the way down.
- * So, we first insert the vma with a zero rb_subtree_gap value
- * (to be consistent with what we did on the way down), and then
- * immediately update the gap to the correct value. Finally we
- * rebalance the rbtree after all augmented values have been set.
- */
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- vma->rb_subtree_gap = 0;
- vma_gap_update(vma);
- vma_rb_insert(vma, &mm->mm_rb);
-}
+ if (next && (vma != next) && (end == next->vm_end)) {
+ remove_next = true;
+ if (next->anon_vma && !vma->anon_vma) {
+ int error;
-static void __vma_link_file(struct vm_area_struct *vma)
-{
- struct file *file;
+ anon_vma = next->anon_vma;
+ vma->anon_vma = anon_vma;
+ error = anon_vma_clone(vma, next);
+ if (error)
+ return error;
+ }
+ }
+
+ /* Not merging but overwriting any part of next is not handled. */
+ VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start);
+ /* Only handles expanding */
+ VM_BUG_ON(vma->vm_start < start || vma->vm_end > end);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ goto nomem;
+
+ vma_adjust_trans_huge(vma, start, end, 0);
- file = vma->vm_file;
if (file) {
- struct address_space *mapping = file->f_mapping;
+ mapping = file->f_mapping;
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ i_mmap_lock_write(mapping);
+ }
- if (vma->vm_flags & VM_SHARED)
- mapping_allow_writable(mapping);
+ if (anon_vma) {
+ anon_vma_lock_write(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ if (file) {
flush_dcache_mmap_lock(mapping);
- vma_interval_tree_insert(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
+ vma_interval_tree_remove(vma, root);
}
-}
-static void
-__vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- __vma_link_list(mm, vma, prev);
- __vma_link_rb(mm, vma, rb_link, rb_parent);
-}
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+ /* Note: mas must be pointing to the expanding VMA */
+ vma_mas_store(vma, mas);
-static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, struct rb_node **rb_link,
- struct rb_node *rb_parent)
-{
- struct address_space *mapping = NULL;
+ if (file) {
+ vma_interval_tree_insert(vma, root);
+ flush_dcache_mmap_unlock(mapping);
+ }
- if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
+ /* Expanding over the next vma */
+ if (remove_next && file) {
+ __remove_shared_vm_struct(next, file, mapping);
}
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- __vma_link_file(vma);
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(anon_vma);
+ }
- if (mapping)
+ if (file) {
i_mmap_unlock_write(mapping);
+ uprobe_mmap(vma);
+ }
- mm->map_count++;
- validate_mm(mm);
-}
-
-/*
- * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree. It has already been inserted into the interval tree.
- */
-static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
+ if (remove_next) {
+ if (file) {
+ uprobe_munmap(next, next->vm_start, next->vm_end);
+ fput(file);
+ }
+ if (next->anon_vma)
+ anon_vma_merge(vma, next);
+ mm->map_count--;
+ mpol_put(vma_policy(next));
+ vm_area_free(next);
+ }
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
- BUG();
- __vma_link(mm, vma, prev, rb_link, rb_parent);
- mm->map_count++;
-}
+ validate_mm(mm);
+ return 0;
-static __always_inline void __vma_unlink(struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct vm_area_struct *ignore)
-{
- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
- __vma_unlink_list(mm, vma);
- /* Kill the cache */
- vmacache_invalidate(mm);
+nomem:
+ return -ENOMEM;
}
/*
@@ -701,18 +618,20 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
+ struct vm_area_struct *next_next = NULL; /* uninit var warning */
+ struct vm_area_struct *next = find_vma(mm, vma->vm_end);
+ struct vm_area_struct *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root_cached *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
- bool start_changed = false, end_changed = false;
+ bool vma_changed = false;
long adjust_next = 0;
int remove_next = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ struct vm_area_struct *exporter = NULL, *importer = NULL;
if (next && !insert) {
- struct vm_area_struct *exporter = NULL, *importer = NULL;
-
if (end >= next->vm_end) {
/*
* vma expands, overlapping all the next, and
@@ -741,10 +660,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* remove_next == 1 is case 1 or 7.
*/
remove_next = 1 + (end > next->vm_end);
+ if (remove_next == 2)
+ next_next = find_vma(mm, next->vm_end);
+
VM_WARN_ON(remove_next == 2 &&
- end != next->vm_next->vm_end);
- /* trim end to next, for case 6 first pass */
- end = next->vm_end;
+ end != next_next->vm_end);
}
exporter = next;
@@ -755,7 +675,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* next, if the vma overlaps with it.
*/
if (remove_next == 2 && !next->anon_vma)
- exporter = next->vm_next;
+ exporter = next_next;
} else if (end > next->vm_start) {
/*
@@ -792,9 +712,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
return error;
}
}
-again:
- vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
+ vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
if (file) {
mapping = file->f_mapping;
root = &mapping->i_mmap;
@@ -804,14 +726,14 @@ again:
uprobe_munmap(next, next->vm_start, next->vm_end);
i_mmap_lock_write(mapping);
- if (insert) {
+ if (insert && insert->vm_file) {
/*
* Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
*/
- __vma_link_file(insert);
+ __vma_link_file(insert, insert->vm_file->f_mapping);
}
}
@@ -835,17 +757,37 @@ again:
}
if (start != vma->vm_start) {
+ if ((vma->vm_start < start) &&
+ (!insert || (insert->vm_end != start))) {
+ vma_mas_szero(&mas, vma->vm_start, start);
+ VM_WARN_ON(insert && insert->vm_start > vma->vm_start);
+ } else {
+ vma_changed = true;
+ }
vma->vm_start = start;
- start_changed = true;
}
if (end != vma->vm_end) {
+ if (vma->vm_end > end) {
+ if (!insert || (insert->vm_start != end)) {
+ vma_mas_szero(&mas, end, vma->vm_end);
+ mas_reset(&mas);
+ VM_WARN_ON(insert &&
+ insert->vm_end < vma->vm_end);
+ }
+ } else {
+ vma_changed = true;
+ }
vma->vm_end = end;
- end_changed = true;
}
+
+ if (vma_changed)
+ vma_mas_store(vma, &mas);
+
vma->vm_pgoff = pgoff;
if (adjust_next) {
next->vm_start += adjust_next;
next->vm_pgoff += adjust_next >> PAGE_SHIFT;
+ vma_mas_store(next, &mas);
}
if (file) {
@@ -855,42 +797,19 @@ again:
flush_dcache_mmap_unlock(mapping);
}
- if (remove_next) {
- /*
- * vma_merge has merged next into vma, and needs
- * us to remove next before dropping the locks.
- */
- if (remove_next != 3)
- __vma_unlink(mm, next, next);
- else
- /*
- * vma is not before next if they've been
- * swapped.
- *
- * pre-swap() next->vm_start was reduced so
- * tell validate_mm_rb to ignore pre-swap()
- * "next" (which is stored in post-swap()
- * "vma").
- */
- __vma_unlink(mm, next, vma);
- if (file)
- __remove_shared_vm_struct(next, file, mapping);
+ if (remove_next && file) {
+ __remove_shared_vm_struct(next, file, mapping);
+ if (remove_next == 2)
+ __remove_shared_vm_struct(next_next, file, mapping);
} else if (insert) {
/*
* split_vma has split insert from vma, and needs
* us to insert it before dropping the locks
* (it may either follow vma or precede it).
*/
- __insert_vm_struct(mm, insert);
- } else {
- if (start_changed)
- vma_gap_update(vma);
- if (end_changed) {
- if (!next)
- mm->highest_vm_end = vm_end_gap(vma);
- else if (!adjust_next)
- vma_gap_update(next);
- }
+ mas_reset(&mas);
+ vma_mas_store(insert, &mas);
+ mm->map_count++;
}
if (anon_vma) {
@@ -909,6 +828,7 @@ again:
}
if (remove_next) {
+again:
if (file) {
uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
@@ -917,66 +837,24 @@ again:
anon_vma_merge(vma, next);
mm->map_count--;
mpol_put(vma_policy(next));
+ if (remove_next != 2)
+ BUG_ON(vma->vm_end < next->vm_end);
vm_area_free(next);
+
/*
* In mprotect's case 6 (see comments on vma_merge),
- * we must remove another next too. It would clutter
- * up the code too much to do both in one go.
+ * we must remove next_next too.
*/
- if (remove_next != 3) {
- /*
- * If "next" was removed and vma->vm_end was
- * expanded (up) over it, in turn
- * "next->vm_prev->vm_end" changed and the
- * "vma->vm_next" gap must be updated.
- */
- next = vma->vm_next;
- } else {
- /*
- * For the scope of the comment "next" and
- * "vma" considered pre-swap(): if "vma" was
- * removed, next->vm_start was expanded (down)
- * over it and the "next" gap must be updated.
- * Because of the swap() the post-swap() "vma"
- * actually points to pre-swap() "next"
- * (post-swap() "next" as opposed is now a
- * dangling pointer).
- */
- next = vma;
- }
if (remove_next == 2) {
remove_next = 1;
- end = next->vm_end;
+ next = next_next;
goto again;
}
- else if (next)
- vma_gap_update(next);
- else {
- /*
- * If remove_next == 2 we obviously can't
- * reach this path.
- *
- * If remove_next == 3 we can't reach this
- * path because pre-swap() next is always not
- * NULL. pre-swap() "next" is not being
- * removed and its next->vm_end is not altered
- * (and furthermore "end" already matches
- * next->vm_end in remove_next == 3).
- *
- * We reach this only in the remove_next == 1
- * case if the "next" vma that was removed was
- * the highest vma of the mm. However in such
- * case next->vm_end == "end" and the extended
- * "vma" has vma->vm_end == next->vm_end so
- * mm->highest_vm_end doesn't need any update
- * in remove_next == 1 case.
- */
- VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
- }
}
if (insert && file)
uprobe_mmap(insert);
+ mas_destroy(&mas);
validate_mm(mm);
return 0;
@@ -1128,8 +1006,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- struct vm_area_struct *area, *next;
- int err;
+ struct vm_area_struct *mid, *next, *res;
+ int err = -1;
+ bool merge_prev = false;
+ bool merge_next = false;
/*
* We later require that vma->vm_flags == vm_flags,
@@ -1138,76 +1018,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- next = vma_next(mm, prev);
- area = next;
- if (area && area->vm_end == end) /* cases 6, 7, 8 */
- next = next->vm_next;
+ next = find_vma(mm, prev ? prev->vm_end : 0);
+ mid = next;
+ if (next && next->vm_end == end) /* cases 6, 7, 8 */
+ next = find_vma(mm, next->vm_end);
/* verify some invariant that must be enforced by the caller */
VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(area && end > area->vm_end);
+ VM_WARN_ON(mid && end > mid->vm_end);
VM_WARN_ON(addr >= end);
- /*
- * Can it merge with the predecessor?
- */
+ /* Can we merge the predecessor? */
if (prev && prev->vm_end == addr &&
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx, anon_name)) {
- /*
- * OK, it can. Can we now merge in the successor as well?
- */
- if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags,
- anon_vma, file,
- pgoff+pglen,
- vm_userfaultfd_ctx, anon_name) &&
- is_mergeable_anon_vma(prev->anon_vma,
- next->anon_vma, NULL)) {
- /* cases 1, 6 */
- err = __vma_adjust(prev, prev->vm_start,
- next->vm_end, prev->vm_pgoff, NULL,
- prev);
- } else /* cases 2, 5, 7 */
- err = __vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL, prev);
- if (err)
- return NULL;
- khugepaged_enter_vma(prev, vm_flags);
- return prev;
+ merge_prev = true;
}
-
- /*
- * Can this new request be merged in front of next?
- */
+ /* Can we merge the successor? */
if (next && end == next->vm_start &&
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx, anon_name)) {
+ merge_next = true;
+ }
+ /* Can we merge both the predecessor and the successor? */
+ if (merge_prev && merge_next &&
+ is_mergeable_anon_vma(prev->anon_vma,
+ next->anon_vma, NULL)) { /* cases 1, 6 */
+ err = __vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL,
+ prev);
+ res = prev;
+ } else if (merge_prev) { /* cases 2, 5, 7 */
+ err = __vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL, prev);
+ res = prev;
+ } else if (merge_next) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL, next);
- else { /* cases 3, 8 */
- err = __vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL, next);
- /*
- * In case 3 area is already equal to next and
- * this is a noop, but in case 8 "area" has
- * been removed and next was expanded over it.
- */
- area = next;
- }
- if (err)
- return NULL;
- khugepaged_enter_vma(area, vm_flags);
- return area;
+ addr, prev->vm_pgoff, NULL, next);
+ else /* cases 3, 8 */
+ err = __vma_adjust(mid, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL, next);
+ res = next;
}
- return NULL;
+ /*
+ * Cannot merge with predecessor or successor or error in __vma_adjust?
+ */
+ if (err)
+ return NULL;
+ khugepaged_enter_vma(res, vm_flags);
+ return res;
}
/*
@@ -1275,18 +1140,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
struct anon_vma *anon_vma = NULL;
+ struct vm_area_struct *prev, *next;
/* Try next first. */
- if (vma->vm_next) {
- anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ next = mas_walk(&mas);
+ if (next) {
+ anon_vma = reusable_anon_vma(next, vma, next);
if (anon_vma)
return anon_vma;
}
+ prev = mas_prev(&mas, 0);
+ VM_BUG_ON_VMA(prev != vma, vma);
+ prev = mas_prev(&mas, 0);
/* Try prev next. */
- if (vma->vm_prev)
- anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
+ if (prev)
+ anon_vma = reusable_anon_vma(prev, prev, vma);
/*
* We might reach here with anon_vma == NULL if we can't find
@@ -1375,6 +1246,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
vm_flags_t vm_flags;
int pkey = 0;
+ validate_mm(mm);
*populate = 0;
if (!len)
@@ -1678,388 +1550,63 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
-unsigned long mmap_region(struct file *file, unsigned long addr,
- unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
- struct list_head *uf)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev, *merge;
- int error;
- struct rb_node **rb_link, *rb_parent;
- unsigned long charged = 0;
-
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, addr + len);
-
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
-
- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
- return -ENOMEM;
- /*
- * Private writable mapping: check memory availability
- */
- if (accountable_mapping(file, vm_flags)) {
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
-
- /*
- * Can we just expand an old mapping?
- */
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
- /*
- * Determine the object being mapped and call the appropriate
- * specific mapper. the address has already been validated, but
- * not unmapped, but the maps are removed from the list.
- */
- vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
- goto unacct_error;
- }
-
- vma->vm_start = addr;
- vma->vm_end = addr + len;
- vma->vm_flags = vm_flags;
- vma->vm_page_prot = vm_get_page_prot(vm_flags);
- vma->vm_pgoff = pgoff;
-
- if (file) {
- if (vm_flags & VM_SHARED) {
- error = mapping_map_writable(file->f_mapping);
- if (error)
- goto free_vma;
- }
-
- vma->vm_file = get_file(file);
- error = call_mmap(file, vma);
- if (error)
- goto unmap_and_free_vma;
-
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- * Bug: If addr is changed, prev, rb_link, rb_parent should
- * be updated for vma_link()
- */
- WARN_ON_ONCE(addr != vma->vm_start);
-
- addr = vma->vm_start;
-
- /* If vm_flags changed after call_mmap(), we should try merge vma again
- * as we may succeed this time.
- */
- if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (merge) {
- /* ->mmap() can change vma->vm_file and fput the original file. So
- * fput the vma->vm_file here or we would add an extra fput for file
- * and cause general protection fault ultimately.
- */
- fput(vma->vm_file);
- vm_area_free(vma);
- vma = merge;
- /* Update vm_flags to pick up the change. */
- vm_flags = vma->vm_flags;
- goto unmap_writable;
- }
- }
-
- vm_flags = vma->vm_flags;
- } else if (vm_flags & VM_SHARED) {
- error = shmem_zero_setup(vma);
- if (error)
- goto free_vma;
- } else {
- vma_set_anonymous(vma);
- }
-
- /* Allow architectures to sanity-check the vm_flags */
- if (!arch_validate_flags(vma->vm_flags)) {
- error = -EINVAL;
- if (file)
- goto unmap_and_free_vma;
- else
- goto free_vma;
- }
-
- vma_link(mm, vma, prev, rb_link, rb_parent);
-
- /*
- * vma_merge() calls khugepaged_enter_vma() either, the below
- * call covers the non-merge case.
- */
- khugepaged_enter_vma(vma, vma->vm_flags);
-
- /* Once vma denies write, undo our temporary denial count */
-unmap_writable:
- if (file && vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
- file = vma->vm_file;
-out:
- perf_event_mmap(vma);
-
- vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
- if (vm_flags & VM_LOCKED) {
- if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current->mm))
- vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
- else
- mm->locked_vm += (len >> PAGE_SHIFT);
- }
-
- if (file)
- uprobe_mmap(vma);
-
- /*
- * New (or expanded) vma always get soft dirty status.
- * Otherwise user-space soft-dirty page tracker won't
- * be able to distinguish situation when vma area unmapped,
- * then new mapped in-place (which must be aimed as
- * a completely new data area).
- */
- vma->vm_flags |= VM_SOFTDIRTY;
-
- vma_set_page_prot(vma);
-
- return addr;
-
-unmap_and_free_vma:
- fput(vma->vm_file);
- vma->vm_file = NULL;
-
- /* Undo any partial mapping done by a device driver. */
- unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- if (vm_flags & VM_SHARED)
- mapping_unmap_writable(file->f_mapping);
-free_vma:
- vm_area_free(vma);
-unacct_error:
- if (charged)
- vm_unacct_memory(charged);
- return error;
-}
-
+/**
+ * unmapped_area() - Find an area between the low_limit and the high_limit with
+ * the correct alignment and offset, all from @info. Note: current->mm is used
+ * for the search.
+ *
+ * @info: The unmapped area information including the range (low_limit -
+ * hight_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
- /*
- * We implement the search by looking for an rbtree node that
- * immediately follows a suitable gap. That is,
- * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
- * - gap_end = vma->vm_start >= info->low_limit + length;
- * - gap_end - gap_start >= length
- */
+ unsigned long length, gap;
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /* Adjust search limits by the desired length */
- if (info->high_limit < length)
- return -ENOMEM;
- high_limit = info->high_limit - length;
-
- if (info->low_limit > high_limit)
- return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- goto check_highest;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
- goto check_highest;
-
- while (true) {
- /* Visit left subtree if it looks promising */
- gap_end = vm_start_gap(vma);
- if (gap_end >= low_limit && vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
-check_current:
- /* Check if current node has a suitable gap */
- if (gap_start > high_limit)
- return -ENOMEM;
- if (gap_end >= low_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit right subtree if it looks promising */
- if (vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- goto check_highest;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_left) {
- gap_start = vm_end_gap(vma->vm_prev);
- gap_end = vm_start_gap(vma);
- goto check_current;
- }
- }
- }
-
-check_highest:
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
- if (gap_start > high_limit)
+ if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1,
+ length))
return -ENOMEM;
-found:
- /* We found a suitable gap. Clip it with the original low_limit. */
- if (gap_start < info->low_limit)
- gap_start = info->low_limit;
-
- /* Adjust gap address to the desired alignment */
- gap_start += (info->align_offset - gap_start) & info->align_mask;
-
- VM_BUG_ON(gap_start + info->length > info->high_limit);
- VM_BUG_ON(gap_start + info->length > gap_end);
- return gap_start;
+ gap = mas.index;
+ gap += (info->align_offset - gap) & info->align_mask;
+ return gap;
}
+/**
+ * unmapped_area_topdown() - Find an area between the low_limit and the
+ * high_limit with * the correct alignment and offset at the highest available
+ * address, all from @info. Note: current->mm is used for the search.
+ *
+ * @info: The unmapped area information including the range (low_limit -
+ * hight_limit), the alignment offset and mask.
+ *
+ * Return: A memory address or -ENOMEM.
+ */
static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long length, low_limit, high_limit, gap_start, gap_end;
+ unsigned long length, gap;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
return -ENOMEM;
- /*
- * Adjust search limits by the desired length.
- * See implementation comment at top of unmapped_area().
- */
- gap_end = info->high_limit;
- if (gap_end < length)
- return -ENOMEM;
- high_limit = gap_end - length;
-
- if (info->low_limit > high_limit)
- return -ENOMEM;
- low_limit = info->low_limit + length;
-
- /* Check highest gap, which does not precede any rbtree node */
- gap_start = mm->highest_vm_end;
- if (gap_start <= high_limit)
- goto found_highest;
-
- /* Check if rbtree root looks promising */
- if (RB_EMPTY_ROOT(&mm->mm_rb))
- return -ENOMEM;
- vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
- if (vma->rb_subtree_gap < length)
+ if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1,
+ length))
return -ENOMEM;
- while (true) {
- /* Visit right subtree if it looks promising */
- gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
- if (gap_start <= high_limit && vma->vm_rb.rb_right) {
- struct vm_area_struct *right =
- rb_entry(vma->vm_rb.rb_right,
- struct vm_area_struct, vm_rb);
- if (right->rb_subtree_gap >= length) {
- vma = right;
- continue;
- }
- }
-
-check_current:
- /* Check if current node has a suitable gap */
- gap_end = vm_start_gap(vma);
- if (gap_end < low_limit)
- return -ENOMEM;
- if (gap_start <= high_limit &&
- gap_end > gap_start && gap_end - gap_start >= length)
- goto found;
-
- /* Visit left subtree if it looks promising */
- if (vma->vm_rb.rb_left) {
- struct vm_area_struct *left =
- rb_entry(vma->vm_rb.rb_left,
- struct vm_area_struct, vm_rb);
- if (left->rb_subtree_gap >= length) {
- vma = left;
- continue;
- }
- }
-
- /* Go back up the rbtree to find next candidate node */
- while (true) {
- struct rb_node *prev = &vma->vm_rb;
- if (!rb_parent(prev))
- return -ENOMEM;
- vma = rb_entry(rb_parent(prev),
- struct vm_area_struct, vm_rb);
- if (prev == vma->vm_rb.rb_right) {
- gap_start = vma->vm_prev ?
- vm_end_gap(vma->vm_prev) : 0;
- goto check_current;
- }
- }
- }
-
-found:
- /* We found a suitable gap. Clip it with the original high_limit. */
- if (gap_end > info->high_limit)
- gap_end = info->high_limit;
-
-found_highest:
- /* Compute highest gap address at the desired alignment */
- gap_end -= info->length;
- gap_end -= (gap_end - info->align_offset) & info->align_mask;
-
- VM_BUG_ON(gap_end < info->low_limit);
- VM_BUG_ON(gap_end < gap_start);
- return gap_end;
+ gap = mas.last + 1 - info->length;
+ gap -= (gap - info->align_offset) & info->align_mask;
+ return gap;
}
/*
@@ -2232,6 +1779,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ /* Ensures that larger anonymous mappings are THP aligned. */
+ get_area = thp_get_unmapped_area;
}
addr = get_area(file, addr, len, pgoff, flags);
@@ -2249,58 +1799,67 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
EXPORT_SYMBOL(get_unmapped_area);
-/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+/**
+ * find_vma_intersection() - Look up the first VMA which intersects the interval
+ * @mm: The process address space.
+ * @start_addr: The inclusive start user address.
+ * @end_addr: The exclusive end user address.
+ *
+ * Returns: The first VMA within the provided range, %NULL otherwise. Assumes
+ * start_addr < end_addr.
+ */
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
{
- struct rb_node *rb_node;
- struct vm_area_struct *vma;
+ unsigned long index = start_addr;
mmap_assert_locked(mm);
- /* Check the cache first. */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- rb_node = mm->mm_rb.rb_node;
-
- while (rb_node) {
- struct vm_area_struct *tmp;
-
- tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
- if (tmp->vm_end > addr) {
- vma = tmp;
- if (tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
+/**
+ * find_vma() - Find the VMA for a given address, or the next VMA.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ *
+ * Returns: The VMA associated with addr, or the next VMA.
+ * May return %NULL in the case of no VMA at addr or above.
+ */
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+ unsigned long index = addr;
- if (vma)
- vmacache_update(addr, vma);
- return vma;
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, ULONG_MAX);
}
-
EXPORT_SYMBOL(find_vma);
-/*
- * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
+/**
+ * find_vma_prev() - Find the VMA for a given address, or the next vma and
+ * set %pprev to the previous VMA, if any.
+ * @mm: The mm_struct to check
+ * @addr: The address
+ * @pprev: The pointer to set to the previous VMA
+ *
+ * Note that RCU lock is missing here since the external mmap_lock() is used
+ * instead.
+ *
+ * Returns: The VMA associated with @addr, or the next vma.
+ * May return %NULL in the case of no vma at addr or above.
*/
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- vma = find_vma(mm, addr);
- if (vma) {
- *pprev = vma->vm_prev;
- } else {
- struct rb_node *rb_node = rb_last(&mm->mm_rb);
-
- *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
- }
+ vma = mas_walk(&mas);
+ *pprev = mas_prev(&mas, 0);
+ if (!vma)
+ vma = mas_next(&mas, ULONG_MAX);
return vma;
}
@@ -2354,6 +1913,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
@@ -2371,16 +1931,21 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (gap_addr < address || gap_addr > TASK_SIZE)
gap_addr = TASK_SIZE;
- next = vma->vm_next;
- if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
+ next = find_vma_intersection(mm, vma->vm_end, gap_addr);
+ if (next && vma_is_accessible(next)) {
if (!(next->vm_flags & VM_GROWSUP))
return -ENOMEM;
/* Check that both stack segments have the same anon_vma? */
}
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -2401,15 +1966,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2417,11 +1980,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
+ /* Overwrite old entry in mtree. */
+ vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- if (vma->vm_next)
- vma_gap_update(vma->vm_next);
- else
- mm->highest_vm_end = vm_end_gap(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2430,7 +1991,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
- validate_mm(mm);
+ mas_destroy(&mas);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2438,10 +1999,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma is the first one with address < vma->vm_start. Have to extend vma.
*/
-int expand_downwards(struct vm_area_struct *vma,
- unsigned long address)
+int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
struct vm_area_struct *prev;
int error = 0;
@@ -2450,7 +2011,7 @@ int expand_downwards(struct vm_area_struct *vma,
return -EPERM;
/* Enforce stack_guard_gap */
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);
/* Check that both stack segments have the same anon_vma? */
if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
vma_is_accessible(prev)) {
@@ -2458,9 +2019,14 @@ int expand_downwards(struct vm_area_struct *vma,
return -ENOMEM;
}
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
/* We must make sure the anon_vma is allocated. */
- if (unlikely(anon_vma_prepare(vma)))
+ if (unlikely(anon_vma_prepare(vma))) {
+ mas_destroy(&mas);
return -ENOMEM;
+ }
/*
* vma->vm_start/vm_end cannot change under us because the caller
@@ -2481,15 +2047,13 @@ int expand_downwards(struct vm_area_struct *vma,
error = acct_stack_growth(vma, size, grow);
if (!error) {
/*
- * vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_lock
- * lock here, so we need to protect against
- * concurrent vma expansions.
- * anon_vma_lock_write() doesn't help here, as
- * we don't guarantee that all growable vmas
- * in a mm share the same root anon vma.
- * So, we reuse mm->page_table_lock to guard
- * against concurrent vma expansions.
+ * We only hold a shared mmap_lock lock here, so
+ * we need to protect against concurrent vma
+ * expansions. anon_vma_lock_write() doesn't
+ * help here, as we don't guarantee that all
+ * growable vmas in a mm share the same root
+ * anon vma. So, we reuse mm->page_table_lock
+ * to guard against concurrent vma expansions.
*/
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
@@ -2498,8 +2062,9 @@ int expand_downwards(struct vm_area_struct *vma,
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ /* Overwrite old entry in mtree. */
+ vma_mas_store(vma, &mas);
anon_vma_interval_tree_post_update_vma(vma);
- vma_gap_update(vma);
spin_unlock(&mm->page_table_lock);
perf_event_mmap(vma);
@@ -2508,7 +2073,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma(vma, vma->vm_flags);
- validate_mm(mm);
+ mas_destroy(&mas);
return error;
}
@@ -2581,25 +2146,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
EXPORT_SYMBOL_GPL(find_extend_vma);
/*
- * Ok - we have the memory areas we should free on the vma list,
- * so release them, and do the vma updates.
+ * Ok - we have the memory areas we should free on a maple tree so release them,
+ * and do the vma updates.
*
* Called with the mm semaphore held.
*/
-static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
+static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
{
unsigned long nr_accounted = 0;
+ struct vm_area_struct *vma;
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
- do {
+ mas_for_each(mas, vma, ULONG_MAX) {
long nrpages = vma_pages(vma);
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
vm_stat_account(mm, vma->vm_flags, -nrpages);
- vma = remove_vma(vma);
- } while (vma);
+ remove_vma(vma);
+ }
vm_unacct_memory(nr_accounted);
validate_mm(mm);
}
@@ -2609,67 +2175,23 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
*
* Called with the mm semaphore held.
*/
-static void unmap_region(struct mm_struct *mm,
+static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
struct vm_area_struct *vma, struct vm_area_struct *prev,
+ struct vm_area_struct *next,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
- unmap_vmas(&tlb, vma, start, end);
- free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ unmap_vmas(&tlb, mt, vma, start, end);
+ free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
}
/*
- * Create a list of vma's touched by the unmap, removing them from the mm's
- * vma list as we go..
- */
-static bool
-detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev, unsigned long end)
-{
- struct vm_area_struct **insertion_point;
- struct vm_area_struct *tail_vma = NULL;
-
- insertion_point = (prev ? &prev->vm_next : &mm->mmap);
- vma->vm_prev = NULL;
- do {
- vma_rb_erase(vma, &mm->mm_rb);
- if (vma->vm_flags & VM_LOCKED)
- mm->locked_vm -= vma_pages(vma);
- mm->map_count--;
- tail_vma = vma;
- vma = vma->vm_next;
- } while (vma && vma->vm_start < end);
- *insertion_point = vma;
- if (vma) {
- vma->vm_prev = prev;
- vma_gap_update(vma);
- } else
- mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
- tail_vma->vm_next = NULL;
-
- /* Kill the cache */
- vmacache_invalidate(mm);
-
- /*
- * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
- * VM_GROWSUP VMA. Such VMAs can change their size under
- * down_read(mmap_lock) and collide with the VMA we are about to unmap.
- */
- if (vma && (vma->vm_flags & VM_GROWSDOWN))
- return false;
- if (prev && (prev->vm_flags & VM_GROWSUP))
- return false;
- return true;
-}
-
-/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
*/
@@ -2678,6 +2200,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct vm_area_struct *new;
int err;
+ validate_mm_mt(mm);
if (vma->vm_ops && vma->vm_ops->may_split) {
err = vma->vm_ops->may_split(vma, addr);
@@ -2720,6 +2243,9 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (!err)
return 0;
+ /* Avoid vm accounting in close() operation */
+ new->vm_start = new->vm_end;
+ new->vm_pgoff = 0;
/* Clean everything up if vma_adjust failed. */
if (new->vm_ops && new->vm_ops->close)
new->vm_ops->close(new);
@@ -2730,6 +2256,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
mpol_put(vma_policy(new));
out_free_vma:
vm_area_free(new);
+ validate_mm_mt(mm);
return err;
}
@@ -2746,38 +2273,48 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return __split_vma(mm, vma, addr, new_below);
}
-/* Munmap is split into 2 main parts -- this part which finds
- * what needs doing, and the areas themselves, which do the
- * work. This now handles partial unmappings.
- * Jeremy Fitzhardinge <jeremy@goop.org>
- */
-int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf, bool downgrade)
+static inline int munmap_sidetree(struct vm_area_struct *vma,
+ struct ma_state *mas_detach)
{
- unsigned long end;
- struct vm_area_struct *vma, *prev, *last;
-
- if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
+ mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
+ if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
+ return -ENOMEM;
- len = PAGE_ALIGN(len);
- end = start + len;
- if (len == 0)
- return -EINVAL;
+ if (vma->vm_flags & VM_LOCKED)
+ vma->vm_mm->locked_vm -= vma_pages(vma);
- /*
- * arch_unmap() might do unmaps itself. It must be called
- * and finish any rbtree manipulation before this code
- * runs and also starts to manipulate the rbtree.
- */
- arch_unmap(mm, start, end);
+ return 0;
+}
- /* Find the first overlapping VMA where start < vma->vm_end */
- vma = find_vma_intersection(mm, start, end);
- if (!vma)
- return 0;
- prev = vma->vm_prev;
+/*
+ * do_mas_align_munmap() - munmap the aligned region from @start to @end.
+ * @mas: The maple_state, ideally set up to alter the correct tree location.
+ * @vma: The starting vm_area_struct
+ * @mm: The mm_struct
+ * @start: The aligned start address to munmap.
+ * @end: The aligned end address to munmap.
+ * @uf: The userfaultfd list_head
+ * @downgrade: Set to true to attempt a write downgrade of the mmap_sem
+ *
+ * If @downgrade is true, check return code for potential release of the lock.
+ */
+static int
+do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool downgrade)
+{
+ struct vm_area_struct *prev, *next = NULL;
+ struct maple_tree mt_detach;
+ int count = 0;
+ int error = -ENOMEM;
+ MA_STATE(mas_detach, &mt_detach, 0, 0);
+ mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
+ mt_set_external_lock(&mt_detach, &mm->mmap_lock);
+
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+ mas->last = end - 1;
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2785,8 +2322,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* unmapped vm_area_struct will remain in use: so lower split_vma
* places tmp vma above, and higher split_vma places tmp vma below.
*/
+
+ /* Does it split the first one? */
if (start > vma->vm_start) {
- int error;
/*
* Make sure that map_count on return from munmap() will
@@ -2794,22 +2332,61 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* its limit temporarily, to help free resources as expected.
*/
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
- return -ENOMEM;
+ goto map_count_exceeded;
+ /*
+ * mas_pause() is not needed since mas->index needs to be set
+ * differently than vma->vm_end anyways.
+ */
error = __split_vma(mm, vma, start, 0);
if (error)
- return error;
- prev = vma;
+ goto start_split_failed;
+
+ mas_set(mas, start);
+ vma = mas_walk(mas);
}
- /* Does it split the last one? */
- last = find_vma(mm, end);
- if (last && end > last->vm_start) {
- int error = __split_vma(mm, last, end, 1);
+ prev = mas_prev(mas, 0);
+ if (unlikely((!prev)))
+ mas_set(mas, start);
+
+ /*
+ * Detach a range of VMAs from the mm. Using next as a temp variable as
+ * it is always overwritten.
+ */
+ mas_for_each(mas, next, end - 1) {
+ /* Does it split the end? */
+ if (next->vm_end > end) {
+ struct vm_area_struct *split;
+
+ error = __split_vma(mm, next, end, 1);
+ if (error)
+ goto end_split_failed;
+
+ mas_set(mas, end);
+ split = mas_prev(mas, 0);
+ error = munmap_sidetree(split, &mas_detach);
+ if (error)
+ goto munmap_sidetree_failed;
+
+ count++;
+ if (vma == next)
+ vma = split;
+ break;
+ }
+ error = munmap_sidetree(next, &mas_detach);
if (error)
- return error;
+ goto munmap_sidetree_failed;
+
+ count++;
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ BUG_ON(next->vm_start < start);
+ BUG_ON(next->vm_start > end);
+#endif
}
- vma = vma_next(mm, prev);
+
+ if (!next)
+ next = mas_next(mas, ULONG_MAX);
if (unlikely(uf)) {
/*
@@ -2821,30 +2398,372 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
* split, despite we could. This is unlikely enough
* failure that it's not worth optimizing it for.
*/
- int error = userfaultfd_unmap_prep(vma, start, end, uf);
+ error = userfaultfd_unmap_prep(mm, start, end, uf);
+
if (error)
- return error;
+ goto userfaultfd_error;
+ }
+
+ /* Point of no return */
+ mas_set_range(mas, start, end - 1);
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ /* Make sure no VMAs are about to be lost. */
+ {
+ MA_STATE(test, &mt_detach, start, end - 1);
+ struct vm_area_struct *vma_mas, *vma_test;
+ int test_count = 0;
+
+ rcu_read_lock();
+ vma_test = mas_find(&test, end - 1);
+ mas_for_each(mas, vma_mas, end - 1) {
+ BUG_ON(vma_mas != vma_test);
+ test_count++;
+ vma_test = mas_next(&test, end - 1);
+ }
+ rcu_read_unlock();
+ BUG_ON(count != test_count);
+ mas_set_range(mas, start, end - 1);
+ }
+#endif
+ mas_store_prealloc(mas, NULL);
+ mm->map_count -= count;
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (downgrade) {
+ if (next && (next->vm_flags & VM_GROWSDOWN))
+ downgrade = false;
+ else if (prev && (prev->vm_flags & VM_GROWSUP))
+ downgrade = false;
+ else
+ mmap_write_downgrade(mm);
}
- /* Detach vmas from rbtree */
- if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
- downgrade = false;
+ unmap_region(mm, &mt_detach, vma, prev, next, start, end);
+ /* Statistics and freeing VMAs */
+ mas_set(&mas_detach, start);
+ remove_mt(mm, &mas_detach);
+ __mt_destroy(&mt_detach);
- if (downgrade)
- mmap_write_downgrade(mm);
- unmap_region(mm, vma, prev, start, end);
+ validate_mm(mm);
+ return downgrade ? 1 : 0;
- /* Fix up all other VM information */
- remove_vma_list(mm, vma);
+userfaultfd_error:
+munmap_sidetree_failed:
+end_split_failed:
+ __mt_destroy(&mt_detach);
+start_split_failed:
+map_count_exceeded:
+ mas_destroy(mas);
+ return error;
+}
- return downgrade ? 1 : 0;
+/*
+ * do_mas_munmap() - munmap a given range.
+ * @mas: The maple state
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length of the range to munmap
+ * @uf: The userfaultfd list_head
+ * @downgrade: set to true if the user wants to attempt to write_downgrade the
+ * mmap_sem
+ *
+ * This function takes a @mas that is either pointing to the previous VMA or set
+ * to MA_START and sets it up to remove the mapping(s). The @len will be
+ * aligned and any arch_unmap work will be preformed.
+ *
+ * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise.
+ */
+int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool downgrade)
+{
+ unsigned long end;
+ struct vm_area_struct *vma;
+
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ end = start + PAGE_ALIGN(len);
+ if (end == start)
+ return -EINVAL;
+
+ /* arch_unmap() might do unmaps itself. */
+ arch_unmap(mm, start, end);
+
+ /* Find the first overlapping VMA */
+ vma = mas_find(mas, end - 1);
+ if (!vma)
+ return 0;
+
+ return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade);
}
+/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length to be munmapped.
+ * @uf: The userfaultfd list_head
+ */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
- return __do_munmap(mm, start, len, uf, false);
+ MA_STATE(mas, &mm->mm_mt, start, start);
+
+ return do_mas_munmap(&mas, mm, start, len, uf, false);
+}
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+ unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
+ struct list_head *uf)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ struct vm_area_struct *next, *prev, *merge;
+ pgoff_t pglen = len >> PAGE_SHIFT;
+ unsigned long charged = 0;
+ unsigned long end = addr + len;
+ unsigned long merge_start = addr, merge_end = end;
+ pgoff_t vm_pgoff;
+ int error;
+ MA_STATE(mas, &mm->mm_mt, addr, end - 1);
+
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
+ unsigned long nr_pages;
+
+ /*
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
+ */
+ nr_pages = count_vma_pages_range(mm, addr, end);
+
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
+ }
+
+ /* Unmap any existing mapping in the area */
+ if (do_mas_munmap(&mas, mm, addr, len, uf, false))
+ return -ENOMEM;
+
+ /*
+ * Private writable mapping: check memory availability
+ */
+ if (accountable_mapping(file, vm_flags)) {
+ charged = len >> PAGE_SHIFT;
+ if (security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+ vm_flags |= VM_ACCOUNT;
+ }
+
+ next = mas_next(&mas, ULONG_MAX);
+ prev = mas_prev(&mas, 0);
+ if (vm_flags & VM_SPECIAL)
+ goto cannot_expand;
+
+ /* Attempt to expand an old mapping */
+ /* Check next */
+ if (next && next->vm_start == end && !vma_policy(next) &&
+ can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
+ NULL_VM_UFFD_CTX, NULL)) {
+ merge_end = next->vm_end;
+ vma = next;
+ vm_pgoff = next->vm_pgoff - pglen;
+ }
+
+ /* Check prev */
+ if (prev && prev->vm_end == addr && !vma_policy(prev) &&
+ (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
+ pgoff, vma->vm_userfaultfd_ctx, NULL) :
+ can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
+ NULL_VM_UFFD_CTX, NULL))) {
+ merge_start = prev->vm_start;
+ vma = prev;
+ vm_pgoff = prev->vm_pgoff;
+ }
+
+
+ /* Actually expand, if possible */
+ if (vma &&
+ !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) {
+ khugepaged_enter_vma(vma, vm_flags);
+ goto expanded;
+ }
+
+ mas.index = addr;
+ mas.last = end - 1;
+cannot_expand:
+ /*
+ * Determine the object being mapped and call the appropriate
+ * specific mapper. the address has already been validated, but
+ * not unmapped, but the maps are removed from the list.
+ */
+ vma = vm_area_alloc(mm);
+ if (!vma) {
+ error = -ENOMEM;
+ goto unacct_error;
+ }
+
+ vma->vm_start = addr;
+ vma->vm_end = end;
+ vma->vm_flags = vm_flags;
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
+ vma->vm_pgoff = pgoff;
+
+ if (file) {
+ if (vm_flags & VM_SHARED) {
+ error = mapping_map_writable(file->f_mapping);
+ if (error)
+ goto free_vma;
+ }
+
+ vma->vm_file = get_file(file);
+ error = call_mmap(file, vma);
+ if (error)
+ goto unmap_and_free_vma;
+
+ /*
+ * Expansion is handled above, merging is handled below.
+ * Drivers should not alter the address of the VMA.
+ */
+ if (WARN_ON((addr != vma->vm_start))) {
+ error = -EINVAL;
+ goto close_and_free_vma;
+ }
+ mas_reset(&mas);
+
+ /*
+ * If vm_flags changed after call_mmap(), we should try merge
+ * vma again as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
+ if (merge) {
+ /*
+ * ->mmap() can change vma->vm_file and fput
+ * the original file. So fput the vma->vm_file
+ * here or we would add an extra fput for file
+ * and cause general protection fault
+ * ultimately.
+ */
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags to pick up the change. */
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
+ vm_flags = vma->vm_flags;
+ } else if (vm_flags & VM_SHARED) {
+ error = shmem_zero_setup(vma);
+ if (error)
+ goto free_vma;
+ } else {
+ vma_set_anonymous(vma);
+ }
+
+ /* Allow architectures to sanity-check the vm_flags */
+ if (!arch_validate_flags(vma->vm_flags)) {
+ error = -EINVAL;
+ if (file)
+ goto close_and_free_vma;
+ else if (vma->vm_file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ error = -ENOMEM;
+ if (file)
+ goto close_and_free_vma;
+ else if (vma->vm_file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
+ if (vma->vm_file)
+ i_mmap_lock_write(vma->vm_file->f_mapping);
+
+ vma_mas_store(vma, &mas);
+ mm->map_count++;
+ if (vma->vm_file) {
+ if (vma->vm_flags & VM_SHARED)
+ mapping_allow_writable(vma->vm_file->f_mapping);
+
+ flush_dcache_mmap_lock(vma->vm_file->f_mapping);
+ vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap);
+ flush_dcache_mmap_unlock(vma->vm_file->f_mapping);
+ i_mmap_unlock_write(vma->vm_file->f_mapping);
+ }
+
+ /*
+ * vma_merge() calls khugepaged_enter_vma() either, the below
+ * call covers the non-merge case.
+ */
+ khugepaged_enter_vma(vma, vma->vm_flags);
+
+ /* Once vma denies write, undo our temporary denial count */
+unmap_writable:
+ if (file && vm_flags & VM_SHARED)
+ mapping_unmap_writable(file->f_mapping);
+ file = vma->vm_file;
+expanded:
+ perf_event_mmap(vma);
+
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
+ if (vm_flags & VM_LOCKED) {
+ if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
+ is_vm_hugetlb_page(vma) ||
+ vma == get_gate_vma(current->mm))
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ else
+ mm->locked_vm += (len >> PAGE_SHIFT);
+ }
+
+ if (file)
+ uprobe_mmap(vma);
+
+ /*
+ * New (or expanded) vma always get soft dirty status.
+ * Otherwise user-space soft-dirty page tracker won't
+ * be able to distinguish situation when vma area unmapped,
+ * then new mapped in-place (which must be aimed as
+ * a completely new data area).
+ */
+ vma->vm_flags |= VM_SOFTDIRTY;
+
+ vma_set_page_prot(vma);
+
+ validate_mm(mm);
+ return addr;
+
+close_and_free_vma:
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+unmap_and_free_vma:
+ fput(vma->vm_file);
+ vma->vm_file = NULL;
+
+ /* Undo any partial mapping done by a device driver. */
+ unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
+ if (file && (vm_flags & VM_SHARED))
+ mapping_unmap_writable(file->f_mapping);
+free_vma:
+ vm_area_free(vma);
+unacct_error:
+ if (charged)
+ vm_unacct_memory(charged);
+ validate_mm(mm);
+ return error;
}
static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
@@ -2852,11 +2771,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, start, start);
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = __do_munmap(mm, start, len, &uf, downgrade);
+ ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade);
/*
* Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
@@ -2922,11 +2842,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
goto out;
if (start + size > vma->vm_end) {
- struct vm_area_struct *next;
+ VMA_ITERATOR(vmi, mm, vma->vm_end);
+ struct vm_area_struct *next, *prev = vma;
- for (next = vma->vm_next; next; next = next->vm_next) {
+ for_each_vma_range(vmi, next, start + size) {
/* hole between vmas ? */
- if (next->vm_start != next->vm_prev->vm_end)
+ if (next->vm_start != prev->vm_end)
goto out;
if (next->vm_file != vma->vm_file)
@@ -2937,6 +2858,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (start + size <= next->vm_end)
break;
+
+ prev = next;
}
if (!next)
@@ -2966,37 +2889,53 @@ out:
}
/*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
+ * brk_munmap() - Unmap a parital vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
+ * Returns: 1 on success.
+ * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if
+ * possible.
*/
-static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf)
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- struct rb_node **rb_link, *rb_parent;
- pgoff_t pgoff = addr >> PAGE_SHIFT;
- int error;
- unsigned long mapped_addr;
-
- /* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
- return -EINVAL;
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-
- mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (IS_ERR_VALUE(mapped_addr))
- return mapped_addr;
+ struct mm_struct *mm = vma->vm_mm;
+ int ret;
- error = mlock_future_check(mm, mm->def_flags, len);
- if (error)
- return error;
+ arch_unmap(mm, newbrk, oldbrk);
+ ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true);
+ validate_mm_mt(mm);
+ return ret;
+}
- /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
- if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
- return -ENOMEM;
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len, unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
- /* Check against address space limits *after* clearing old maps... */
+ validate_mm_mt(mm);
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
@@ -3006,28 +2945,50 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- /* Can we just expand an old private anonymous mapping? */
- vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
/*
- * create a vma struct for an anonymous mapping
+ * Expand the existing vma if possible; Note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
*/
- vma = vm_area_alloc(mm);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ if (vma &&
+ (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
+ ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ return -ENOMEM;
+
+ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ vma->vm_end = addr + len;
+ vma->vm_flags |= VM_SOFTDIRTY;
+ mas_store_prealloc(mas, vma);
+
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ khugepaged_enter_vma(vma, flags);
+ goto out;
}
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto vma_alloc_fail;
+
vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
+ vma->vm_pgoff = addr >> PAGE_SHIFT;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ mm->map_count++;
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
@@ -3035,16 +2996,25 @@ out:
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
+ validate_mm(mm);
return 0;
+
+mas_store_fail:
+ vm_area_free(vma);
+vma_alloc_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
len = PAGE_ALIGN(request);
if (len < request)
@@ -3055,13 +3025,36 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_brk_flags(addr, len, flags, &uf);
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+
+ ret = check_brk_limits(addr, len);
+ if (ret)
+ goto limits_failed;
+
+ ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0);
+ if (ret)
+ goto munmap_failed;
+
+ vma = mas_prev(&mas, 0);
+ if (!vma || vma->vm_end != addr || vma_policy(vma) ||
+ !can_vma_merge_after(vma, flags, NULL, NULL,
+ addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL))
+ vma = NULL;
+
+ ret = do_brk_flags(&mas, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
+
+munmap_failed:
+limits_failed:
+ mmap_write_unlock(mm);
+ return ret;
}
EXPORT_SYMBOL(vm_brk_flags);
@@ -3077,34 +3070,19 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
+ int count = 0;
/* mm's last user has gone, and its about to be pulled down */
mmu_notifier_release(mm);
- if (unlikely(mm_is_oom_victim(mm))) {
- /*
- * Manually reap the mm to free as much memory as possible.
- * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
- * this mm from further consideration. Taking mm->mmap_lock for
- * write after setting MMF_OOM_SKIP will guarantee that the oom
- * reaper will not run on this mm again after mmap_lock is
- * dropped.
- *
- * Nothing can be holding mm->mmap_lock here and the above call
- * to mmu_notifier_release(mm) ensures mmu notifier callbacks in
- * __oom_reap_task_mm() will not block.
- */
- (void)__oom_reap_task_mm(mm);
- set_bit(MMF_OOM_SKIP, &mm->flags);
- }
-
- mmap_write_lock(mm);
+ mmap_read_lock(mm);
arch_exit_mmap(mm);
- vma = mm->mmap;
+ vma = mas_find(&mas, ULONG_MAX);
if (!vma) {
/* Can happen if dup_mmap() received an OOM */
- mmap_write_unlock(mm);
+ mmap_read_unlock(mm);
return;
}
@@ -3112,19 +3090,37 @@ void exit_mmap(struct mm_struct *mm)
flush_cache_mm(mm);
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
- /* Use -1 here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, vma, 0, -1);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
+ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
+ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX);
+ mmap_read_unlock(mm);
+
+ /*
+ * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
+ * because the memory has been already freed.
+ */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ mmap_write_lock(mm);
+ free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+ USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
- /* Walk the list again, actually closing and freeing it. */
- while (vma) {
+ /*
+ * Walk the list again, actually closing and freeing it, with preemption
+ * enabled, without holding any MM locks besides the unreachable
+ * mmap_write_lock.
+ */
+ do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- vma = remove_vma(vma);
+ remove_vma(vma);
+ count++;
cond_resched();
- }
- mm->mmap = NULL;
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+
+ BUG_ON(count != mm->map_count);
+
+ trace_exit_mmap(mm);
+ __mt_destroy(&mm->mm_mt);
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3135,14 +3131,14 @@ void exit_mmap(struct mm_struct *mm)
*/
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *prev;
- struct rb_node **rb_link, *rb_parent;
+ unsigned long charged = vma_pages(vma);
- if (find_vma_links(mm, vma->vm_start, vma->vm_end,
- &prev, &rb_link, &rb_parent))
+
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
return -ENOMEM;
+
if ((vma->vm_flags & VM_ACCOUNT) &&
- security_vm_enough_memory_mm(mm, vma_pages(vma)))
+ security_vm_enough_memory_mm(mm, charged))
return -ENOMEM;
/*
@@ -3162,7 +3158,11 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- vma_link(mm, vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, vma)) {
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
return 0;
}
@@ -3178,9 +3178,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
unsigned long vma_start = vma->vm_start;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma, *prev;
- struct rb_node **rb_link, *rb_parent;
bool faulted_in_anon_vma = true;
+ validate_mm_mt(mm);
/*
* If anonymous vma has not yet been faulted, update new pgoff
* to match new location, to increase its chance of merging.
@@ -3190,8 +3190,10 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ new_vma = find_vma_prev(mm, addr, &prev);
+ if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
+
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
@@ -3232,16 +3234,27 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
get_file(new_vma->vm_file);
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
- vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ if (vma_link(mm, new_vma))
+ goto out_vma_link;
*need_rmap_locks = false;
}
+ validate_mm_mt(mm);
return new_vma;
+out_vma_link:
+ if (new_vma->vm_ops && new_vma->vm_ops->close)
+ new_vma->vm_ops->close(new_vma);
+
+ if (new_vma->vm_file)
+ fput(new_vma->vm_file);
+
+ unlink_anon_vmas(new_vma);
out_free_mempol:
mpol_put(vma_policy(new_vma));
out_free_vma:
vm_area_free(new_vma);
out:
+ validate_mm_mt(mm);
return NULL;
}
@@ -3378,6 +3391,7 @@ static struct vm_area_struct *__install_special_mapping(
int ret;
struct vm_area_struct *vma;
+ validate_mm_mt(mm);
vma = vm_area_alloc(mm);
if (unlikely(vma == NULL))
return ERR_PTR(-ENOMEM);
@@ -3400,10 +3414,12 @@ static struct vm_area_struct *__install_special_mapping(
perf_event_mmap(vma);
+ validate_mm_mt(mm);
return vma;
out:
vm_area_free(vma);
+ validate_mm_mt(mm);
return ERR_PTR(ret);
}
@@ -3528,12 +3544,13 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
mmap_assert_write_locked(mm);
mutex_lock(&mm_all_locks_mutex);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3541,7 +3558,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3549,7 +3567,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
@@ -3608,11 +3627,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
mmap_assert_write_locked(mm);
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ mas_for_each(&mas, vma, ULONG_MAX) {
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_unlock_anon_vma(avc->anon_vma);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index a71924bd38c0..add4244e5790 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -1,6 +1,7 @@
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
+#include <linux/kmsan-checks.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
@@ -265,6 +266,15 @@ void tlb_flush_mmu(struct mmu_gather *tlb)
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
bool fullmm)
{
+ /*
+ * struct mmu_gather contains 7 1-bit fields packed into a 32-bit
+ * unsigned int value. The remaining 25 bits remain uninitialized
+ * and are never used, but KMSAN updates the origin for them in
+ * zap_pXX_range() in mm/memory.c, thus creating very long origin
+ * chains. This is technically correct, but consumes too much memory.
+ * Unpoisoning the whole structure will prevent creating such chains.
+ */
+ kmsan_unpoison_memory(tlb, sizeof(*tlb));
tlb->mm = mm;
tlb->fullmm = fullmm;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 0ae7571e35ab..68e1511be12d 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec)
* Poison its list head, so that any operations on it would crash.
*/
list_del(&lruvec->lists[LRU_UNEVICTABLE]);
+
+ lru_gen_init_lruvec(lruvec);
}
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index bc6bddd156ca..668bfaa6ed2a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -31,6 +31,7 @@
#include <linux/pgtable.h>
#include <linux/sched/sysctl.h>
#include <linux/userfaultfd_k.h>
+#include <linux/memory-tiers.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -121,6 +122,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
if (prot_numa) {
struct page *page;
int nid;
+ bool toptier;
/* Avoid TLB flush if possible */
if (pte_protnone(oldpte))
@@ -150,14 +152,19 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
nid = page_to_nid(page);
if (target_node == nid)
continue;
+ toptier = node_is_toptier(nid);
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- node_is_toptier(nid))
+ toptier)
continue;
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page,
+ jiffies_to_msecs(jiffies));
}
oldpte = ptep_modify_prot_start(vma, addr, pte);
@@ -260,6 +267,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
/*
* For file-backed mem, we need to be able to
@@ -271,6 +279,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
make_pte_marker(PTE_MARKER_UFFD_WP));
pages++;
}
+#endif
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
@@ -669,6 +678,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
struct mmu_gather tlb;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
start = untagged_addr(start);
@@ -700,7 +710,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey))
goto out;
- vma = find_vma(current->mm, start);
+ mas_set(&mas, start);
+ vma = mas_find(&mas, ULONG_MAX);
error = -ENOMEM;
if (!vma)
goto out;
@@ -726,7 +737,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (start > vma->vm_start)
prev = vma;
else
- prev = vma->vm_prev;
+ prev = mas_prev(&mas, 0);
tlb_gather_mmu(&tlb, current->mm);
for (nstart = start ; ; ) {
@@ -789,7 +800,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (nstart >= end)
break;
- vma = prev->vm_next;
+ vma = find_vma(current->mm, prev->vm_end);
if (!vma || vma->vm_start != nstart) {
error = -ENOMEM;
break;
diff --git a/mm/mremap.c b/mm/mremap.c
index b522cd0259a0..e465ffe279bb 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,6 +9,7 @@
*/
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/ksm.h>
@@ -23,6 +24,7 @@
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mempolicy.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
@@ -716,7 +718,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (excess) {
vma->vm_flags |= VM_ACCOUNT;
if (split)
- vma->vm_next->vm_flags |= VM_ACCOUNT;
+ find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT;
}
return new_addr;
@@ -866,9 +868,10 @@ out:
static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
{
unsigned long end = vma->vm_end + delta;
+
if (end < vma->vm_end) /* overflow */
return 0;
- if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+ if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
return 0;
if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
0, MAP_FIXED) & ~PAGE_MASK)
@@ -975,20 +978,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * __do_munmap does all the needed commit accounting, and
+ * do_mas_munmap does all the needed commit accounting, and
* downgrades mmap_lock to read if so directed.
*/
if (old_len >= new_len) {
int retval;
+ MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len);
- retval = __do_munmap(mm, addr+new_len, old_len - new_len,
- &uf_unmap, true);
- if (retval < 0 && old_len != new_len) {
- ret = retval;
- goto out;
+ retval = do_mas_munmap(&mas, mm, addr + new_len,
+ old_len - new_len, &uf_unmap, true);
/* Returning 1 indicates mmap_lock is downgraded to read. */
- } else if (retval == 1)
+ if (retval == 1) {
downgraded = true;
+ } else if (retval < 0 && old_len != new_len) {
+ ret = retval;
+ goto out;
+ }
+
ret = addr;
goto out;
}
@@ -1008,6 +1014,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/* can we just expand the current mapping? */
if (vma_expandable(vma, new_len - old_len)) {
long pages = (new_len - old_len) >> PAGE_SHIFT;
+ unsigned long extension_start = addr + old_len;
+ unsigned long extension_end = addr + new_len;
+ pgoff_t extension_pgoff = vma->vm_pgoff + (old_len >> PAGE_SHIFT);
if (vma->vm_flags & VM_ACCOUNT) {
if (security_vm_enough_memory_mm(mm, pages)) {
@@ -1016,8 +1025,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
}
- if (vma_adjust(vma, vma->vm_start, addr + new_len,
- vma->vm_pgoff, NULL)) {
+ /*
+ * Function vma_merge() is called on the extension we are adding to
+ * the already existing vma, vma_merge() will merge this extension with
+ * the already existing vma (expand operation itself) and possibly also
+ * with the next vma if it becomes adjacent to the expanded vma and
+ * otherwise compatible.
+ */
+ vma = vma_merge(mm, vma, extension_start, extension_end,
+ vma->vm_flags, vma->anon_vma, vma->vm_file,
+ extension_pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
+ if (!vma) {
vm_unacct_memory(pages);
ret = -ENOMEM;
goto out;
diff --git a/mm/msync.c b/mm/msync.c
index 137d1c104f3e..ac4c9bfea2e7 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
error = 0;
goto out_unlock;
}
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
}
}
out_unlock:
diff --git a/mm/nommu.c b/mm/nommu.c
index e819cbc21b39..214c70e1d059 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -19,7 +19,6 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
-#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -545,26 +544,27 @@ static void put_nommu_region(struct vm_region *region)
__put_nommu_region(region);
}
-/*
- * add a VMA into a process's mm_struct in the appropriate place in the list
- * and tree and add to the address space's page tree also if not an anonymous
- * page
- * - should be called with mm->mmap_lock held writelocked
- */
-static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas)
{
- struct vm_area_struct *pvma, *prev;
- struct address_space *mapping;
- struct rb_node **p, *parent, *rb_prev;
+ mas_set_range(mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(mas, vma);
+}
- BUG_ON(!vma->vm_region);
+void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas)
+{
+ mas->index = vma->vm_start;
+ mas->last = vma->vm_end - 1;
+ mas_store_prealloc(mas, NULL);
+}
+static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm)
+{
mm->map_count++;
vma->vm_mm = mm;
/* add the VMA to the mapping */
if (vma->vm_file) {
- mapping = vma->vm_file->f_mapping;
+ struct address_space *mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
flush_dcache_mmap_lock(mapping);
@@ -572,67 +572,51 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
- /* add the VMA to the tree */
- parent = rb_prev = NULL;
- p = &mm->mm_rb.rb_node;
- while (*p) {
- parent = *p;
- pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
-
- /* sort by: start addr, end addr, VMA struct addr in that order
- * (the latter is necessary as we may get identical VMAs) */
- if (vma->vm_start < pvma->vm_start)
- p = &(*p)->rb_left;
- else if (vma->vm_start > pvma->vm_start) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma->vm_end < pvma->vm_end)
- p = &(*p)->rb_left;
- else if (vma->vm_end > pvma->vm_end) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else if (vma < pvma)
- p = &(*p)->rb_left;
- else if (vma > pvma) {
- rb_prev = parent;
- p = &(*p)->rb_right;
- } else
- BUG();
- }
-
- rb_link_node(&vma->vm_rb, parent, p);
- rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+/*
+ * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm().
+ * @mas: The maple state with preallocations.
+ * @mm: The mm_struct
+ * @vma: The vma to add
+ *
+ */
+static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ BUG_ON(!vma->vm_region);
- /* add VMA to the VMA list also */
- prev = NULL;
- if (rb_prev)
- prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
+ setup_vma_to_mm(vma, mm);
- __vma_link_list(mm, vma, prev);
+ /* add the VMA to the tree */
+ vma_mas_store(vma, mas);
}
/*
- * delete a VMA from its owning mm_struct and address space
+ * add a VMA into a process's mm_struct in the appropriate place in the list
+ * and tree and add to the address space's page tree also if not an anonymous
+ * page
+ * - should be called with mm->mmap_lock held writelocked
*/
-static void delete_vma_from_mm(struct vm_area_struct *vma)
-{
- int i;
- struct address_space *mapping;
- struct mm_struct *mm = vma->vm_mm;
- struct task_struct *curr = current;
-
- mm->map_count--;
- for (i = 0; i < VMACACHE_SIZE; i++) {
- /* if the vma is cached, invalidate the entire cache */
- if (curr->vmacache.vmas[i] == vma) {
- vmacache_invalidate(mm);
- break;
- }
+static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
}
+ mas_add_vma_to_mm(&mas, mm, vma);
+ return 0;
+}
+static void cleanup_vma_from_mm(struct vm_area_struct *vma)
+{
+ vma->vm_mm->map_count--;
/* remove the VMA from the mapping */
if (vma->vm_file) {
+ struct address_space *mapping;
mapping = vma->vm_file->f_mapping;
i_mmap_lock_write(mapping);
@@ -641,11 +625,24 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
flush_dcache_mmap_unlock(mapping);
i_mmap_unlock_write(mapping);
}
+}
+/*
+ * delete a VMA from its owning mm_struct and address space
+ */
+static int delete_vma_from_mm(struct vm_area_struct *vma)
+{
+ MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0);
- /* remove from the MM's tree and list */
- rb_erase(&vma->vm_rb, &mm->mm_rb);
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ return -ENOMEM;
+ }
+ cleanup_vma_from_mm(vma);
- __vma_unlink_list(mm, vma);
+ /* remove from the MM's tree and list */
+ vma_mas_remove(vma, &mas);
+ return 0;
}
/*
@@ -661,31 +658,26 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
vm_area_free(vma);
}
+struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
+ unsigned long start_addr,
+ unsigned long end_addr)
+{
+ unsigned long index = start_addr;
+
+ mmap_assert_locked(mm);
+ return mt_find(&mm->mm_mt, &index, end_addr - 1);
+}
+EXPORT_SYMBOL(find_vma_intersection);
+
/*
* look up the first VMA in which addr resides, NULL if none
* - should be called with mm->mmap_lock at least held readlocked
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- /* check the cache first */
- vma = vmacache_find(mm, addr);
- if (likely(vma))
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end > addr) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
-
- return NULL;
+ return mas_walk(&mas);
}
EXPORT_SYMBOL(find_vma);
@@ -717,26 +709,17 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
{
struct vm_area_struct *vma;
unsigned long end = addr + len;
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
- /* check the cache first */
- vma = vmacache_find_exact(mm, addr, end);
- if (vma)
- return vma;
-
- /* trawl the list (there may be multiple mappings in which addr
- * resides) */
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->vm_start < addr)
- continue;
- if (vma->vm_start > addr)
- return NULL;
- if (vma->vm_end == end) {
- vmacache_update(addr, vma);
- return vma;
- }
- }
+ vma = mas_walk(&mas);
+ if (!vma)
+ return NULL;
+ if (vma->vm_start != addr)
+ return NULL;
+ if (vma->vm_end != end)
+ return NULL;
- return NULL;
+ return vma;
}
/*
@@ -1069,6 +1052,7 @@ unsigned long do_mmap(struct file *file,
vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
+ MA_STATE(mas, &current->mm->mm_mt, 0, 0);
*populate = 0;
@@ -1087,6 +1071,7 @@ unsigned long do_mmap(struct file *file,
* now know into VMA flags */
vm_flags = determine_vm_flags(file, prot, flags, capabilities);
+
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
if (!region)
@@ -1096,6 +1081,9 @@ unsigned long do_mmap(struct file *file,
if (!vma)
goto error_getting_vma;
+ if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ goto error_maple_preallocate;
+
region->vm_usage = 1;
region->vm_flags = vm_flags;
region->vm_pgoff = pgoff;
@@ -1236,7 +1224,7 @@ unsigned long do_mmap(struct file *file,
current->mm->total_vm += len >> PAGE_SHIFT;
share:
- add_vma_to_mm(current->mm, vma);
+ mas_add_vma_to_mm(&mas, current->mm, vma);
/* we flush the region from the icache only when the first executable
* mapping of it is made */
@@ -1262,6 +1250,7 @@ error:
sharing_violation:
up_write(&nommu_region_sem);
+ mas_destroy(&mas);
pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
@@ -1278,6 +1267,14 @@ error_getting_region:
len, current->pid);
show_free_areas(0, NULL);
return -ENOMEM;
+
+error_maple_preallocate:
+ kmem_cache_free(vm_region_jar, region);
+ vm_area_free(vma);
+ pr_warn("Allocation of vma tree for process %d failed\n", current->pid);
+ show_free_areas(0, NULL);
+ return -ENOMEM;
+
}
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
@@ -1343,6 +1340,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *new;
struct vm_region *region;
unsigned long npages;
+ MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end);
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
@@ -1357,9 +1355,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
return -ENOMEM;
new = vm_area_dup(vma);
- if (!new) {
- kmem_cache_free(vm_region_jar, region);
- return -ENOMEM;
+ if (!new)
+ goto err_vma_dup;
+
+ if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ pr_warn("Allocation of vma tree for process %d failed\n",
+ current->pid);
+ goto err_mas_preallocate;
}
/* most fields are the same, copy all, and then fixup */
@@ -1378,7 +1380,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
- delete_vma_from_mm(vma);
down_write(&nommu_region_sem);
delete_nommu_region(vma->vm_region);
if (new_below) {
@@ -1391,9 +1392,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
add_nommu_region(vma->vm_region);
add_nommu_region(new->vm_region);
up_write(&nommu_region_sem);
- add_vma_to_mm(mm, vma);
- add_vma_to_mm(mm, new);
+
+ setup_vma_to_mm(vma, mm);
+ setup_vma_to_mm(new, mm);
+ mas_set_range(&mas, vma->vm_start, vma->vm_end - 1);
+ mas_store(&mas, vma);
+ vma_mas_store(new, &mas);
return 0;
+
+err_mas_preallocate:
+ vm_area_free(new);
+err_vma_dup:
+ kmem_cache_free(vm_region_jar, region);
+ return -ENOMEM;
}
/*
@@ -1408,12 +1419,14 @@ static int shrink_vma(struct mm_struct *mm,
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ return -ENOMEM;
if (from > vma->vm_start)
vma->vm_end = from;
else
vma->vm_start = to;
- add_vma_to_mm(mm, vma);
+ if (add_vma_to_mm(mm, vma))
+ return -ENOMEM;
/* cut the backing region down to size */
region = vma->vm_region;
@@ -1441,9 +1454,10 @@ static int shrink_vma(struct mm_struct *mm,
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf)
{
+ MA_STATE(mas, &mm->mm_mt, start, start);
struct vm_area_struct *vma;
unsigned long end;
- int ret;
+ int ret = 0;
len = PAGE_ALIGN(len);
if (len == 0)
@@ -1452,7 +1466,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
end = start + len;
/* find the first potentially overlapping VMA */
- vma = find_vma(mm, start);
+ vma = mas_find(&mas, end - 1);
if (!vma) {
static int limit;
if (limit < 5) {
@@ -1471,7 +1485,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
return -EINVAL;
if (end == vma->vm_end)
goto erase_whole_vma;
- vma = vma->vm_next;
+ vma = mas_next(&mas, end - 1);
} while (vma);
return -EINVAL;
} else {
@@ -1493,9 +1507,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list
}
erase_whole_vma:
- delete_vma_from_mm(vma);
+ if (delete_vma_from_mm(vma))
+ ret = -ENOMEM;
delete_vma(mm, vma);
- return 0;
+ return ret;
}
int vm_munmap(unsigned long addr, size_t len)
@@ -1520,6 +1535,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
*/
void exit_mmap(struct mm_struct *mm)
{
+ VMA_ITERATOR(vmi, mm, 0);
struct vm_area_struct *vma;
if (!mm)
@@ -1527,12 +1543,18 @@ void exit_mmap(struct mm_struct *mm)
mm->total_vm = 0;
- while ((vma = mm->mmap)) {
- mm->mmap = vma->vm_next;
- delete_vma_from_mm(vma);
+ /*
+ * Lock the mm to avoid assert complaining even though this is the only
+ * user of the mm
+ */
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
+ cleanup_vma_from_mm(vma);
delete_vma(mm, vma);
cond_resched();
}
+ __mt_destroy(&mm->mm_mt);
+ mmap_write_unlock(mm);
}
int vm_brk(unsigned long addr, unsigned long len)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3c6cf9e3cd66..1276e49b31b0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -461,7 +461,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
if (is_memcg_oom(oc))
mem_cgroup_print_oom_meminfo(oc->memcg);
else {
- show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
+ __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask));
if (should_dump_unreclaim_slab())
dump_unreclaimable_slab();
}
@@ -509,10 +509,11 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
-bool __oom_reap_task_mm(struct mm_struct *mm)
+static bool __oom_reap_task_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
bool ret = true;
+ VMA_ITERATOR(vmi, mm, 0);
/*
* Tell all users of get_user/copy_from_user etc... that the content
@@ -522,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
*/
set_bit(MMF_UNSTABLE, &mm->flags);
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP))
continue;
@@ -764,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk)
return;
/* oom_mm is bound to the signal struct life time. */
- if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
+ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
mmgrab(tsk->signal->oom_mm);
- set_bit(MMF_OOM_VICTIM, &mm->flags);
- }
/*
* Make sure that the task is woken up from uninterruptible sleep
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 032a7bf8d259..7e9d8d857ecc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1933,6 +1933,7 @@ int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
wb_put(wb);
return ret;
}
+EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags);
/**
* balance_dirty_pages_ratelimited - balance dirty memory state.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e5486d47406e..218b28ee49ed 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -27,6 +27,7 @@
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
+#include <linux/kmsan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
@@ -482,6 +483,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
static unsigned long prev_end_pfn, nr_initialised;
+ if (early_page_ext_enabled())
+ return false;
/*
* prev_end_pfn static that contains the end of previous zone
* No need to protect because called very early in boot before smp_init.
@@ -542,7 +545,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
#else
- pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+ pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn);
#endif /* CONFIG_SPARSEMEM */
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
@@ -804,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx)
p->mapping = TAIL_MAPPING;
set_compound_head(p, head);
+ set_page_private(p, 0);
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -870,7 +874,8 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
INIT_LIST_HEAD(&page->buddy_list);
set_page_private(page, order);
/* Guard pages are not available for any usage */
- __mod_zone_freepage_state(zone, -(1 << order), migratetype);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -(1 << order), migratetype);
return true;
}
@@ -900,7 +905,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
* order of appearance. So we need to first gather the full picture of what was
* enabled, and then make decisions.
*/
-void init_mem_debugging_and_hardening(void)
+void __init init_mem_debugging_and_hardening(void)
{
bool page_poisoning_requested = false;
@@ -935,6 +940,10 @@ void init_mem_debugging_and_hardening(void)
else
static_branch_disable(&init_on_free);
+ if (IS_ENABLED(CONFIG_KMSAN) &&
+ (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
+ pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
@@ -1105,7 +1114,7 @@ static inline void __free_one_page(struct page *page,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
- unsigned long buddy_pfn;
+ unsigned long buddy_pfn = 0;
unsigned long combined_pfn;
struct page *buddy;
bool to_tail;
@@ -1283,20 +1292,20 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
return bad_reason;
}
-static void check_free_page_bad(struct page *page)
+static void free_page_is_bad_report(struct page *page)
{
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}
-static inline int check_free_page(struct page *page)
+static inline bool free_page_is_bad(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
- return 0;
+ return false;
/* Something has gone sideways, find it */
- check_free_page_bad(page);
- return 1;
+ free_page_is_bad_report(page);
+ return true;
}
static int free_tail_pages_check(struct page *head_page, struct page *page)
@@ -1398,6 +1407,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
+ kmsan_free_page(page, order);
if (unlikely(PageHWPoison(page)) && !order) {
/*
@@ -1428,7 +1438,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
- if (unlikely(check_free_page(page + i))) {
+ if (unlikely(free_page_is_bad(page + i))) {
bad++;
continue;
}
@@ -1439,8 +1449,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
page->mapping = NULL;
if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
- if (check_free)
- bad += check_free_page(page);
+ if (check_free && free_page_is_bad(page))
+ bad++;
if (bad)
return false;
@@ -1499,10 +1509,11 @@ static bool free_pcp_prepare(struct page *page, unsigned int order)
return free_pages_prepare(page, order, true, FPI_NONE);
}
+/* return true if this page has an inappropriate state */
static bool bulkfree_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
- return check_free_page(page);
+ return free_page_is_bad(page);
else
return false;
}
@@ -1523,7 +1534,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order)
static bool bulkfree_pcp_prepare(struct page *page)
{
- return check_free_page(page);
+ return free_page_is_bad(page);
}
#endif /* CONFIG_DEBUG_VM */
@@ -1575,7 +1586,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
order = pindex_to_order(pindex);
nr_pages = 1 << order;
- BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
do {
int mt;
@@ -1804,6 +1814,10 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
{
if (early_page_uninitialised(pfn))
return;
+ if (!kmsan_memblock_free_pages(page, order)) {
+ /* KMSAN will take care of these pages. */
+ return;
+ }
__free_pages_core(page, order);
}
@@ -1855,7 +1869,7 @@ void set_zone_contiguous(struct zone *zone)
unsigned long block_start_pfn = zone->zone_start_pfn;
unsigned long block_end_pfn;
- block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(block_start_pfn);
for (; block_start_pfn < zone_end_pfn(zone);
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1890,15 +1904,14 @@ static void __init deferred_free_range(unsigned long pfn,
page = pfn_to_page(pfn);
/* Free a large naturally-aligned chunk if possible */
- if (nr_pages == pageblock_nr_pages &&
- (pfn & (pageblock_nr_pages - 1)) == 0) {
+ if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
- if ((pfn & (pageblock_nr_pages - 1)) == 0)
+ if (pageblock_aligned(pfn))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_core(page, 0);
}
@@ -1917,16 +1930,12 @@ static inline void __init pgdat_init_report_one_done(void)
/*
* Returns true if page needs to be initialized or freed to buddy allocator.
*
- * First we check if pfn is valid on architectures where it is possible to have
- * holes within pageblock_nr_pages. On systems where it is not possible, this
- * function is optimized out.
- *
- * Then, we check if a current large page is valid by only checking the validity
+ * We check if a current large page is valid by only checking the validity
* of the head pfn.
*/
static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
- if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
+ if (pageblock_aligned(pfn) && !pfn_valid(pfn))
return false;
return true;
}
@@ -1938,14 +1947,13 @@ static inline bool __init deferred_pfn_valid(unsigned long pfn)
static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
- } else if (!(pfn & nr_pgmask)) {
+ } else if (pageblock_aligned(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
} else {
@@ -1965,7 +1973,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
- unsigned long nr_pgmask = pageblock_nr_pages - 1;
int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
int zid = zone_idx(zone);
@@ -1975,7 +1982,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
- } else if (!page || !(pfn & nr_pgmask)) {
+ } else if (!page || pageblock_aligned(pfn)) {
page = pfn_to_page(pfn);
} else {
page++;
@@ -2651,8 +2658,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
*num_movable = 0;
pfn = page_to_pfn(page);
- start_pfn = pfn & ~(pageblock_nr_pages - 1);
- end_pfn = start_pfn + pageblock_nr_pages - 1;
+ start_pfn = pageblock_start_pfn(pfn);
+ end_pfn = pageblock_end_pfn(pfn) - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
@@ -3010,7 +3017,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
- if (alloc_flags & ALLOC_NOFRAGMENT)
+ if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
@@ -3440,7 +3447,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
int pindex;
bool free_high;
- __count_vm_event(PGFREE);
+ __count_vm_events(PGFREE, 1 << order);
pindex = order_to_pindex(migratetype, order);
list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
@@ -3598,16 +3605,11 @@ EXPORT_SYMBOL_GPL(split_page);
int __isolate_free_page(struct page *page, unsigned int order)
{
- unsigned long watermark;
- struct zone *zone;
- int mt;
-
- BUG_ON(!PageBuddy(page));
-
- zone = page_zone(page);
- mt = get_pageblock_migratetype(page);
+ struct zone *zone = page_zone(page);
+ int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt)) {
+ unsigned long watermark;
/*
* Obey watermarks as if the page was being allocated. We can
* emulate a high-order watermark check with a raised order-0
@@ -3621,8 +3623,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
__mod_zone_freepage_state(zone, -(1UL << order), mt);
}
- /* Remove page from free list */
-
del_page_from_free_list(page, zone, order);
/*
@@ -3643,7 +3643,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
}
-
return 1UL << order;
}
@@ -3670,8 +3669,6 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
/*
* Update NUMA hit/miss statistics
- *
- * Must be called with interrupts disabled.
*/
static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
long nr_account)
@@ -3777,8 +3774,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype,
- unsigned int alloc_flags)
+ int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -3808,15 +3804,24 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
pcp_spin_unlock_irqrestore(pcp, flags);
pcp_trylock_finish(UP_flags);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
}
return page;
}
/*
- * Allocate a page from the given zone. Use pcplists for order-0 allocations.
+ * Allocate a page from the given zone.
+ * Use pcplists for THP or "cheap" high-order allocations.
*/
+
+/*
+ * Do not instrument rmqueue() with KMSAN. This function may call
+ * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
+ * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
+ * may call rmqueue() again, which will result in a deadlock.
+ */
+__no_sanitize_memory
static inline
struct page *rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
@@ -3839,7 +3844,7 @@ struct page *rmqueue(struct zone *preferred_zone,
if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
migratetype != MIGRATE_MOVABLE) {
page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype, alloc_flags);
+ migratetype, alloc_flags);
if (likely(page))
goto out;
}
@@ -4329,7 +4334,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
- show_mem(filter, nodemask);
+ __show_mem(filter, nodemask, gfp_zone(gfp_mask));
}
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -4708,6 +4713,30 @@ void fs_reclaim_release(gfp_t gfp_mask)
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -5001,6 +5030,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
/*
@@ -5011,11 +5041,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
@@ -5121,7 +5152,8 @@ retry:
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
- alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags);
+ alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) |
+ (alloc_flags & ALLOC_KSWAPD);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
@@ -5187,9 +5219,13 @@ retry:
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -5209,9 +5245,13 @@ retry:
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -5238,7 +5278,7 @@ nopage:
* so that we can identify them and convert them to something
* else.
*/
- WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask);
+ WARN_ON_ONCE_GFP(costly_order, gfp_mask);
/*
* Help non-failing allocations by giving them access to memory
@@ -5535,6 +5575,7 @@ out:
}
trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
+ kmsan_alloc_page(page, order, alloc_gfp);
return page;
}
@@ -5706,6 +5747,18 @@ refill:
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request, this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
}
nc->pagecnt_bias--;
@@ -5732,14 +5785,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
size_t size)
{
if (addr) {
- unsigned long alloc_end = addr + (PAGE_SIZE << order);
- unsigned long used = addr + PAGE_ALIGN(size);
+ unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE);
+ struct page *page = virt_to_page((void *)addr);
+ struct page *last = page + nr;
- split_page(virt_to_page((void *)addr), order);
- while (used < alloc_end) {
- free_page(used);
- used += PAGE_SIZE;
- }
+ split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
+ while (page < --last)
+ set_page_refcounted(last);
+
+ last = page + (1UL << order);
+ for (page += nr; page < last; page++)
+ __free_pages_ok(page, 0, FPI_TO_TAIL);
}
return (void *)addr;
}
@@ -6011,6 +6068,15 @@ static void show_migration_types(unsigned char type)
printk(KERN_CONT "(%s) ", tmp);
}
+static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx)
+{
+ int zone_idx;
+ for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++)
+ if (zone_managed_pages(pgdat->node_zones + zone_idx))
+ return true;
+ return false;
+}
+
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
@@ -6020,7 +6086,7 @@ static void show_migration_types(unsigned char type)
* SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
* cpuset.
*/
-void show_free_areas(unsigned int filter, nodemask_t *nodemask)
+void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
{
unsigned long free_pcp = 0;
int cpu, nid;
@@ -6028,6 +6094,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
pg_data_t *pgdat;
for_each_populated_zone(zone) {
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
@@ -6039,7 +6107,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu\n"
+ " sec_pagetables:%lu bounce:%lu\n"
" kernel_misc_reclaimable:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
global_node_page_state(NR_ACTIVE_ANON),
@@ -6056,6 +6125,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_node_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_SECONDARY_PAGETABLE),
global_zone_page_state(NR_BOUNCE),
global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE),
global_zone_page_state(NR_FREE_PAGES),
@@ -6065,6 +6135,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
for_each_online_pgdat(pgdat) {
if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
continue;
+ if (!node_has_managed_zones(pgdat, max_zone_idx))
+ continue;
printk("Node %d"
" active_anon:%lukB"
@@ -6089,6 +6161,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" shadow_call_stack:%lukB"
#endif
" pagetables:%lukB"
+ " sec_pagetables:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -6114,6 +6187,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
K(node_page_state(pgdat, NR_PAGETABLE)),
+ K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)),
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -6121,6 +6195,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
for_each_populated_zone(zone) {
int i;
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
@@ -6182,6 +6258,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
+ if (zone_idx(zone) > max_zone_idx)
+ continue;
if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
continue;
show_node(zone);
@@ -6507,16 +6585,15 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
#define BOOT_PAGESET_BATCH 1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
-DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
{
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -6553,7 +6630,7 @@ static void __build_all_zonelists(void *data)
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
}
static noinline void __init
@@ -6704,7 +6781,7 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
* such that unmovable allocations won't be scattered all
* over the place during system boot.
*/
- if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ if (pageblock_aligned(pfn)) {
set_pageblock_migratetype(page, migratetype);
cond_resched();
}
@@ -6747,10 +6824,18 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
- if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ if (pageblock_aligned(pfn)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
+
+ /*
+ * ZONE_DEVICE pages are released directly to the driver page allocator
+ * which will set the page count to 1 when allocating the page.
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
+ pgmap->type == MEMORY_DEVICE_COHERENT)
+ set_page_count(page, 0);
}
/*
@@ -6810,7 +6895,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start = jiffies;
int nid = pgdat->node_id;
- if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
+ if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE))
return;
/*
@@ -6879,9 +6964,8 @@ static void __init init_unavailable_range(unsigned long spfn,
u64 pgcnt = 0;
for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
+ if (!pfn_valid(pageblock_start_pfn(pfn))) {
+ pfn = pageblock_end_pfn(pfn) - 1;
continue;
}
__init_single_page(pfn_to_page(pfn), pfn, zone, node);
@@ -6986,7 +7070,7 @@ static int zone_batchsize(struct zone *zone)
* size is striking a balance between allocation latency
* and zone lock contention.
*/
- batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE);
+ batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
batch /= 4; /* We effectively *= 4 below */
if (batch < 1)
batch = 1;
@@ -7171,6 +7255,17 @@ void __meminit setup_zone_pageset(struct zone *zone)
}
/*
+ * The zone indicated has a new number of managed_pages; batch sizes and percpu
+ * page high values need to be recalculated.
+ */
+static void zone_pcp_update(struct zone *zone, int cpu_online)
+{
+ mutex_lock(&pcp_batch_high_lock);
+ zone_set_pageset_high_and_batch(zone, cpu_online);
+ mutex_unlock(&pcp_batch_high_lock);
+}
+
+/*
* Allocate per cpu pagesets and initialize them.
* Before this call only boot pagesets were available.
*/
@@ -7614,6 +7709,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
int i;
pgdat_resize_init(pgdat);
+ pgdat_kswapd_lock_init(pgdat);
pgdat_init_split_queue(pgdat);
pgdat_init_kcompactd(pgdat);
@@ -7908,17 +8004,6 @@ unsigned long __init node_map_pfn_alignment(void)
return ~accl_mask + 1;
}
-/**
- * find_min_pfn_with_active_regions - Find the minimum PFN registered
- *
- * Return: the minimum PFN based on information provided via
- * memblock_set_node().
- */
-unsigned long __init find_min_pfn_with_active_regions(void)
-{
- return PHYS_PFN(memblock_start_of_DRAM());
-}
-
/*
* early_calculate_totalpages()
* Sum pages in active regions for movable zone.
@@ -8211,7 +8296,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
- start_pfn = find_min_pfn_with_active_regions();
+ start_pfn = PHYS_PFN(memblock_start_of_DRAM());
descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -8460,8 +8545,8 @@ void __init mem_init_print_info(void)
#endif
")\n",
K(nr_free_pages()), K(physpages),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K,
+ (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K,
K(physpages - totalram_pages() - totalcma_pages),
K(totalcma_pages)
#ifdef CONFIG_HIGHMEM
@@ -8974,7 +9059,7 @@ void *__init alloc_large_system_hash(const char *tablename,
{
unsigned long long max = high_limit;
unsigned long log2qty, size;
- void *table = NULL;
+ void *table;
gfp_t gfp_flags;
bool virt;
bool huge;
@@ -8986,8 +9071,8 @@ void *__init alloc_large_system_hash(const char *tablename,
numentries -= arch_reserved_kernel_pages();
/* It isn't necessary when PAGE_SIZE >= 1MB */
- if (PAGE_SHIFT < 20)
- numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+ if (PAGE_SIZE < SZ_1M)
+ numentries = round_up(numentries, SZ_1M / PAGE_SIZE);
#if __BITS_PER_LONG > 32
if (!high_limit) {
@@ -9412,17 +9497,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages)
EXPORT_SYMBOL(free_contig_range);
/*
- * The zone indicated has a new number of managed_pages; batch sizes and percpu
- * page high values need to be recalculated.
- */
-void zone_pcp_update(struct zone *zone, int cpu_online)
-{
- mutex_lock(&pcp_batch_high_lock);
- zone_set_pageset_high_and_batch(zone, cpu_online);
- mutex_unlock(&pcp_batch_high_lock);
-}
-
-/*
* Effectively disable pcplists for the zone by setting the high limit to 0
* and draining all cpus. A concurrent page freeing on another CPU that's about
* to put the page on pcplist will either finish before the drain and the page
@@ -9454,9 +9528,11 @@ void zone_pcp_reset(struct zone *zone)
drain_zonestat(zone, pzstats);
}
free_percpu(zone->per_cpu_pageset);
- free_percpu(zone->per_cpu_zonestats);
zone->per_cpu_pageset = &boot_pageset;
- zone->per_cpu_zonestats = &boot_zonestats;
+ if (zone->per_cpu_zonestats != &boot_zonestats) {
+ free_percpu(zone->per_cpu_zonestats);
+ zone->per_cpu_zonestats = &boot_zonestats;
+ }
}
}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index eb156ff5d603..db20d6452b71 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -17,24 +17,23 @@ static void propagate_protected_usage(struct page_counter *c,
unsigned long usage)
{
unsigned long protected, old_protected;
- unsigned long low, min;
long delta;
if (!c->parent)
return;
- min = READ_ONCE(c->min);
- if (min || atomic_long_read(&c->min_usage)) {
- protected = min(usage, min);
+ protected = min(usage, READ_ONCE(c->min));
+ old_protected = atomic_long_read(&c->min_usage);
+ if (protected != old_protected) {
old_protected = atomic_long_xchg(&c->min_usage, protected);
delta = protected - old_protected;
if (delta)
atomic_long_add(delta, &c->parent->children_min_usage);
}
- low = READ_ONCE(c->low);
- if (low || atomic_long_read(&c->low_usage)) {
- protected = min(usage, low);
+ protected = min(usage, READ_ONCE(c->low));
+ old_protected = atomic_long_read(&c->low_usage);
+ if (protected != old_protected) {
old_protected = atomic_long_xchg(&c->low_usage, protected);
delta = protected - old_protected;
if (delta)
@@ -193,7 +192,7 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
old = xchg(&counter->max, nr_pages);
- if (page_counter_read(counter) <= usage)
+ if (page_counter_read(counter) <= usage || nr_pages >= old)
return 0;
counter->max = old;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 3dc715d7ac29..affe80243b6d 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -9,6 +9,7 @@
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
+#include <linux/rcupdate.h>
/*
* struct page extension
@@ -59,6 +60,10 @@
* can utilize this callback to initialize the state of it correctly.
*/
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID (0x1)
+#endif
+
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
static bool need_page_idle(void)
{
@@ -84,6 +89,15 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
unsigned long page_ext_size = sizeof(struct page_ext);
static unsigned long total_usage;
+static struct page_ext *lookup_page_ext(const struct page *page);
+
+bool early_page_ext;
+static int __init setup_early_page_ext(char *str)
+{
+ early_page_ext = true;
+ return 0;
+}
+early_param("early_page_ext", setup_early_page_ext);
static bool __init invoke_need_callbacks(void)
{
@@ -125,6 +139,48 @@ static inline struct page_ext *get_entry(void *base, unsigned long index)
return base + page_ext_size * index;
}
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context. Caller may not sleep until they have called
+ * page_ext_put().
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+ struct page_ext *page_ext;
+
+ rcu_read_lock();
+ page_ext = lookup_page_ext(page);
+ if (!page_ext) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext - Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context with corresponding page_ext_get() is called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+ if (unlikely(!page_ext))
+ return;
+
+ rcu_read_unlock();
+}
#ifndef CONFIG_SPARSEMEM
@@ -133,12 +189,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
pgdat->node_page_ext = NULL;
}
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long index;
struct page_ext *base;
+ WARN_ON_ONCE(!rcu_read_lock_held());
base = NODE_DATA(page_to_nid(page))->node_page_ext;
/*
* The sanity checks the page allocator does upon freeing a
@@ -206,20 +263,27 @@ fail:
}
#else /* CONFIG_SPARSEMEM */
+static bool page_ext_invalid(struct page_ext *page_ext)
+{
+ return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
+}
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
+ struct page_ext *page_ext = READ_ONCE(section->page_ext);
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
*/
- if (!section->page_ext)
+ if (page_ext_invalid(page_ext))
return NULL;
- return get_entry(section->page_ext, pfn);
+ return get_entry(page_ext, pfn);
}
static void *__meminit alloc_page_ext(size_t size, int nid)
@@ -298,9 +362,30 @@ static void __free_page_ext(unsigned long pfn)
ms = __pfn_to_section(pfn);
if (!ms || !ms->page_ext)
return;
- base = get_entry(ms->page_ext, pfn);
+
+ base = READ_ONCE(ms->page_ext);
+ /*
+ * page_ext here can be valid while doing the roll back
+ * operation in online_page_ext().
+ */
+ if (page_ext_invalid(base))
+ base = (void *)base - PAGE_EXT_INVALID;
+ WRITE_ONCE(ms->page_ext, NULL);
+
+ base = get_entry(base, pfn);
free_page_ext(base);
- ms->page_ext = NULL;
+}
+
+static void __invalidate_page_ext(unsigned long pfn)
+{
+ struct mem_section *ms;
+ void *val;
+
+ ms = __pfn_to_section(pfn);
+ if (!ms || !ms->page_ext)
+ return;
+ val = (void *)ms->page_ext + PAGE_EXT_INVALID;
+ WRITE_ONCE(ms->page_ext, val);
}
static int __meminit online_page_ext(unsigned long start_pfn,
@@ -336,13 +421,27 @@ static int __meminit online_page_ext(unsigned long start_pfn,
}
static int __meminit offline_page_ext(unsigned long start_pfn,
- unsigned long nr_pages, int nid)
+ unsigned long nr_pages)
{
unsigned long start, end, pfn;
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
+ /*
+ * Freeing of page_ext is done in 3 steps to avoid
+ * use-after-free of it:
+ * 1) Traverse all the sections and mark their page_ext
+ * as invalid.
+ * 2) Wait for all the existing users of page_ext who
+ * started before invalidation to finish.
+ * 3) Free the page_ext.
+ */
+ for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
+ __invalidate_page_ext(pfn);
+
+ synchronize_rcu();
+
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
__free_page_ext(pfn);
return 0;
@@ -362,11 +461,11 @@ static int __meminit page_ext_callback(struct notifier_block *self,
break;
case MEM_OFFLINE:
offline_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ mn->nr_pages);
break;
case MEM_CANCEL_ONLINE:
offline_page_ext(mn->start_pfn,
- mn->nr_pages, mn->status_change_nid);
+ mn->nr_pages);
break;
case MEM_GOING_OFFLINE:
break;
diff --git a/mm/page_io.c b/mm/page_io.c
index 68318134dc92..2af34dd8fa4d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -28,7 +28,7 @@
#include <linux/delayacct.h>
#include "swap.h"
-void end_swap_bio_write(struct bio *bio)
+static void end_swap_bio_write(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -180,29 +180,30 @@ bad_bmap:
*/
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
- if (try_to_free_swap(page)) {
- unlock_page(page);
+ if (folio_free_swap(folio)) {
+ folio_unlock(folio);
goto out;
}
/*
* Arch code may have to preserve more data than just the page
* contents, e.g. memory tags.
*/
- ret = arch_prepare_to_swap(page);
+ ret = arch_prepare_to_swap(&folio->page);
if (ret) {
- set_page_dirty(page);
- unlock_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
goto out;
}
- if (frontswap_store(page) == 0) {
- set_page_writeback(page);
- unlock_page(page);
- end_page_writeback(page);
+ if (frontswap_store(&folio->page) == 0) {
+ folio_start_writeback(folio);
+ folio_unlock(folio);
+ folio_end_writeback(folio);
goto out;
}
- ret = __swap_writepage(page, wbc, end_swap_bio_write);
+ ret = __swap_writepage(&folio->page, wbc);
out:
return ret;
}
@@ -332,8 +333,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
return 0;
}
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func)
+int __swap_writepage(struct page *page, struct writeback_control *wbc)
{
struct bio *bio;
int ret;
@@ -358,7 +358,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
GFP_NOIO);
bio->bi_iter.bi_sector = swap_page_sector(page);
- bio->bi_end_io = end_write_func;
+ bio->bi_end_io = end_swap_bio_write;
bio_add_page(bio, page, thp_size(page), 0);
bio_associate_blkg_from_page(bio, page);
@@ -453,18 +453,21 @@ int swap_readpage(struct page *page, bool synchronous,
struct swap_info_struct *sis = page_swap_info(page);
bool workingset = PageWorkingset(page);
unsigned long pflags;
+ bool in_thrashing;
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageUptodate(page), page);
/*
- * Count submission time as memory stall. When the device is congested,
- * or the submitting cgroup IO-throttled, submission can be a
- * significant part of overall IO time.
+ * Count submission time as memory stall and delay. When the device
+ * is congested, or the submitting cgroup IO-throttled, submission
+ * can be a significant part of overall IO time.
*/
- if (workingset)
+ if (workingset) {
+ delayacct_thrashing_start(&in_thrashing);
psi_memstall_enter(&pflags);
+ }
delayacct_swapin_start();
if (frontswap_load(page) == 0) {
@@ -513,8 +516,10 @@ int swap_readpage(struct page *page, bool synchronous,
bio_put(bio);
out:
- if (workingset)
+ if (workingset) {
+ delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
+ }
delayacct_swapin_end();
return ret;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9d73dc38e3d7..47fbc1696466 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -37,8 +37,8 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e
struct zone *zone = page_zone(page);
unsigned long pfn;
- VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) !=
- ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages));
+ VM_BUG_ON(pageblock_start_pfn(start_pfn) !=
+ pageblock_start_pfn(end_pfn - 1));
if (is_migrate_cma_page(page)) {
/*
@@ -172,7 +172,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
* to avoid redundant checks.
*/
check_unmovable_start = max(page_to_pfn(page), start_pfn);
- check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages),
+ check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)),
end_pfn);
unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
@@ -288,6 +288,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
+ * @migratetype: migrate type to set in error recovery.
*
* Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
@@ -302,16 +303,16 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
+ gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
+ int migratetype)
{
- unsigned char saved_mt;
unsigned long start_pfn;
unsigned long isolate_pageblock;
unsigned long pfn;
struct zone *zone;
int ret;
- VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages));
+ VM_BUG_ON(!pageblock_aligned(boundary_pfn));
if (isolate_before)
isolate_pageblock = boundary_pfn - pageblock_nr_pages;
@@ -328,13 +329,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
zone->zone_start_pfn);
- saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+ if (skip_isolation) {
+ int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
- if (skip_isolation)
- VM_BUG_ON(!is_migrate_isolate(saved_mt));
- else {
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
- isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+ VM_BUG_ON(!is_migrate_isolate(mt));
+ } else {
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
+ flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
@@ -475,7 +476,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
failed:
/* restore the original migratetype */
if (!skip_isolation)
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
return -EBUSY;
}
@@ -531,13 +532,14 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn;
struct page *page;
/* isolation is done at page block granularity */
- unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
- unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
+ unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+ unsigned long isolate_end = pageblock_align(end_pfn);
int ret;
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+ skip_isolation, migratetype);
if (ret)
return ret;
@@ -545,7 +547,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+ skip_isolation, migratetype);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
return ret;
@@ -576,9 +579,8 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
{
unsigned long pfn;
struct page *page;
- unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
- unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
-
+ unsigned long isolate_start = pageblock_start_pfn(start_pfn);
+ unsigned long isolate_end = pageblock_align(end_pfn);
for (pfn = isolate_start;
pfn < isolate_end;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index e4c6f3f1695b..2d27f532df4c 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -141,7 +141,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
struct page_owner *page_owner;
u64 free_ts_nsec = local_clock();
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
@@ -153,6 +153,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_owner->free_ts_nsec = free_ts_nsec;
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
static inline void __set_page_owner_handle(struct page_ext *page_ext,
@@ -183,19 +184,21 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext;
depot_stack_handle_t handle;
+ handle = save_stack(gfp_mask);
+
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
return;
-
- handle = save_stack(gfp_mask);
__set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ page_ext_put(page_ext);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get(page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
@@ -203,12 +206,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner = get_page_owner(page_ext);
page_owner->last_migrate_reason = reason;
+ page_ext_put(page_ext);
}
void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get(page);
struct page_owner *page_owner;
if (unlikely(!page_ext))
@@ -219,17 +223,25 @@ void __split_page_owner(struct page *page, unsigned int nr)
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
- struct page_ext *old_ext = lookup_page_ext(&old->page);
- struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
+ struct page_ext *old_ext;
+ struct page_ext *new_ext;
struct page_owner *old_page_owner, *new_page_owner;
- if (unlikely(!old_ext || !new_ext))
+ old_ext = page_ext_get(&old->page);
+ if (unlikely(!old_ext))
return;
+ new_ext = page_ext_get(&newfolio->page);
+ if (unlikely(!new_ext)) {
+ page_ext_put(old_ext);
+ return;
+ }
+
old_page_owner = get_page_owner(old_ext);
new_page_owner = get_page_owner(new_ext);
new_page_owner->order = old_page_owner->order;
@@ -254,6 +266,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old)
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+ page_ext_put(new_ext);
+ page_ext_put(old_ext);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -283,7 +297,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
continue;
}
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
pageblock_mt = get_pageblock_migratetype(page);
@@ -307,12 +321,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageReserved(page))
continue;
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
- continue;
+ goto ext_put_continue;
page_owner = get_page_owner(page_ext);
page_mt = gfp_migratetype(page_owner->gfp_mask);
@@ -323,9 +337,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
count[pageblock_mt]++;
pfn = block_end_pfn;
+ page_ext_put(page_ext);
break;
}
pfn += (1UL << page_owner->order) - 1;
+ext_put_continue:
+ page_ext_put(page_ext);
}
}
@@ -435,7 +452,7 @@ err:
void __dump_page_owner(const struct page *page)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = page_ext_get((void *)page);
struct page_owner *page_owner;
depot_stack_handle_t handle;
gfp_t gfp_mask;
@@ -452,6 +469,7 @@ void __dump_page_owner(const struct page *page)
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
pr_alert("page_owner info is not present (never set?)\n");
+ page_ext_put(page_ext);
return;
}
@@ -482,6 +500,7 @@ void __dump_page_owner(const struct page *page)
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
migrate_reason_names[page_owner->last_migrate_reason]);
+ page_ext_put(page_ext);
}
static ssize_t
@@ -497,17 +516,25 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
return -EINVAL;
page = NULL;
- pfn = min_low_pfn + *ppos;
-
+ if (*ppos == 0)
+ pfn = min_low_pfn;
+ else
+ pfn = *ppos;
/* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */
while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0)
pfn++;
- drain_all_pages(NULL);
-
/* Find an allocated page */
for (; pfn < max_pfn; pfn++) {
/*
+ * This temporary page_owner is required so
+ * that we can avoid the context switches while holding
+ * the rcu lock and copying the page owner information to
+ * user through copy_to_user() or GFP_KERNEL allocations.
+ */
+ struct page_owner page_owner_tmp;
+
+ /*
* If the new page is in a new MAX_ORDER_NR_PAGES area,
* validate the area as existing, skip it if not
*/
@@ -525,7 +552,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
continue;
}
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
@@ -534,14 +561,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* because we don't hold the zone lock.
*/
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
+ goto ext_put_continue;
/*
* Although we do have the info about past allocation of free
* pages, it's not relevant for current memory usage.
*/
if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
- continue;
+ goto ext_put_continue;
page_owner = get_page_owner(page_ext);
@@ -550,7 +577,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
* would inflate the stats.
*/
if (!IS_ALIGNED(pfn, 1 << page_owner->order))
- continue;
+ goto ext_put_continue;
/*
* Access to page_ext->handle isn't synchronous so we should
@@ -558,18 +585,37 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
*/
handle = READ_ONCE(page_owner->handle);
if (!handle)
- continue;
+ goto ext_put_continue;
/* Record the next PFN to read in the file offset */
- *ppos = (pfn - min_low_pfn) + 1;
+ *ppos = pfn + 1;
+ page_owner_tmp = *page_owner;
+ page_ext_put(page_ext);
return print_page_owner(buf, count, pfn, page,
- page_owner, handle);
+ &page_owner_tmp, handle);
+ext_put_continue:
+ page_ext_put(page_ext);
}
return 0;
}
+static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig)
+{
+ switch (orig) {
+ case SEEK_SET:
+ file->f_pos = offset;
+ break;
+ case SEEK_CUR:
+ file->f_pos += offset;
+ break;
+ default:
+ return -EINVAL;
+ }
+ return file->f_pos;
+}
+
static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
{
unsigned long pfn = zone->zone_start_pfn;
@@ -589,7 +635,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
continue;
}
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+ block_end_pfn = pageblock_end_pfn(pfn);
block_end_pfn = min(block_end_pfn, end_pfn);
for (; pfn < block_end_pfn; pfn++) {
@@ -617,18 +663,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
if (PageReserved(page))
continue;
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
if (unlikely(!page_ext))
continue;
/* Maybe overlapping zone */
if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
- continue;
+ goto ext_put_continue;
/* Found early allocated page */
__set_page_owner_handle(page_ext, early_handle,
0, 0);
count++;
+ext_put_continue:
+ page_ext_put(page_ext);
}
cond_resched();
}
@@ -660,6 +708,7 @@ static void init_early_allocated_pages(void)
static const struct file_operations proc_page_owner_operations = {
.read = read_page_owner,
+ .llseek = lseek_page_owner,
};
static int __init pageowner_init(void)
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index e2062748791a..433dbce13fe1 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -53,7 +53,7 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
}
/*
- * An enty is removed from the page table, decrement the counters for that page
+ * An entry is removed from the page table, decrement the counters for that page
* verify that it is of correct type and counters do not become negative.
*/
static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
@@ -68,7 +68,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
return;
page = pfn_to_page(pfn);
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -83,10 +83,11 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
}
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
/*
- * A new enty is added to the page table, increment the counters for that page
+ * A new entry is added to the page table, increment the counters for that page
* verify that it is of correct type and is not being mapped with a different
* type to a different process.
*/
@@ -103,7 +104,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
return;
page = pfn_to_page(pfn);
- page_ext = lookup_page_ext(page);
+ page_ext = page_ext_get(page);
anon = PageAnon(page);
for (i = 0; i < pgcnt; i++) {
@@ -118,6 +119,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
}
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
/*
@@ -126,9 +128,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
*/
void __page_table_check_zero(struct page *page, unsigned int order)
{
- struct page_ext *page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext;
unsigned long i;
+ page_ext = page_ext_get(page);
BUG_ON(!page_ext);
for (i = 0; i < (1ul << order); i++) {
struct page_table_check *ptc = get_page_table_check(page_ext);
@@ -137,6 +140,7 @@ void __page_table_check_zero(struct page *page, unsigned int order)
BUG_ON(atomic_read(&ptc->file_map_count));
page_ext = page_ext_next(page_ext);
}
+ page_ext_put(page_ext);
}
void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 8e9e574d535a..93e13fc17d3c 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -86,7 +86,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
!is_device_exclusive_entry(entry))
return false;
- pfn = swp_offset(entry);
+ pfn = swp_offset_pfn(entry);
} else if (is_swap_pte(*pvmw->pte)) {
swp_entry_t entry;
@@ -96,7 +96,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
!is_device_exclusive_entry(entry))
return false;
- pfn = swp_offset(entry);
+ pfn = swp_offset_pfn(entry);
} else {
if (!pte_present(*pvmw->pte))
return false;
@@ -221,7 +221,7 @@ restart:
return not_found(pvmw);
entry = pmd_to_swp_entry(pmde);
if (!is_migration_entry(entry) ||
- !check_pmd(swp_offset(entry), pvmw))
+ !check_pmd(swp_offset_pfn(entry), pvmw))
return not_found(pvmw);
return true;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index fa7a3d21a751..2ff3a5bebceb 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -460,7 +460,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
} else { /* inside vma */
walk.vma = vma;
next = min(end, vma->vm_end);
- vma = vma->vm_next;
+ vma = find_vma(mm, vma->vm_end);
err = walk_page_test(start, next, &walk);
if (err > 0) {
@@ -482,7 +482,15 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
return err;
}
-/*
+/**
+ * walk_page_range_novma - walk a range of pagetables not backed by a vma
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @pgd: pgd to walk if different from mm->pgd
+ * @private: private data for callbacks' usage
+ *
* Similar to walk_page_range() but can walk any page tables even if they are
* not backed by VMAs. Because 'unusual' entries may be walked this function
* will also not lock the PTEs for the pte_entry() callback. This is useful for
diff --git a/mm/readahead.c b/mm/readahead.c
index fdcd28cbd92d..b10f0cf81d80 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -122,6 +122,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
+#include <linux/psi.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
@@ -152,6 +153,8 @@ static void read_pages(struct readahead_control *rac)
if (!readahead_count(rac))
return;
+ if (unlikely(rac->_workingset))
+ psi_memstall_enter(&rac->_pflags);
blk_start_plug(&plug);
if (aops->readahead) {
@@ -179,6 +182,9 @@ static void read_pages(struct readahead_control *rac)
}
blk_finish_plug(&plug);
+ if (unlikely(rac->_workingset))
+ psi_memstall_leave(&rac->_pflags);
+ rac->_workingset = false;
BUG_ON(readahead_count(rac));
}
@@ -252,6 +258,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
}
if (i == nr_to_read - lookahead_size)
folio_set_readahead(folio);
+ ractl->_workingset |= folio_test_workingset(folio);
ractl->_nr_pages++;
}
@@ -480,11 +487,14 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
- if (err)
+ if (err) {
folio_put(folio);
- else
- ractl->_nr_pages += 1UL << order;
- return err;
+ return err;
+ }
+
+ ractl->_nr_pages += 1UL << order;
+ ractl->_workingset |= folio_test_workingset(folio);
+ return 0;
}
void page_cache_ra_order(struct readahead_control *ractl,
@@ -826,6 +836,10 @@ void readahead_expand(struct readahead_control *ractl,
put_page(page);
return;
}
+ if (unlikely(PageWorkingset(page)) && !ractl->_workingset) {
+ ractl->_workingset = true;
+ psi_memstall_enter(&ractl->_pflags);
+ }
ractl->_nr_pages++;
if (ra) {
ra->size++;
diff --git a/mm/rmap.c b/mm/rmap.c
index 93d5a6f793d2..2ec925e5fa6a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,10 +23,9 @@
* inode->i_rwsem (while writing or truncating, not reading or faulting)
* mm->mmap_lock
* mapping->invalidate_lock (in filemap_fault)
- * page->flags PG_locked (lock_page) * (see hugetlbfs below)
- * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * page->flags PG_locked (lock_page)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
* mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* swap_lock (in swap_duplicate, swap_info_get)
@@ -46,10 +45,11 @@
* ->tasklist_lock
* pte map lock
*
- * * hugetlbfs PageHuge() pages take locks in this order:
- * mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
- * page->flags PG_locked (lock_page)
+ * hugetlbfs PageHuge() take locks in this order:
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * vma_lock (hugetlb specific lock for pmd_sharing)
+ * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
+ * page->flags PG_locked (lock_page)
*/
#include <linux/mm.h>
@@ -489,16 +489,16 @@ void __init anon_vma_init(void)
* if there is a mapcount, we can dereference the anon_vma after observing
* those.
*/
-struct anon_vma *page_get_anon_vma(struct page *page)
+struct anon_vma *folio_get_anon_vma(struct folio *folio)
{
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long)READ_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
- if (!page_mapped(page))
+ if (!folio_mapped(folio))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
@@ -508,13 +508,13 @@ struct anon_vma *page_get_anon_vma(struct page *page)
}
/*
- * If this page is still mapped, then its anon_vma cannot have been
+ * If this folio is still mapped, then its anon_vma cannot have been
* freed. But if it has been unmapped, we have no security against the
* anon_vma structure being freed and reused (for another anon_vma:
* SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
* above cannot corrupt).
*/
- if (!page_mapped(page)) {
+ if (!folio_mapped(folio)) {
rcu_read_unlock();
put_anon_vma(anon_vma);
return NULL;
@@ -526,11 +526,11 @@ out:
}
/*
- * Similar to page_get_anon_vma() except it locks the anon_vma.
+ * Similar to folio_get_anon_vma() except it locks the anon_vma.
*
* Its a little more complex as it tries to keep the fast path to a single
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex
+ * reference like with folio_get_anon_vma() and then block on the mutex
* on !rwc->try_lock case.
*/
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
@@ -602,11 +602,6 @@ out:
return anon_vma;
}
-void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
-{
- anon_vma_unlock_read(anon_vma);
-}
-
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
* Flush TLB entries for recently unmapped pages from remote CPUs. It is
@@ -770,13 +765,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return vma_address(page, vma);
}
+/*
+ * Returns the actual pmd_t* where we expect 'address' to be mapped from, or
+ * NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
+ * represents.
+ */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd = NULL;
- pmd_t pmde;
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -791,15 +790,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
goto out;
pmd = pmd_offset(pud, address);
- /*
- * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
- * without holding anon_vma lock for write. So when looking for a
- * genuine pmde (in which to find pte), test present and !THP together.
- */
- pmde = *pmd;
- barrier();
- if (!pmd_present(pmde) || pmd_trans_huge(pmde))
- pmd = NULL;
out:
return pmd;
}
@@ -833,6 +823,12 @@ static bool folio_referenced_one(struct folio *folio,
}
if (pvmw.pte) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
+ }
+
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
@@ -1101,22 +1097,20 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
*/
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma = vma->anon_vma;
- struct page *subpage = page;
-
- page = compound_head(page);
+ void *anon_vma = vma->anon_vma;
+ struct folio *folio = page_folio(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_VMA(!anon_vma, vma);
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ anon_vma += PAGE_MAPPING_ANON;
/*
* Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
* simultaneously, so a concurrent reader (eg folio_referenced()'s
* folio_test_anon()) will not see one without the other.
*/
- WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
- SetPageAnonExclusive(subpage);
+ WRITE_ONCE(folio->mapping, anon_vma);
+ SetPageAnonExclusive(page);
}
/**
@@ -1560,33 +1554,45 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
+ *
+ * We also must hold hugetlb vma_lock in write mode.
+ * Lock order dictates acquiring vma_lock BEFORE
+ * i_mmap_rwsem. We can only try lock here and fail
+ * if unsuccessful.
*/
- VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
- if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
+ if (!anon) {
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (!hugetlb_vma_trylock_write(vma)) {
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
+ }
+ if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ flush_tlb_range(vma,
+ range.start, range.end);
+ mmu_notifier_invalidate_range(mm,
+ range.start, range.end);
+ /*
+ * The ref count of the PMD page was
+ * dropped which is part of the way map
+ * counting is done for shared PMDs.
+ * Return 'true' here. When there is
+ * no other sharing, huge_pmd_unshare
+ * returns false and we will unmap the
+ * actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ hugetlb_vma_unlock_write(vma);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- /*
- * Nuke the page table entry. When having to clear
- * PageAnonExclusive(), we always have to flush.
- */
- if (should_defer_flush(mm, flags) && !anon_exclusive) {
+ /* Nuke the page table entry. */
+ if (should_defer_flush(mm, flags)) {
/*
* We clear the PTE but do not flush so potentially
* a remote CPU could still be writing to the folio.
@@ -1717,6 +1723,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
swap_free(entry);
@@ -1936,26 +1944,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* To call huge_pmd_unshare, i_mmap_rwsem must be
* held in write mode. Caller needs to explicitly
* do this outside rmap routines.
+ *
+ * We also must hold hugetlb vma_lock in write mode.
+ * Lock order dictates acquiring vma_lock BEFORE
+ * i_mmap_rwsem. We can only try lock here and
+ * fail if unsuccessful.
*/
- VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
- if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
+ if (!anon) {
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (!hugetlb_vma_trylock_write(vma)) {
+ page_vma_mapped_walk_done(&pvmw);
+ ret = false;
+ break;
+ }
+ if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ flush_tlb_range(vma,
+ range.start, range.end);
+ mmu_notifier_invalidate_range(mm,
+ range.start, range.end);
+
+ /*
+ * The ref count of the PMD page was
+ * dropped which is part of the way map
+ * counting is done for shared PMDs.
+ * Return 'true' here. When there is
+ * no other sharing, huge_pmd_unshare
+ * returns false and we will unmap the
+ * actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ hugetlb_vma_unlock_write(vma);
}
-
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
@@ -2048,6 +2071,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
}
VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
!anon_exclusive, subpage);
+
+ /* See page_try_share_anon_rmap(): clear PTE first. */
if (anon_exclusive &&
page_try_share_anon_rmap(subpage)) {
if (folio_test_hugetlb(folio))
@@ -2073,7 +2098,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
-
+ if (pte_young(pteval))
+ entry = make_migration_entry_young(entry);
+ if (pte_dirty(pteval))
+ entry = make_migration_entry_dirty(entry);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 2613371945b7..6d783436951f 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -9,13 +9,13 @@
#include <linux/rodata_test.h>
#include <linux/uaccess.h>
+#include <linux/mm.h>
#include <asm/sections.h>
static const int rodata_test_data = 0xC3;
void rodata_test(void)
{
- unsigned long start, end;
int zero = 0;
/* test 1: read the value */
@@ -39,13 +39,11 @@ void rodata_test(void)
}
/* test 4: check if the rodata section is PAGE_SIZE aligned */
- start = (unsigned long)__start_rodata;
- end = (unsigned long)__end_rodata;
- if (start & (PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(__start_rodata)) {
pr_err("start of .rodata is not page size aligned\n");
return;
}
- if (end & (PAGE_SIZE - 1)) {
+ if (!PAGE_ALIGNED(__end_rodata)) {
pr_err("end of .rodata is not page size aligned\n");
return;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index e3e9590c6fb3..04c3ac9448a1 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -276,20 +276,18 @@ static struct file_system_type secretmem_fs = {
.kill_sb = kill_anon_super,
};
-static int secretmem_init(void)
+static int __init secretmem_init(void)
{
- int ret = 0;
-
if (!secretmem_enable)
- return ret;
+ return 0;
secretmem_mnt = kern_mount(&secretmem_fs);
if (IS_ERR(secretmem_mnt))
- ret = PTR_ERR(secretmem_mnt);
+ return PTR_ERR(secretmem_mnt);
/* prevent secretmem mappings from ever getting PROT_EXEC */
secretmem_mnt->mnt_flags |= MNT_NOEXEC;
- return ret;
+ return 0;
}
fs_initcall(secretmem_init);
diff --git a/mm/shmem.c b/mm/shmem.c
index 42e5888bf84d..c1d8b8a1aa3b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -38,6 +38,7 @@
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
+#include <linux/iversion.h>
#include "swap.h"
static struct vfsmount *shm_mnt;
@@ -139,17 +140,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type);
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
- struct vm_fault *vmf, vm_fault_t *fault_type);
-
-int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp)
-{
- return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
-}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
@@ -190,7 +180,7 @@ static inline int shmem_reacct_size(unsigned long flags,
/*
* ... whereas tmpfs objects are accounted incrementally as
* pages are allocated, in order to allow large sparse files.
- * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
static inline int shmem_acct_block(unsigned long flags, long pages)
@@ -472,20 +462,22 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-bool shmem_is_huge(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
+bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
+ pgoff_t index, bool shmem_huge_force)
{
loff_t i_size;
if (!S_ISREG(inode->i_mode))
return false;
- if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
return false;
+ if (shmem_huge_force)
+ return true;
if (shmem_huge == SHMEM_HUGE_FORCE)
return true;
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return false;
switch (SHMEM_SB(inode->i_sb)->huge) {
case SHMEM_HUGE_ALWAYS:
@@ -629,7 +621,7 @@ next:
goto move_back;
}
- ret = split_huge_page(&folio->page);
+ ret = split_folio(folio);
folio_unlock(folio);
folio_put(folio);
@@ -680,8 +672,8 @@ static long shmem_unused_huge_count(struct super_block *sb,
#define shmem_huge SHMEM_HUGE_DENY
-bool shmem_is_huge(struct vm_area_struct *vma,
- struct inode *inode, pgoff_t index)
+bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
+ pgoff_t index, bool shmem_huge_force)
{
return false;
}
@@ -763,23 +755,22 @@ error:
}
/*
- * Like delete_from_page_cache, but substitutes swap for page.
+ * Like delete_from_page_cache, but substitutes swap for @folio.
*/
-static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
+ long nr = folio_nr_pages(folio);
int error;
- VM_BUG_ON_PAGE(PageCompound(page), page);
-
xa_lock_irq(&mapping->i_pages);
- error = shmem_replace_entry(mapping, page->index, page, radswap);
- page->mapping = NULL;
- mapping->nrpages--;
- __dec_lruvec_page_state(page, NR_FILE_PAGES);
- __dec_lruvec_page_state(page, NR_SHMEM);
+ error = shmem_replace_entry(mapping, folio->index, folio, radswap);
+ folio->mapping = NULL;
+ mapping->nrpages -= nr;
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
xa_unlock_irq(&mapping->i_pages);
- put_page(page);
+ folio_put(folio);
BUG_ON(error);
}
@@ -886,10 +877,9 @@ void shmem_unlock_mapping(struct address_space *mapping)
static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
struct folio *folio;
- struct page *page;
/*
- * At first avoid shmem_getpage(,,,SGP_READ): that fails
+ * At first avoid shmem_get_folio(,,,SGP_READ): that fails
* beyond i_size, and reports fallocated pages as holes.
*/
folio = __filemap_get_folio(inode->i_mapping, index,
@@ -900,9 +890,9 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
* But read a page back from swap if any of it is within i_size
* (although in some cases this is just a waste of time).
*/
- page = NULL;
- shmem_getpage(inode, index, &page, SGP_READ);
- return page ? page_folio(page) : NULL;
+ folio = NULL;
+ shmem_get_folio(inode, index, &folio, SGP_READ);
+ return folio;
}
/*
@@ -1043,6 +1033,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -1069,7 +1060,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
STATX_ATTR_NODUMP);
generic_fillattr(&init_user_ns, inode, stat);
- if (shmem_is_huge(NULL, inode, 0))
+ if (shmem_is_huge(NULL, inode, 0, false))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1087,6 +1078,8 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
int error;
+ bool update_mtime = false;
+ bool update_ctime = true;
error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
@@ -1107,7 +1100,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
if (error)
return error;
i_size_write(inode, newsize);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ update_mtime = true;
+ } else {
+ update_ctime = false;
}
if (newsize <= oldsize) {
loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -1127,6 +1122,12 @@ static int shmem_setattr(struct user_namespace *mnt_userns,
setattr_copy(&init_user_ns, inode, attr);
if (attr->ia_valid & ATTR_MODE)
error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ if (!error && update_ctime) {
+ inode->i_ctime = current_time(inode);
+ if (update_mtime)
+ inode->i_mtime = inode->i_ctime;
+ inode_inc_iversion(inode);
+ }
return error;
}
@@ -1328,17 +1329,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
* "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
* and its shmem_writeback() needs them to be split when swapping.
*/
- if (PageTransCompound(page)) {
+ if (folio_test_large(folio)) {
/* Ensure the subpages are still dirty */
- SetPageDirty(page);
+ folio_test_set_dirty(folio);
if (split_huge_page(page) < 0)
goto redirty;
- ClearPageDirty(page);
+ folio = page_folio(page);
+ folio_clear_dirty(folio);
}
- BUG_ON(!PageLocked(page));
- mapping = page->mapping;
- index = page->index;
+ BUG_ON(!folio_test_locked(folio));
+ mapping = folio->mapping;
+ index = folio->index;
inode = mapping->host;
info = SHMEM_I(inode);
if (info->flags & VM_LOCKED)
@@ -1361,15 +1363,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
/*
* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
* value into swapfile.c, the only way we can correctly account for a
- * fallocated page arriving here is now to initialize it and write it.
+ * fallocated folio arriving here is now to initialize it and write it.
*
- * That's okay for a page already fallocated earlier, but if we have
+ * That's okay for a folio already fallocated earlier, but if we have
* not yet completed the fallocation, then (a) we want to keep track
- * of this page in case we have to undo it, and (b) it may not be a
+ * of this folio in case we have to undo it, and (b) it may not be a
* good idea to continue anyway, once we're pushing into swap. So
- * reactivate the page, and let shmem_fallocate() quit when too many.
+ * reactivate the folio, and let shmem_fallocate() quit when too many.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (inode->i_private) {
struct shmem_falloc *shmem_falloc;
spin_lock(&inode->i_lock);
@@ -1385,9 +1387,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (shmem_falloc)
goto redirty;
}
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
}
swap = folio_alloc_swap(folio);
@@ -1396,7 +1398,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now before the page is
+ * if it's not already there. Do it now before the folio is
* moved to swap cache, when its pagelock no longer protects
* the inode from eviction. But don't unlock the mutex until
* we've incremented swapped, because shmem_unuse_inode() will
@@ -1406,7 +1408,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(page, swap,
+ if (add_to_swap_cache(folio, swap,
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
NULL) == 0) {
spin_lock_irq(&info->lock);
@@ -1415,21 +1417,21 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_unlock_irq(&info->lock);
swap_shmem_alloc(swap);
- shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
mutex_unlock(&shmem_swaplist_mutex);
- BUG_ON(page_mapped(page));
- swap_writepage(page, wbc);
+ BUG_ON(folio_mapped(folio));
+ swap_writepage(&folio->page, wbc);
return 0;
}
mutex_unlock(&shmem_swaplist_mutex);
- put_swap_page(page, swap);
+ put_swap_folio(folio, swap);
redirty:
- set_page_dirty(page);
+ folio_mark_dirty(folio);
if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
- unlock_page(page);
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
+ folio_unlock(folio);
return 0;
}
@@ -1486,7 +1488,7 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
mpol_cond_put(vma->vm_policy);
}
-static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
@@ -1499,7 +1501,9 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
- return page;
+ if (!page)
+ return NULL;
+ return page_folio(page);
}
/*
@@ -1560,12 +1564,6 @@ static struct folio *shmem_alloc_folio(gfp_t gfp,
return folio;
}
-static struct page *shmem_alloc_page(gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
-{
- return &shmem_alloc_folio(gfp, info, index)->page;
-}
-
static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
pgoff_t index, bool huge)
{
@@ -1599,7 +1597,7 @@ failed:
/*
* When a page is moved from swapcache to shmem filecache (either by the
- * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
* shmem_unuse_inode()), it may have been read in earlier from swap, in
* ignorance of the mapping it belongs to. If that mapping has special
* constraints (like the gma500 GEM driver, which requires RAM below 4GB),
@@ -1614,54 +1612,52 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
return folio_zonenum(folio) > gfp_zone(gfp);
}
-static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct page *oldpage, *newpage;
struct folio *old, *new;
struct address_space *swap_mapping;
swp_entry_t entry;
pgoff_t swap_index;
int error;
- oldpage = *pagep;
- entry.val = page_private(oldpage);
+ old = *foliop;
+ entry = folio_swap_entry(old);
swap_index = swp_offset(entry);
- swap_mapping = page_mapping(oldpage);
+ swap_mapping = swap_address_space(entry);
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success by further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- newpage = shmem_alloc_page(gfp, info, index);
- if (!newpage)
+ VM_BUG_ON_FOLIO(folio_test_large(old), old);
+ new = shmem_alloc_folio(gfp, info, index);
+ if (!new)
return -ENOMEM;
- get_page(newpage);
- copy_highpage(newpage, oldpage);
- flush_dcache_page(newpage);
+ folio_get(new);
+ folio_copy(new, old);
+ flush_dcache_folio(new);
- __SetPageLocked(newpage);
- __SetPageSwapBacked(newpage);
- SetPageUptodate(newpage);
- set_page_private(newpage, entry.val);
- SetPageSwapCache(newpage);
+ __folio_set_locked(new);
+ __folio_set_swapbacked(new);
+ folio_mark_uptodate(new);
+ folio_set_swap_entry(new, entry);
+ folio_set_swapcache(new);
/*
* Our caller will very soon move newpage out of swapcache, but it's
* a nice clean interface for us to replace oldpage by newpage there.
*/
xa_lock_irq(&swap_mapping->i_pages);
- error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
+ error = shmem_replace_entry(swap_mapping, swap_index, old, new);
if (!error) {
- old = page_folio(oldpage);
- new = page_folio(newpage);
mem_cgroup_migrate(old, new);
- __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
- __inc_lruvec_page_state(newpage, NR_SHMEM);
- __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
- __dec_lruvec_page_state(oldpage, NR_SHMEM);
+ __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
+ __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
+ __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
+ __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -1671,18 +1667,17 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* both PageSwapCache and page_private after getting page lock;
* but be defensive. Reverse old to newpage for clear and free.
*/
- oldpage = newpage;
+ old = new;
} else {
- lru_cache_add(newpage);
- *pagep = newpage;
+ folio_add_lru(new);
+ *foliop = new;
}
- ClearPageSwapCache(oldpage);
- set_page_private(oldpage, 0);
+ folio_clear_swapcache(old);
+ old->private = NULL;
- unlock_page(oldpage);
- put_page(oldpage);
- put_page(oldpage);
+ folio_unlock(old);
+ folio_put_refs(old, 2);
return error;
}
@@ -1730,7 +1725,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
- struct page *page;
struct folio *folio = NULL;
swp_entry_t swap;
int error;
@@ -1743,8 +1737,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
return -EIO;
/* Look it up and read it in.. */
- page = lookup_swap_cache(swap, NULL, 0);
- if (!page) {
+ folio = swap_cache_get_folio(swap, NULL, 0);
+ if (!folio) {
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
@@ -1752,13 +1746,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
count_memcg_event_mm(charge_mm, PGMAJFAULT);
}
/* Here we actually start the io */
- page = shmem_swapin(swap, gfp, info, index);
- if (!page) {
+ folio = shmem_swapin(swap, gfp, info, index);
+ if (!folio) {
error = -ENOMEM;
goto failed;
}
}
- folio = page_folio(page);
/* We have to do this with folio locked to prevent races */
folio_lock(folio);
@@ -1781,8 +1774,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
arch_swap_restore(swap, folio);
if (shmem_should_replace_folio(folio, gfp)) {
- error = shmem_replace_page(&page, gfp, info, index);
- folio = page_folio(page);
+ error = shmem_replace_folio(&folio, gfp, info, index);
if (error)
goto failed;
}
@@ -1822,7 +1814,7 @@ unlock:
}
/*
- * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
+ * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
@@ -1831,10 +1823,10 @@ unlock:
* vma, vmf, and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, struct vm_fault *vmf,
- vm_fault_t *fault_type)
+static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
+ struct vm_area_struct *vma, struct vm_fault *vmf,
+ vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
@@ -1874,7 +1866,7 @@ repeat:
if (error == -EEXIST)
goto repeat;
- *pagep = &folio->page;
+ *foliop = folio;
return error;
}
@@ -1884,7 +1876,7 @@ repeat:
folio_mark_accessed(folio);
if (folio_test_uptodate(folio))
goto out;
- /* fallocated page */
+ /* fallocated folio */
if (sgp != SGP_READ)
goto clear;
folio_unlock(folio);
@@ -1892,10 +1884,10 @@ repeat:
}
/*
- * SGP_READ: succeed on hole, with NULL page, letting caller zero.
- * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail.
+ * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
+ * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
*/
- *pagep = NULL;
+ *foliop = NULL;
if (sgp == SGP_READ)
return 0;
if (sgp == SGP_NOALLOC)
@@ -1910,7 +1902,7 @@ repeat:
return 0;
}
- if (!shmem_is_huge(vma, inode, index))
+ if (!shmem_is_huge(vma, inode, index, false))
goto alloc_nohuge;
huge_gfp = vma_thp_gfp_mask(vma);
@@ -1928,7 +1920,7 @@ alloc_nohuge:
if (error != -ENOSPC)
goto unlock;
/*
- * Try to reclaim some space by splitting a huge page
+ * Try to reclaim some space by splitting a large folio
* beyond i_size on the filesystem.
*/
while (retry--) {
@@ -1964,9 +1956,9 @@ alloc_nohuge:
if (folio_test_pmd_mappable(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- hindex + HPAGE_PMD_NR - 1) {
+ folio_next_index(folio) - 1) {
/*
- * Part of the huge page is beyond i_size: subject
+ * Part of the large folio is beyond i_size: subject
* to shrink under memory pressure.
*/
spin_lock(&sbinfo->shrinklist_lock);
@@ -1983,14 +1975,14 @@ alloc_nohuge:
}
/*
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
*/
if (sgp == SGP_FALLOC)
sgp = SGP_WRITE;
clear:
/*
- * Let SGP_WRITE caller clear ends if write does not fill page;
- * but SGP_FALLOC on a page fallocated earlier must initialize
+ * Let SGP_WRITE caller clear ends if write does not fill folio;
+ * but SGP_FALLOC on a folio fallocated earlier must initialize
* it now, lest undo on failure cancel our earlier guarantee.
*/
if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
@@ -2016,7 +2008,7 @@ clear:
goto unlock;
}
out:
- *pagep = folio_page(folio, index - hindex);
+ *foliop = folio;
return 0;
/*
@@ -2046,6 +2038,13 @@ unlock:
return error;
}
+int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
+ enum sgp_type sgp)
+{
+ return shmem_get_folio_gfp(inode, index, foliop, sgp,
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+}
+
/*
* This is like autoremove_wake_function, but it removes the wait queue
* entry unconditionally - even if something else had already woken the
@@ -2063,6 +2062,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
+ struct folio *folio = NULL;
int err;
vm_fault_t ret = VM_FAULT_LOCKED;
@@ -2125,10 +2125,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+ err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
gfp, vma, vmf, &ret);
if (err)
return vmf_error(err);
+ if (folio)
+ vmf->page = folio_file_page(folio, vmf->pgoff);
return ret;
}
@@ -2330,7 +2332,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
inode_init_owner(&init_user_ns, inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
@@ -2398,7 +2400,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
void *page_kaddr;
struct folio *folio;
- struct page *page;
int ret;
pgoff_t max_off;
@@ -2417,53 +2418,70 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (!*pagep) {
ret = -ENOMEM;
- page = shmem_alloc_page(gfp, info, pgoff);
- if (!page)
+ folio = shmem_alloc_folio(gfp, info, pgoff);
+ if (!folio)
goto out_unacct_blocks;
if (!zeropage) { /* COPY */
- page_kaddr = kmap_atomic(page);
+ page_kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
- *pagep = page;
+ *pagep = &folio->page;
ret = -ENOENT;
/* don't free the page */
goto out_unacct_blocks;
}
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
} else { /* ZEROPAGE */
- clear_user_highpage(page, dst_addr);
+ clear_user_highpage(&folio->page, dst_addr);
}
} else {
- page = *pagep;
+ folio = page_folio(*pagep);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
*pagep = NULL;
}
- VM_BUG_ON(PageLocked(page));
- VM_BUG_ON(PageSwapBacked(page));
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- __SetPageUptodate(page);
+ VM_BUG_ON(folio_test_locked(folio));
+ VM_BUG_ON(folio_test_swapbacked(folio));
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ __folio_mark_uptodate(folio);
ret = -EFAULT;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(pgoff >= max_off))
goto out_release;
- folio = page_folio(page);
ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
goto out_release;
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- page, true, wp_copy);
+ &folio->page, true, wp_copy);
if (ret)
goto out_delete_from_cache;
@@ -2473,13 +2491,13 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
- unlock_page(page);
+ folio_unlock(folio);
return 0;
out_delete_from_cache:
- delete_from_page_cache(page);
+ filemap_remove_folio(folio);
out_release:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out_unacct_blocks:
shmem_inode_unacct_blocks(inode, 1);
return ret;
@@ -2498,6 +2516,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
+ struct folio *folio;
int ret = 0;
/* i_rwsem is held by caller */
@@ -2509,14 +2528,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
+ ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
if (ret)
return ret;
+ *pagep = folio_file_page(folio, index);
if (PageHWPoison(*pagep)) {
- unlock_page(*pagep);
- put_page(*pagep);
+ folio_unlock(folio);
+ folio_put(folio);
*pagep = NULL;
return -EIO;
}
@@ -2575,6 +2595,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
offset = *ppos & ~PAGE_MASK;
for (;;) {
+ struct folio *folio = NULL;
struct page *page = NULL;
pgoff_t end_index;
unsigned long nr, ret;
@@ -2589,17 +2610,18 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, SGP_READ);
+ error = shmem_get_folio(inode, index, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
break;
}
- if (page) {
- unlock_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ page = folio_file_page(folio, index);
if (PageHWPoison(page)) {
- put_page(page);
+ folio_put(folio);
error = -EIO;
break;
}
@@ -2615,14 +2637,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (index == end_index) {
nr = i_size & ~PAGE_MASK;
if (nr <= offset) {
- if (page)
- put_page(page);
+ if (folio)
+ folio_put(folio);
break;
}
}
nr -= offset;
- if (page) {
+ if (folio) {
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
@@ -2634,13 +2656,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* Mark the page accessed if we read the beginning.
*/
if (!offset)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*/
ret = copy_page_to_iter(page, offset, nr, to);
- put_page(page);
+ folio_put(folio);
} else if (user_backed_iter(to)) {
/*
@@ -2783,7 +2805,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
info->fallocend = end;
for (index = start; index < end; ) {
- struct page *page;
+ struct folio *folio;
/*
* Good, the fallocate(2) manpage permits EINTR: we may have
@@ -2794,10 +2816,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM;
else
- error = shmem_getpage(inode, index, &page, SGP_FALLOC);
+ error = shmem_get_folio(inode, index, &folio,
+ SGP_FALLOC);
if (error) {
info->fallocend = undo_fallocend;
- /* Remove the !PageUptodate pages we added */
+ /* Remove the !uptodate folios we added */
if (index > start) {
shmem_undo_range(inode,
(loff_t)start << PAGE_SHIFT,
@@ -2806,37 +2829,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
goto undone;
}
- index++;
/*
* Here is a more important optimization than it appears:
- * a second SGP_FALLOC on the same huge page will clear it,
- * making it PageUptodate and un-undoable if we fail later.
+ * a second SGP_FALLOC on the same large folio will clear it,
+ * making it uptodate and un-undoable if we fail later.
*/
- if (PageTransCompound(page)) {
- index = round_up(index, HPAGE_PMD_NR);
- /* Beware 32-bit wraparound */
- if (!index)
- index--;
- }
+ index = folio_next_index(folio);
+ /* Beware 32-bit wraparound */
+ if (!index)
+ index--;
/*
* Inform shmem_writepage() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
- if (!PageUptodate(page))
+ if (!folio_test_uptodate(folio))
shmem_falloc.nr_falloced += index - shmem_falloc.next;
shmem_falloc.next = index;
/*
- * If !PageUptodate, leave it that way so that freeable pages
+ * If !uptodate, leave it that way so that freeable folios
* can be recognized if we need to rollback on error later.
- * But set_page_dirty so that memory pressure will swap rather
- * than free the pages we are allocating (and SGP_CACHE pages
+ * But mark it dirty so that memory pressure will swap rather
+ * than free the folios we are allocating (and SGP_CACHE folios
* might still be clean: we now need to mark those dirty too).
*/
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
cond_resched();
}
@@ -2901,6 +2921,7 @@ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
}
@@ -2912,7 +2933,7 @@ out_iput:
static int
shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+ struct file *file, umode_t mode)
{
struct inode *inode;
int error = -ENOSPC;
@@ -2927,9 +2948,9 @@ shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
error = simple_acl_create(dir, inode);
if (error)
goto out_iput;
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
}
- return error;
+ return finish_open_simple(file, error);
out_iput:
iput(inode);
return error;
@@ -2976,6 +2997,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
@@ -2993,6 +3015,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
dir->i_size -= BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
return 0;
@@ -3082,6 +3105,8 @@ static int shmem_rename2(struct user_namespace *mnt_userns,
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
inode->i_ctime = current_time(old_dir);
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
return 0;
}
@@ -3091,7 +3116,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
int error;
int len;
struct inode *inode;
- struct page *page;
+ struct folio *folio;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
@@ -3119,21 +3144,22 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
- error = shmem_getpage(inode, 0, &page, SGP_WRITE);
+ error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
if (error) {
iput(inode);
return error;
}
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
- memcpy(page_address(page), symname, len);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ memcpy(folio_address(folio), symname, len);
+ folio_mark_uptodate(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
return 0;
@@ -3141,40 +3167,41 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
static void shmem_put_link(void *arg)
{
- mark_page_accessed(arg);
- put_page(arg);
+ folio_mark_accessed(arg);
+ folio_put(arg);
}
static const char *shmem_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
int error;
+
if (!dentry) {
- page = find_get_page(inode->i_mapping, 0);
- if (!page)
+ folio = filemap_get_folio(inode->i_mapping, 0);
+ if (!folio)
return ERR_PTR(-ECHILD);
- if (PageHWPoison(page) ||
- !PageUptodate(page)) {
- put_page(page);
+ if (PageHWPoison(folio_page(folio, 0)) ||
+ !folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
- error = shmem_getpage(inode, 0, &page, SGP_READ);
+ error = shmem_get_folio(inode, 0, &folio, SGP_READ);
if (error)
return ERR_PTR(error);
- if (!page)
+ if (!folio)
return ERR_PTR(-ECHILD);
- if (PageHWPoison(page)) {
- unlock_page(page);
- put_page(page);
+ if (PageHWPoison(folio_page(folio, 0))) {
+ folio_unlock(folio);
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
- unlock_page(page);
+ folio_unlock(folio);
}
- set_delayed_call(done, shmem_put_link, page);
- return page_address(page);
+ set_delayed_call(done, shmem_put_link, folio);
+ return folio_address(folio);
}
#ifdef CONFIG_TMPFS_XATTR
@@ -3204,6 +3231,7 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns,
shmem_set_inode_flags(inode, info->fsflags);
inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
return 0;
}
@@ -3267,9 +3295,15 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ int err;
name = xattr_full_name(handler, name);
- return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+ err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+ if (!err) {
+ inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
+ }
+ return err;
}
static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3732,7 +3766,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC;
+ sb->s_flags |= SB_NOSEC | SB_I_VERSION;
#else
sb->s_flags |= SB_NOUSER;
#endif
@@ -4266,18 +4300,20 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
{
#ifdef CONFIG_SHMEM
struct inode *inode = mapping->host;
+ struct folio *folio;
struct page *page;
int error;
BUG_ON(!shmem_mapping(mapping));
- error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+ error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
gfp, NULL, NULL, NULL);
if (error)
return ERR_PTR(error);
- unlock_page(page);
+ folio_unlock(folio);
+ page = folio_file_page(folio, index);
if (PageHWPoison(page)) {
- put_page(page);
+ folio_put(folio);
return ERR_PTR(-EIO);
}
diff --git a/mm/shuffle.c b/mm/shuffle.c
index c13c33b247e8..fb1393b8b3a9 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -12,23 +12,22 @@
DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
static bool shuffle_param;
-static int shuffle_show(char *buffer, const struct kernel_param *kp)
-{
- return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N');
-}
-static __meminit int shuffle_store(const char *val,
+static __meminit int shuffle_param_set(const char *val,
const struct kernel_param *kp)
{
- int rc = param_set_bool(val, kp);
-
- if (rc < 0)
- return rc;
- if (shuffle_param)
+ if (param_set_bool(val, kp))
+ return -EINVAL;
+ if (*(bool *)kp->arg)
static_branch_enable(&page_alloc_shuffle_key);
return 0;
}
-module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+static const struct kernel_param_ops shuffle_param_ops = {
+ .set = shuffle_param_set,
+ .get = param_get_bool,
+};
+module_param_cb(shuffle, &shuffle_param_ops, &shuffle_param, 0400);
/*
* For two pages to be swapped in the shuffle, they must be free (on a
diff --git a/mm/slab.c b/mm/slab.c
index 10e96137b44f..59c8e28f7b6a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1619,7 +1619,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slab)
* although actual page can be freed in rcu context
*/
if (OFF_SLAB(cachep))
- kmem_cache_free(cachep->freelist_cache, freelist);
+ kfree(freelist);
}
/*
@@ -1671,21 +1671,27 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (flags & CFLGS_OFF_SLAB) {
struct kmem_cache *freelist_cache;
size_t freelist_size;
+ size_t freelist_cache_size;
freelist_size = num * sizeof(freelist_idx_t);
- freelist_cache = kmalloc_slab(freelist_size, 0u);
- if (!freelist_cache)
- continue;
-
- /*
- * Needed to avoid possible looping condition
- * in cache_grow_begin()
- */
- if (OFF_SLAB(freelist_cache))
- continue;
+ if (freelist_size > KMALLOC_MAX_CACHE_SIZE) {
+ freelist_cache_size = PAGE_SIZE << get_order(freelist_size);
+ } else {
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+ freelist_cache_size = freelist_cache->size;
+
+ /*
+ * Needed to avoid possible looping condition
+ * in cache_grow_begin()
+ */
+ if (OFF_SLAB(freelist_cache))
+ continue;
+ }
/* check if off slab has enough benefit */
- if (freelist_cache->size > cachep->size / 2)
+ if (freelist_cache_size > cachep->size / 2)
continue;
}
@@ -2061,11 +2067,6 @@ done:
cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
#endif
- if (OFF_SLAB(cachep)) {
- cachep->freelist_cache =
- kmalloc_slab(cachep->freelist_size, 0u);
- }
-
err = setup_cpu_cache(cachep, gfp);
if (err) {
__kmem_cache_release(cachep);
@@ -2292,7 +2293,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
freelist = NULL;
else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
- freelist = kmem_cache_alloc_node(cachep->freelist_cache,
+ freelist = kmalloc_node(cachep->freelist_size,
local_flags, nodeid);
} else {
/* We will use last bytes at the slab for freelist */
@@ -2380,7 +2381,7 @@ static bool freelist_state_initialize(union freelist_init_state *state,
unsigned int rand;
/* Use best entropy available to define a random shift */
- rand = get_random_int();
+ rand = get_random_u32();
/* Use a random state if the pre-computed list is not available */
if (!cachep->random_seq) {
@@ -3181,84 +3182,46 @@ must_grow:
}
static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
- unsigned long caller)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- unsigned long save_flags;
- void *ptr;
+ void *objp = NULL;
int slab_node = numa_mem_id();
- struct obj_cgroup *objcg = NULL;
- bool init = false;
-
- flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, NULL, &objcg, 1, flags);
- if (unlikely(!cachep))
- return NULL;
-
- ptr = kfence_alloc(cachep, orig_size, flags);
- if (unlikely(ptr))
- goto out_hooks;
-
- local_irq_save(save_flags);
-
- if (nodeid == NUMA_NO_NODE)
- nodeid = slab_node;
- if (unlikely(!get_node(cachep, nodeid))) {
- /* Node not bootstrapped yet */
- ptr = fallback_alloc(cachep, flags);
- goto out;
- }
-
- if (nodeid == slab_node) {
+ if (nodeid == NUMA_NO_NODE) {
+ if (current->mempolicy || cpuset_do_slab_mem_spread()) {
+ objp = alternate_node_alloc(cachep, flags);
+ if (objp)
+ goto out;
+ }
/*
* Use the locally cached objects if possible.
* However ____cache_alloc does not allow fallback
* to other nodes. It may fail while we still have
* objects on other nodes available.
*/
- ptr = ____cache_alloc(cachep, flags);
- if (ptr)
- goto out;
- }
- /* ___cache_alloc_node can fall back to other nodes */
- ptr = ____cache_alloc_node(cachep, flags, nodeid);
-out:
- local_irq_restore(save_flags);
- ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- init = slab_want_init_on_alloc(flags, cachep);
-
-out_hooks:
- slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr, init);
- return ptr;
-}
-
-static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
-{
- void *objp;
-
- if (current->mempolicy || cpuset_do_slab_mem_spread()) {
- objp = alternate_node_alloc(cache, flags);
- if (objp)
- goto out;
+ objp = ____cache_alloc(cachep, flags);
+ nodeid = slab_node;
+ } else if (nodeid == slab_node) {
+ objp = ____cache_alloc(cachep, flags);
+ } else if (!get_node(cachep, nodeid)) {
+ /* Node not bootstrapped yet */
+ objp = fallback_alloc(cachep, flags);
+ goto out;
}
- objp = ____cache_alloc(cache, flags);
/*
* We may just have run out of memory on the local node.
* ____cache_alloc_node() knows how to locate memory on other nodes
*/
if (!objp)
- objp = ____cache_alloc_node(cache, flags, numa_mem_id());
-
+ objp = ____cache_alloc_node(cachep, flags, nodeid);
out:
return objp;
}
#else
static __always_inline void *
-__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags, int nodeid __maybe_unused)
{
return ____cache_alloc(cachep, flags);
}
@@ -3266,8 +3229,8 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
- size_t orig_size, unsigned long caller)
+slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ int nodeid, size_t orig_size, unsigned long caller)
{
unsigned long save_flags;
void *objp;
@@ -3284,7 +3247,7 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
goto out;
local_irq_save(save_flags);
- objp = __do_cache_alloc(cachep, flags);
+ objp = __do_cache_alloc(cachep, flags, nodeid);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
@@ -3295,6 +3258,14 @@ out:
return objp;
}
+static __always_inline void *
+slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
+ size_t orig_size, unsigned long caller)
+{
+ return slab_alloc_node(cachep, lru, flags, NUMA_NO_NODE, orig_size,
+ caller);
+}
+
/*
* Caller needs to acquire correct kmem_cache_node's list_lock
* @list: List of detached free slabs should be freed by caller
@@ -3470,8 +3441,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
{
void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc(_RET_IP_, ret, cachep,
- cachep->object_size, cachep->size, flags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, NUMA_NO_NODE);
return ret;
}
@@ -3521,7 +3491,8 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
local_irq_disable();
for (i = 0; i < size; i++) {
- void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
+ void *objp = kfence_alloc(s, s->object_size, flags) ?:
+ __do_cache_alloc(s, flags, NUMA_NO_NODE);
if (unlikely(!objp))
goto error;
@@ -3548,23 +3519,6 @@ error:
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
-#ifdef CONFIG_TRACING
-void *
-kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
-{
- void *ret;
-
- ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(_RET_IP_, ret, cachep,
- size, cachep->size, flags);
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_trace);
-#endif
-
-#ifdef CONFIG_NUMA
/**
* kmem_cache_alloc_node - Allocate an object on the specified node
* @cachep: The cache to allocate from.
@@ -3580,65 +3534,21 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
+ void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep,
- cachep->object_size, cachep->size,
- flags, nodeid);
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep, flags, nodeid);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid,
- size_t size)
-{
- void *ret;
-
- ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc_node(_RET_IP_, ret, cachep,
- size, cachep->size,
- flags, nodeid);
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
-#endif
-
-static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
-{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
- ret = kasan_kmalloc(cachep, ret, size, flags);
-
- return ret;
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+ int nodeid, size_t orig_size,
+ unsigned long caller)
{
- return __do_kmalloc_node(size, flags, node, _RET_IP_);
+ return slab_alloc_node(cachep, NULL, flags, nodeid,
+ orig_size, caller);
}
-EXPORT_SYMBOL(__kmalloc_node);
-
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
- int node, unsigned long caller)
-{
- return __do_kmalloc_node(size, flags, node, caller);
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif /* CONFIG_NUMA */
#ifdef CONFIG_PRINTK
void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
@@ -3662,45 +3572,25 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
}
#endif
-/**
- * __do_kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kmalloc).
- * @caller: function caller for debug tracking of the caller
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
- unsigned long caller)
+static __always_inline
+void __do_kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
- struct kmem_cache *cachep;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return NULL;
- cachep = kmalloc_slab(size, flags);
- if (unlikely(ZERO_OR_NULL_PTR(cachep)))
- return cachep;
- ret = slab_alloc(cachep, NULL, flags, size, caller);
-
- ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(caller, ret, cachep,
- size, cachep->size, flags);
-
- return ret;
-}
+ unsigned long flags;
-void *__kmalloc(size_t size, gfp_t flags)
-{
- return __do_kmalloc(size, flags, _RET_IP_);
+ local_irq_save(flags);
+ debug_check_no_locks_freed(objp, cachep->object_size);
+ if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, cachep->object_size);
+ __cache_free(cachep, objp, caller);
+ local_irq_restore(flags);
}
-EXPORT_SYMBOL(__kmalloc);
-void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
+void __kmem_cache_free(struct kmem_cache *cachep, void *objp,
+ unsigned long caller)
{
- return __do_kmalloc(size, flags, caller);
+ __do_kmem_cache_free(cachep, objp, caller);
}
-EXPORT_SYMBOL(__kmalloc_track_caller);
/**
* kmem_cache_free - Deallocate an object
@@ -3712,34 +3602,38 @@ EXPORT_SYMBOL(__kmalloc_track_caller);
*/
void kmem_cache_free(struct kmem_cache *cachep, void *objp)
{
- unsigned long flags;
cachep = cache_from_obj(cachep, objp);
if (!cachep)
return;
- trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
- local_irq_save(flags);
- debug_check_no_locks_freed(objp, cachep->object_size);
- if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(objp, cachep->object_size);
- __cache_free(cachep, objp, _RET_IP_);
- local_irq_restore(flags);
+ trace_kmem_cache_free(_RET_IP_, objp, cachep);
+ __do_kmem_cache_free(cachep, objp, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
{
- struct kmem_cache *s;
- size_t i;
local_irq_disable();
- for (i = 0; i < size; i++) {
+ for (int i = 0; i < size; i++) {
void *objp = p[i];
+ struct kmem_cache *s;
- if (!orig_s) /* called via kfree_bulk */
- s = virt_to_cache(objp);
- else
+ if (!orig_s) {
+ struct folio *folio = virt_to_folio(objp);
+
+ /* called via kfree_bulk */
+ if (!folio_test_slab(folio)) {
+ local_irq_enable();
+ free_large_kmalloc(folio, objp);
+ local_irq_disable();
+ continue;
+ }
+ s = folio_slab(folio)->slab_cache;
+ } else {
s = cache_from_obj(orig_s, objp);
+ }
+
if (!s)
continue;
@@ -3755,39 +3649,6 @@ void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-/**
- * kfree - free previously allocated memory
- * @objp: pointer returned by kmalloc.
- *
- * If @objp is NULL, no operation is performed.
- *
- * Don't free memory not originally allocated by kmalloc()
- * or you will run into trouble.
- */
-void kfree(const void *objp)
-{
- struct kmem_cache *c;
- unsigned long flags;
-
- trace_kfree(_RET_IP_, objp);
-
- if (unlikely(ZERO_OR_NULL_PTR(objp)))
- return;
- local_irq_save(flags);
- kfree_debugcheck(objp);
- c = virt_to_cache(objp);
- if (!c) {
- local_irq_restore(flags);
- return;
- }
- debug_check_no_locks_freed(objp, c->object_size);
-
- debug_check_no_obj_freed(objp, c->object_size);
- __cache_free(c, (void *)objp, _RET_IP_);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(kfree);
-
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
@@ -4190,28 +4051,3 @@ void __check_heap_object(const void *ptr, unsigned long n,
usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
-
-/**
- * __ksize -- Uninstrumented ksize.
- * @objp: pointer to the object
- *
- * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
- * safety checks as ksize() with KASAN instrumentation enabled.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
-size_t __ksize(const void *objp)
-{
- struct kmem_cache *c;
- size_t size;
-
- BUG_ON(!objp);
- if (unlikely(objp == ZERO_SIZE_PTR))
- return 0;
-
- c = virt_to_cache(objp);
- size = c ? c->object_size : 0;
-
- return size;
-}
-EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
index 4ec82bec15ec..0202a8c2f0d2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -273,6 +273,11 @@ void create_kmalloc_caches(slab_flags_t);
/* Find the kmalloc slab corresponding for a certain size */
struct kmem_cache *kmalloc_slab(size_t, gfp_t);
+
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t orig_size,
+ unsigned long caller);
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller);
#endif
gfp_t kmalloc_fix_flags(gfp_t flags);
@@ -658,8 +663,13 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
print_tracking(cachep, x);
return cachep;
}
+
+void free_large_kmalloc(struct folio *folio, void *object);
+
#endif /* CONFIG_SLOB */
+size_t __ksize(const void *objp);
+
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifndef CONFIG_SLUB
@@ -729,6 +739,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
memset(p[i], 0, s->object_size);
kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
+ kmsan_slab_alloc(s, p[i], flags);
}
memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 07b948288f84..0042fb2730d1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -475,6 +475,7 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
int refcnt;
+ bool rcu_set;
if (unlikely(!s) || !kasan_check_byte(s))
return;
@@ -482,6 +483,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
cpus_read_lock();
mutex_lock(&slab_mutex);
+ rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU;
+
refcnt = --s->refcount;
if (refcnt)
goto out_unlock;
@@ -492,7 +495,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
out_unlock:
mutex_unlock(&slab_mutex);
cpus_read_unlock();
- if (!refcnt && !(s->flags & SLAB_TYPESAFE_BY_RCU))
+ if (!refcnt && !rcu_set)
kmem_cache_release(s);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -508,13 +511,9 @@ EXPORT_SYMBOL(kmem_cache_destroy);
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
- int ret;
-
-
kasan_cache_shrink(cachep);
- ret = __kmem_cache_shrink(cachep);
- return ret;
+ return __kmem_cache_shrink(cachep);
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -662,7 +661,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
if (!s)
panic("Out of memory when creating slab %s\n", name);
- create_boot_cache(s, name, size, flags, useroffset, usersize);
+ create_boot_cache(s, name, size, flags | SLAB_KMALLOC, useroffset,
+ usersize);
kasan_cache_create_kmalloc(s);
list_add(&s->list, &slab_caches);
s->refcount = 1;
@@ -734,6 +734,26 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
return kmalloc_caches[kmalloc_type(flags)][index];
}
+size_t kmalloc_size_roundup(size_t size)
+{
+ struct kmem_cache *c;
+
+ /* Short-circuit the 0 size case. */
+ if (unlikely(size == 0))
+ return 0;
+ /* Short-circuit saturated "too-large" case. */
+ if (unlikely(size == SIZE_MAX))
+ return SIZE_MAX;
+ /* Above the smaller buckets, size is a multiple of page size. */
+ if (size > KMALLOC_MAX_CACHE_SIZE)
+ return PAGE_SIZE << get_order(size);
+
+ /* The flags don't matter since size_index is common to all. */
+ c = kmalloc_slab(size, GFP_KERNEL);
+ return c ? c->object_size : 0;
+}
+EXPORT_SYMBOL(kmalloc_size_roundup);
+
#ifdef CONFIG_ZONE_DMA
#define KMALLOC_DMA_NAME(sz) .name[KMALLOC_DMA] = "dma-kmalloc-" #sz,
#else
@@ -757,8 +777,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
/*
* kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
- * kmalloc_index() supports up to 2^25=32MB, so the final entry of the table is
- * kmalloc-32M.
+ * kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
+ * kmalloc-2M.
*/
const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(0, 0),
@@ -782,11 +802,7 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
INIT_KMALLOC_INFO(262144, 256k),
INIT_KMALLOC_INFO(524288, 512k),
INIT_KMALLOC_INFO(1048576, 1M),
- INIT_KMALLOC_INFO(2097152, 2M),
- INIT_KMALLOC_INFO(4194304, 4M),
- INIT_KMALLOC_INFO(8388608, 8M),
- INIT_KMALLOC_INFO(16777216, 16M),
- INIT_KMALLOC_INFO(33554432, 32M)
+ INIT_KMALLOC_INFO(2097152, 2M)
};
/*
@@ -899,6 +915,154 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/* Kmalloc array is now usable */
slab_state = UP;
}
+
+void free_large_kmalloc(struct folio *folio, void *object)
+{
+ unsigned int order = folio_order(folio);
+
+ if (WARN_ON_ONCE(order == 0))
+ pr_warn_once("object pointer: 0x%p\n", object);
+
+ kmemleak_free(object);
+ kasan_kfree_large(object);
+ kmsan_kfree_large(object);
+
+ mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(folio_page(folio, 0), order);
+}
+
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node);
+static __always_inline
+void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
+{
+ struct kmem_cache *s;
+ void *ret;
+
+ if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
+ ret = __kmalloc_large_node(size, flags, node);
+ trace_kmalloc(caller, ret, size,
+ PAGE_SIZE << get_order(size), flags, node);
+ return ret;
+ }
+
+ s = kmalloc_slab(size, flags);
+
+ if (unlikely(ZERO_OR_NULL_PTR(s)))
+ return s;
+
+ ret = __kmem_cache_alloc_node(s, flags, node, size, caller);
+ ret = kasan_kmalloc(s, ret, size, flags);
+ trace_kmalloc(caller, ret, size, s->size, flags, node);
+ return ret;
+}
+
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+ return __do_kmalloc_node(size, flags, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *__kmalloc(size_t size, gfp_t flags)
+{
+ return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+
+/**
+ * kfree - free previously allocated memory
+ * @object: pointer returned by kmalloc.
+ *
+ * If @object is NULL, no operation is performed.
+ *
+ * Don't free memory not originally allocated by kmalloc()
+ * or you will run into trouble.
+ */
+void kfree(const void *object)
+{
+ struct folio *folio;
+ struct slab *slab;
+ struct kmem_cache *s;
+
+ trace_kfree(_RET_IP_, object);
+
+ if (unlikely(ZERO_OR_NULL_PTR(object)))
+ return;
+
+ folio = virt_to_folio(object);
+ if (unlikely(!folio_test_slab(folio))) {
+ free_large_kmalloc(folio, (void *)object);
+ return;
+ }
+
+ slab = folio_slab(folio);
+ s = slab->slab_cache;
+ __kmem_cache_free(s, (void *)object, _RET_IP_);
+}
+EXPORT_SYMBOL(kfree);
+
+/**
+ * __ksize -- Report full size of underlying allocation
+ * @object: pointer to the object
+ *
+ * This should only be used internally to query the true size of allocations.
+ * It is not meant to be a way to discover the usable size of an allocation
+ * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
+ * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
+ * and/or FORTIFY_SOURCE.
+ *
+ * Return: size of the actual memory used by @object in bytes
+ */
+size_t __ksize(const void *object)
+{
+ struct folio *folio;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return 0;
+
+ folio = virt_to_folio(object);
+
+ if (unlikely(!folio_test_slab(folio))) {
+ if (WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE))
+ return 0;
+ if (WARN_ON(object != folio_address(folio)))
+ return 0;
+ return folio_size(folio);
+ }
+
+ return slab_ksize(folio_slab(folio)->slab_cache);
+}
+
+void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+{
+ void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE,
+ size, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_trace);
+
+void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t size)
+{
+ void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_);
+
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node);
+
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_node_trace);
#endif /* !CONFIG_SLOB */
gfp_t kmalloc_fix_flags(gfp_t flags)
@@ -918,37 +1082,51 @@ gfp_t kmalloc_fix_flags(gfp_t flags)
* directly to the page allocator. We use __GFP_COMP, because we will need to
* know the allocation order to free the pages properly in kfree.
*/
-void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+
+static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- void *ret = NULL;
struct page *page;
+ void *ptr = NULL;
+ unsigned int order = get_order(size);
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
flags |= __GFP_COMP;
- page = alloc_pages(flags, order);
- if (likely(page)) {
- ret = page_address(page);
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
+ ptr = page_address(page);
mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
PAGE_SIZE << order);
}
- ret = kasan_kmalloc_large(ret, size, flags);
- /* As ret might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ret, size, 1, flags);
+
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc(ptr, size, 1, flags);
+ kmsan_kmalloc_large(ptr, size, flags);
+
+ return ptr;
+}
+
+void *kmalloc_large(size_t size, gfp_t flags)
+{
+ void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE);
+
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+ flags, NUMA_NO_NODE);
return ret;
}
-EXPORT_SYMBOL(kmalloc_order);
+EXPORT_SYMBOL(kmalloc_large);
-#ifdef CONFIG_TRACING
-void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
- void *ret = kmalloc_order(size, flags, order);
- trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags);
+ void *ret = __kmalloc_large_node(size, flags, node);
+
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size),
+ flags, node);
return ret;
}
-EXPORT_SYMBOL(kmalloc_order_trace);
-#endif
+EXPORT_SYMBOL(kmalloc_large_node);
#ifdef CONFIG_SLAB_FREELIST_RANDOM
/* Randomize a generic freelist */
@@ -1147,8 +1325,8 @@ module_init(slab_proc_init);
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
-static __always_inline void *__do_krealloc(const void *p, size_t new_size,
- gfp_t flags)
+static __always_inline __realloc_size(2) void *
+__do_krealloc(const void *p, size_t new_size, gfp_t flags)
{
void *ret;
size_t ks;
@@ -1231,20 +1409,6 @@ void kfree_sensitive(const void *p)
}
EXPORT_SYMBOL(kfree_sensitive);
-/**
- * ksize - get the actual amount of memory allocated for a given object
- * @objp: Pointer to the object
- *
- * kmalloc may internally round up allocations and return more memory
- * than requested. ksize() can be used to determine the actual amount of
- * memory allocated. The caller may use this additional memory, even though
- * a smaller amount of memory was initially specified with the kmalloc call.
- * The caller must guarantee that objp points to a valid object previously
- * allocated with either kmalloc() or kmem_cache_alloc(). The object
- * must not be freed during the duration of the call.
- *
- * Return: size of the actual memory used by @objp in bytes
- */
size_t ksize(const void *objp)
{
size_t size;
@@ -1280,8 +1444,6 @@ EXPORT_SYMBOL(ksize);
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
EXPORT_TRACEPOINT_SYMBOL(kfree);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --git a/mm/slob.c b/mm/slob.c
index 2bd4f476c340..fe567fcfa3a3 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -507,8 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
*m = size;
ret = (void *)m + minalign;
- trace_kmalloc_node(caller, ret, NULL,
- size, size + minalign, gfp, node);
+ trace_kmalloc(caller, ret, size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -516,8 +515,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
gfp |= __GFP_COMP;
ret = slob_new_pages(gfp, order, node);
- trace_kmalloc_node(caller, ret, NULL,
- size, PAGE_SIZE << order, gfp, node);
+ trace_kmalloc(caller, ret, size, PAGE_SIZE << order, gfp, node);
}
kmemleak_alloc(ret, size, 1, gfp);
@@ -530,20 +528,12 @@ void *__kmalloc(size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(__kmalloc);
-void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
-{
- return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
int node, unsigned long caller)
{
return __do_kmalloc_node(size, gfp, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif
void kfree(const void *block)
{
@@ -574,6 +564,20 @@ void kfree(const void *block)
}
EXPORT_SYMBOL(kfree);
+size_t kmalloc_size_roundup(size_t size)
+{
+ /* Short-circuit the 0 size case. */
+ if (unlikely(size == 0))
+ return 0;
+ /* Short-circuit saturated "too-large" case. */
+ if (unlikely(size == SIZE_MAX))
+ return SIZE_MAX;
+
+ return ALIGN(size, ARCH_KMALLOC_MINALIGN);
+}
+
+EXPORT_SYMBOL(kmalloc_size_roundup);
+
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t __ksize(const void *block)
{
@@ -594,7 +598,6 @@ size_t __ksize(const void *block)
m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
}
-EXPORT_SYMBOL(__ksize);
int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{
@@ -602,6 +605,9 @@ int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
/* leave room for rcu footer at the end of object */
c->size += sizeof(struct slob_rcu);
}
+
+ /* Actual size allocated */
+ c->size = SLOB_UNITS(c->size) * SLOB_UNIT;
c->flags = flags;
return 0;
}
@@ -616,14 +622,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node, 0);
- trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
- SLOB_UNITS(c->size) * SLOB_UNIT,
- flags, node);
+ trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
- PAGE_SIZE << get_order(c->size),
- flags, node);
+ trace_kmem_cache_alloc(_RET_IP_, b, c, flags, node);
}
if (b && c->ctor) {
@@ -647,7 +649,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, gfp_
return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);
-#ifdef CONFIG_NUMA
+
void *__kmalloc_node(size_t size, gfp_t gfp, int node)
{
return __do_kmalloc_node(size, gfp, node, _RET_IP_);
@@ -659,7 +661,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
return slob_alloc_node(cachep, gfp, node);
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif
static void __kmem_cache_free(void *b, int size)
{
@@ -680,7 +681,7 @@ static void kmem_rcu_free(struct rcu_head *head)
void kmem_cache_free(struct kmem_cache *c, void *b)
{
kmemleak_free_recursive(b, c->flags);
- trace_kmem_cache_free(_RET_IP_, b, c->name);
+ trace_kmem_cache_free(_RET_IP_, b, c);
if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
struct slob_rcu *slob_rcu;
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
diff --git a/mm/slub.c b/mm/slub.c
index 862dbd9af4f5..157527d7101b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -22,6 +22,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
+#include <linux/kmsan.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
@@ -50,7 +51,7 @@
* 1. slab_mutex (Global Mutex)
* 2. node->list_lock (Spinlock)
* 3. kmem_cache->cpu_slab->lock (Local lock)
- * 4. slab_lock(slab) (Only on some arches or for debugging)
+ * 4. slab_lock(slab) (Only on some arches)
* 5. object_map_lock (Only for debugging)
*
* slab_mutex
@@ -64,8 +65,9 @@
* The slab_lock is a wrapper around the page lock, thus it is a bit
* spinlock.
*
- * The slab_lock is only used for debugging and on arches that do not
- * have the ability to do a cmpxchg_double. It only protects:
+ * The slab_lock is only used on arches that do not have the ability
+ * to do a cmpxchg_double. It only protects:
+ *
* A. slab->freelist -> List of free objects in a slab
* B. slab->inuse -> Number of objects in use
* C. slab->objects -> Number of objects in slab
@@ -94,15 +96,20 @@
* allocating a long series of objects that fill up slabs does not require
* the list lock.
*
+ * For debug caches, all allocations are forced to go through a list_lock
+ * protected region to serialize against concurrent validation.
+ *
* cpu_slab->lock local lock
*
* This locks protect slowpath manipulation of all kmem_cache_cpu fields
* except the stat counters. This is a percpu structure manipulated only by
* the local cpu, so the lock protects against being preempted or interrupted
* by an irq. Fast path operations rely on lockless operations instead.
- * On PREEMPT_RT, the local lock does not actually disable irqs (and thus
- * prevent the lockless operations), so fastpath operations also need to take
- * the lock and are no longer lockless.
+ *
+ * On PREEMPT_RT, the local lock neither disables interrupts nor preemption
+ * which means the lockless fastpath cannot be used as it might interfere with
+ * an in-progress slow path operations. In this case the local lock is always
+ * taken but it still utilizes the freelist for the common operations.
*
* lockless fastpaths
*
@@ -163,8 +170,9 @@
* function call even on !PREEMPT_RT, use inline preempt_disable() there.
*/
#ifndef CONFIG_PREEMPT_RT
-#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
-#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#define slub_get_cpu_ptr(var) get_cpu_ptr(var)
+#define slub_put_cpu_ptr(var) put_cpu_ptr(var)
+#define USE_LOCKLESS_FAST_PATH() (true)
#else
#define slub_get_cpu_ptr(var) \
({ \
@@ -176,6 +184,7 @@ do { \
(void)(var); \
migrate_enable(); \
} while (0)
+#define USE_LOCKLESS_FAST_PATH() (false)
#endif
#ifdef CONFIG_SLUB_DEBUG
@@ -186,11 +195,24 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
#endif
#endif /* CONFIG_SLUB_DEBUG */
+/* Structure holding parameters for get_partial() call chain */
+struct partial_context {
+ struct slab **slab;
+ gfp_t flags;
+ unsigned int orig_size;
+};
+
static inline bool kmem_cache_debug(struct kmem_cache *s)
{
return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
+static inline bool slub_debug_orig_size(struct kmem_cache *s)
+{
+ return (kmem_cache_debug_flags(s, SLAB_STORE_USER) &&
+ (s->flags & SLAB_KMALLOC));
+}
+
void *fixup_red_left(struct kmem_cache *s, void *p)
{
if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
@@ -310,6 +332,11 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
*/
static nodemask_t slab_nodes;
+/*
+ * Workqueue used for flush_cpu_slab().
+ */
+static struct workqueue_struct *flushwq;
+
/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -359,6 +386,17 @@ static void prefetch_freepointer(const struct kmem_cache *s, void *object)
prefetchw(object + s->offset);
}
+/*
+ * When running under KMSAN, get_freepointer_safe() may return an uninitialized
+ * pointer value in the case the current thread loses the race for the next
+ * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
+ * slab_alloc_node() will fail, so the uninitialized value won't be used, but
+ * KMSAN will still check all arguments of cmpxchg because of imperfect
+ * handling of inline assembly.
+ * To work around this problem, we apply __no_kmsan_checks to ensure that
+ * get_freepointer_safe() returns initialized memory.
+ */
+__no_kmsan_checks
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
unsigned long freepointer_addr;
@@ -442,7 +480,7 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
/*
* Per slab locking using the pagelock
*/
-static __always_inline void __slab_lock(struct slab *slab)
+static __always_inline void slab_lock(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -450,7 +488,7 @@ static __always_inline void __slab_lock(struct slab *slab)
bit_spin_lock(PG_locked, &page->flags);
}
-static __always_inline void __slab_unlock(struct slab *slab)
+static __always_inline void slab_unlock(struct slab *slab)
{
struct page *page = slab_page(slab);
@@ -458,31 +496,19 @@ static __always_inline void __slab_unlock(struct slab *slab)
__bit_spin_unlock(PG_locked, &page->flags);
}
-static __always_inline void slab_lock(struct slab *slab, unsigned long *flags)
-{
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_save(*flags);
- __slab_lock(slab);
-}
-
-static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags)
-{
- __slab_unlock(slab);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_restore(*flags);
-}
-
/*
* Interrupts must be disabled (for the fallback code to work right), typically
- * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
- * so we disable interrupts as part of slab_[un]lock().
+ * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is
+ * part of bit_spin_lock(), is sufficient because the policy is not to allow any
+ * allocation/ free operation in hardirq context. Therefore nothing can
+ * interrupt the operation.
*/
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ if (USE_LOCKLESS_FAST_PATH())
lockdep_assert_irqs_disabled();
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
@@ -494,18 +520,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab
} else
#endif
{
- /* init to 0 to prevent spurious warnings */
- unsigned long flags = 0;
-
- slab_lock(slab, &flags);
+ slab_lock(slab);
if (slab->freelist == freelist_old &&
slab->counters == counters_old) {
slab->freelist = freelist_new;
slab->counters = counters_new;
- slab_unlock(slab, &flags);
+ slab_unlock(slab);
return true;
}
- slab_unlock(slab, &flags);
+ slab_unlock(slab);
}
cpu_relax();
@@ -536,16 +559,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
unsigned long flags;
local_irq_save(flags);
- __slab_lock(slab);
+ slab_lock(slab);
if (slab->freelist == freelist_old &&
slab->counters == counters_old) {
slab->freelist = freelist_new;
slab->counters = counters_new;
- __slab_unlock(slab);
+ slab_unlock(slab);
local_irq_restore(flags);
return true;
}
- __slab_unlock(slab);
+ slab_unlock(slab);
local_irq_restore(flags);
}
@@ -561,7 +584,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
#ifdef CONFIG_SLUB_DEBUG
static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
-static DEFINE_RAW_SPINLOCK(object_map_lock);
+static DEFINE_SPINLOCK(object_map_lock);
static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
struct slab *slab)
@@ -595,30 +618,6 @@ static bool slab_add_kunit_errors(void)
static inline bool slab_add_kunit_errors(void) { return false; }
#endif
-/*
- * Determine a map of objects in use in a slab.
- *
- * Node listlock must be held to guarantee that the slab does
- * not vanish from under us.
- */
-static unsigned long *get_map(struct kmem_cache *s, struct slab *slab)
- __acquires(&object_map_lock)
-{
- VM_BUG_ON(!irqs_disabled());
-
- raw_spin_lock(&object_map_lock);
-
- __fill_map(object_map, s, slab);
-
- return object_map;
-}
-
-static void put_map(unsigned long *map) __releases(&object_map_lock)
-{
- VM_BUG_ON(map != object_map);
- raw_spin_unlock(&object_map_lock);
-}
-
static inline unsigned int size_from_object(struct kmem_cache *s)
{
if (s->flags & SLAB_RED_ZONE)
@@ -816,6 +815,39 @@ static void print_slab_info(const struct slab *slab)
folio_flags(folio, 0));
}
+/*
+ * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API
+ * family will round up the real request size to these fixed ones, so
+ * there could be an extra area than what is requested. Save the original
+ * request size in the meta data area, for better debug and sanity check.
+ */
+static inline void set_orig_size(struct kmem_cache *s,
+ void *object, unsigned int orig_size)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return;
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ *(unsigned int *)p = orig_size;
+}
+
+static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+{
+ void *p = kasan_reset_tag(object);
+
+ if (!slub_debug_orig_size(s))
+ return s->object_size;
+
+ p += get_info_end(s);
+ p += sizeof(struct track) * 2;
+
+ return *(unsigned int *)p;
+}
+
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
{
struct va_format vaf;
@@ -875,6 +907,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
+ if (slub_debug_orig_size(s))
+ off += sizeof(unsigned int);
+
off += kasan_metadata_size(s);
if (off != size_from_object(s))
@@ -1008,7 +1043,8 @@ skip_bug_print:
*
* A. Free pointer (if we cannot overwrite object on free)
* B. Tracking data for SLAB_STORE_USER
- * C. Padding to reach required alignment boundary or at minimum
+ * C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
+ * D. Padding to reach required alignment boundary or at minimum
* one word if debugging is on to be able to detect writes
* before the word boundary.
*
@@ -1026,10 +1062,14 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
unsigned long off = get_info_end(s); /* The end of info */
- if (s->flags & SLAB_STORE_USER)
+ if (s->flags & SLAB_STORE_USER) {
/* We also have user information there */
off += 2 * sizeof(struct track);
+ if (s->flags & SLAB_KMALLOC)
+ off += sizeof(unsigned int);
+ }
+
off += kasan_metadata_size(s);
if (size_from_object(s) == off)
@@ -1324,18 +1364,16 @@ static inline int alloc_consistency_checks(struct kmem_cache *s,
}
static noinline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab,
- void *object, unsigned long addr)
+ struct slab *slab, void *object, int orig_size)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
if (!alloc_consistency_checks(s, slab, object))
goto bad;
}
- /* Success perform special debug activities for allocs */
- if (s->flags & SLAB_STORE_USER)
- set_track(s, object, TRACK_ALLOC, addr);
+ /* Success. Perform special debug activities for allocs */
trace(s, slab, object, 1);
+ set_orig_size(s, object, orig_size);
init_object(s, object, SLUB_RED_ACTIVE);
return 1;
@@ -1385,63 +1423,6 @@ static inline int free_consistency_checks(struct kmem_cache *s,
return 1;
}
-/* Supports checking bulk free of a constructed freelist */
-static noinline int free_debug_processing(
- struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr)
-{
- struct kmem_cache_node *n = get_node(s, slab_nid(slab));
- void *object = head;
- int cnt = 0;
- unsigned long flags, flags2;
- int ret = 0;
- depot_stack_handle_t handle = 0;
-
- if (s->flags & SLAB_STORE_USER)
- handle = set_track_prepare();
-
- spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(slab, &flags2);
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!check_slab(s, slab))
- goto out;
- }
-
-next_object:
- cnt++;
-
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!free_consistency_checks(s, slab, object, addr))
- goto out;
- }
-
- if (s->flags & SLAB_STORE_USER)
- set_track_update(s, object, TRACK_FREE, addr, handle);
- trace(s, slab, object, 0);
- /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
- init_object(s, object, SLUB_RED_INACTIVE);
-
- /* Reached end of constructed freelist yet? */
- if (object != tail) {
- object = get_freepointer(s, object);
- goto next_object;
- }
- ret = 1;
-
-out:
- if (cnt != bulk_cnt)
- slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
- bulk_cnt, cnt);
-
- slab_unlock(slab, &flags2);
- spin_unlock_irqrestore(&n->list_lock, flags);
- if (!ret)
- slab_fix(s, "Object at 0x%p not freed", object);
- return ret;
-}
-
/*
* Parse a block of slub_debug options. Blocks are delimited by ';'
*
@@ -1661,16 +1642,18 @@ static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
- struct slab *slab, void *object, unsigned long addr) { return 0; }
+ struct slab *slab, void *object, int orig_size) { return 0; }
-static inline int free_debug_processing(
+static inline void free_debug_processing(
struct kmem_cache *s, struct slab *slab,
void *head, void *tail, int bulk_cnt,
- unsigned long addr) { return 0; }
+ unsigned long addr) {}
static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
+static inline void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr) {}
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -1704,24 +1687,11 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- ptr = kasan_kmalloc_large(ptr, size, flags);
- /* As ptr might get tagged, call kmemleak hook after KASAN. */
- kmemleak_alloc(ptr, size, 1, flags);
- return ptr;
-}
-
-static __always_inline void kfree_hook(void *x)
-{
- kmemleak_free(x);
- kasan_kfree_large(x);
-}
-
static __always_inline bool slab_free_hook(struct kmem_cache *s,
void *x, bool init)
{
kmemleak_free_recursive(x, s->flags);
+ kmsan_slab_free(s, x);
debug_check_no_locks_freed(x, s->object_size);
@@ -1911,7 +1881,7 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
return false;
freelist_count = oo_objects(s->oo);
- pos = get_random_int() % freelist_count;
+ pos = prandom_u32_max(freelist_count);
page_limit = slab->objects * s->size;
start = fixup_red_left(s, slab_address(slab));
@@ -1976,11 +1946,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab))
- goto out;
+ return NULL;
stat(s, ORDER_FALLBACK);
}
slab->objects = oo_objects(oo);
+ slab->inuse = 0;
+ slab->frozen = 0;
account_slab(slab, oo_order(oo), s, flags);
@@ -2007,15 +1979,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, p, NULL);
}
- slab->inuse = slab->objects;
- slab->frozen = 1;
-
-out:
- if (!slab)
- return NULL;
-
- inc_slabs_node(s, slab_nid(slab), slab->objects);
-
return slab;
}
@@ -2103,6 +2066,75 @@ static inline void remove_partial(struct kmem_cache_node *n,
}
/*
+ * Called only for kmem_cache_debug() caches instead of acquire_slab(), with a
+ * slab from the n->partial list. Remove only a single object from the slab, do
+ * the alloc_debug_processing() checks and leave the slab on the list, or move
+ * it to full list if it was the last free object.
+ */
+static void *alloc_single_from_partial(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct slab *slab, int orig_size)
+{
+ void *object;
+
+ lockdep_assert_held(&n->list_lock);
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse++;
+
+ if (!alloc_debug_processing(s, slab, object, orig_size)) {
+ remove_partial(n, slab);
+ return NULL;
+ }
+
+ if (slab->inuse == slab->objects) {
+ remove_partial(n, slab);
+ add_full(s, n, slab);
+ }
+
+ return object;
+}
+
+/*
+ * Called only for kmem_cache_debug() caches to allocate from a freshly
+ * allocated slab. Allocate a single object instead of whole freelist
+ * and put the slab to the partial (or full) list.
+ */
+static void *alloc_single_from_new_slab(struct kmem_cache *s,
+ struct slab *slab, int orig_size)
+{
+ int nid = slab_nid(slab);
+ struct kmem_cache_node *n = get_node(s, nid);
+ unsigned long flags;
+ void *object;
+
+
+ object = slab->freelist;
+ slab->freelist = get_freepointer(s, object);
+ slab->inuse = 1;
+
+ if (!alloc_debug_processing(s, slab, object, orig_size))
+ /*
+ * It's not really expected that this would fail on a
+ * freshly allocated slab, but a concurrent memory
+ * corruption in theory could cause that.
+ */
+ return NULL;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (slab->inuse == slab->objects)
+ add_full(s, n, slab);
+ else
+ add_partial(n, slab, DEACTIVATE_TO_HEAD);
+
+ inc_slabs_node(s, nid, slab->objects);
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ return object;
+}
+
+/*
* Remove slab from the partial list, freeze it and
* return the pointer to the freelist.
*
@@ -2159,7 +2191,7 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
* Try to allocate a partial slab from a specific node.
*/
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
- struct slab **ret_slab, gfp_t gfpflags)
+ struct partial_context *pc)
{
struct slab *slab, *slab2;
void *object = NULL;
@@ -2179,15 +2211,23 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
void *t;
- if (!pfmemalloc_match(slab, gfpflags))
+ if (!pfmemalloc_match(slab, pc->flags))
continue;
+ if (kmem_cache_debug(s)) {
+ object = alloc_single_from_partial(s, n, slab,
+ pc->orig_size);
+ if (object)
+ break;
+ continue;
+ }
+
t = acquire_slab(s, n, slab, object == NULL);
if (!t)
break;
if (!object) {
- *ret_slab = slab;
+ *pc->slab = slab;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
@@ -2211,14 +2251,13 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Get a slab from somewhere. Search in increasing NUMA distances.
*/
-static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
- struct slab **ret_slab)
+static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type highest_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(pc->flags);
void *object;
unsigned int cpuset_mems_cookie;
@@ -2246,15 +2285,15 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(mempolicy_slab_node(), flags);
+ zonelist = node_zonelist(mempolicy_slab_node(), pc->flags);
for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
struct kmem_cache_node *n;
n = get_node(s, zone_to_nid(zone));
- if (n && cpuset_zone_allowed(zone, flags) &&
+ if (n && cpuset_zone_allowed(zone, pc->flags) &&
n->nr_partial > s->min_partial) {
- object = get_partial_node(s, n, ret_slab, flags);
+ object = get_partial_node(s, n, pc);
if (object) {
/*
* Don't check read_mems_allowed_retry()
@@ -2275,8 +2314,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
/*
* Get a partial slab, lock it and return it.
*/
-static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
- struct slab **ret_slab)
+static void *get_partial(struct kmem_cache *s, int node, struct partial_context *pc)
{
void *object;
int searchnode = node;
@@ -2284,11 +2322,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
+ object = get_partial_node(s, get_node(s, searchnode), pc);
if (object || node != NUMA_NO_NODE)
return object;
- return get_any_partial(s, flags, ret_slab);
+ return get_any_partial(s, pc);
}
#ifdef CONFIG_PREEMPTION
@@ -2730,7 +2768,7 @@ static void flush_all_cpus_locked(struct kmem_cache *s)
INIT_WORK(&sfw->work, flush_cpu_slab);
sfw->skip = false;
sfw->s = s;
- schedule_work_on(cpu, &sfw->work);
+ queue_work_on(cpu, flushwq, &sfw->work);
}
for_each_online_cpu(cpu) {
@@ -2788,6 +2826,113 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
{
return atomic_long_read(&n->total_objects);
}
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline void free_debug_processing(
+ struct kmem_cache *s, struct slab *slab,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
+ struct slab *slab_free = NULL;
+ void *object = head;
+ int cnt = 0;
+ unsigned long flags;
+ bool checks_ok = false;
+ depot_stack_handle_t handle = 0;
+
+ if (s->flags & SLAB_STORE_USER)
+ handle = set_track_prepare();
+
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, slab))
+ goto out;
+ }
+
+ if (slab->inuse < bulk_cnt) {
+ slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n",
+ slab->inuse, bulk_cnt);
+ goto out;
+ }
+
+next_object:
+
+ if (++cnt > bulk_cnt)
+ goto out_cnt;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, slab, object, addr))
+ goto out;
+ }
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track_update(s, object, TRACK_FREE, addr, handle);
+ trace(s, slab, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
+ init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
+ checks_ok = true;
+
+out_cnt:
+ if (cnt != bulk_cnt)
+ slab_err(s, slab, "Bulk free expected %d objects but found %d\n",
+ bulk_cnt, cnt);
+
+out:
+ if (checks_ok) {
+ void *prior = slab->freelist;
+
+ /* Perform the actual freeing while we still hold the locks */
+ slab->inuse -= cnt;
+ set_freepointer(s, tail, prior);
+ slab->freelist = head;
+
+ /*
+ * If the slab is empty, and node's partial list is full,
+ * it should be discarded anyway no matter it's on full or
+ * partial list.
+ */
+ if (slab->inuse == 0 && n->nr_partial >= s->min_partial)
+ slab_free = slab;
+
+ if (!prior) {
+ /* was on full list */
+ remove_full(s, n, slab);
+ if (!slab_free) {
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
+ }
+ } else if (slab_free) {
+ remove_partial(n, slab);
+ stat(s, FREE_REMOVE_PARTIAL);
+ }
+ }
+
+ if (slab_free) {
+ /*
+ * Update the counters while still holding n->list_lock to
+ * prevent spurious validation warnings
+ */
+ dec_slabs_node(s, slab_nid(slab_free), slab_free->objects);
+ }
+
+ spin_unlock_irqrestore(&n->list_lock, flags);
+
+ if (!checks_ok)
+ slab_fix(s, "Object at 0x%p not freed", object);
+
+ if (slab_free) {
+ stat(s, FREE_SLAB);
+ free_slab(s, slab_free);
+ }
+}
#endif /* CONFIG_SLUB_DEBUG */
#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
@@ -2905,11 +3050,12 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
* already disabled (which is the case for bulk allocation).
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *freelist;
struct slab *slab;
unsigned long flags;
+ struct partial_context pc;
stat(s, ALLOC_SLOWPATH);
@@ -3023,7 +3169,10 @@ new_slab:
new_objects:
- freelist = get_partial(s, gfpflags, node, &slab);
+ pc.flags = gfpflags;
+ pc.slab = &slab;
+ pc.orig_size = orig_size;
+ freelist = get_partial(s, node, &pc);
if (freelist)
goto check_new_slab;
@@ -3036,36 +3185,53 @@ new_objects:
return NULL;
}
+ stat(s, ALLOC_SLAB);
+
+ if (kmem_cache_debug(s)) {
+ freelist = alloc_single_from_new_slab(s, slab, orig_size);
+
+ if (unlikely(!freelist))
+ goto new_objects;
+
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+
+ return freelist;
+ }
+
/*
* No other reference to the slab yet so we can
* muck around with it freely without cmpxchg
*/
freelist = slab->freelist;
slab->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab->frozen = 1;
- stat(s, ALLOC_SLAB);
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
check_new_slab:
if (kmem_cache_debug(s)) {
- if (!alloc_debug_processing(s, slab, freelist, addr)) {
- /* Slab failed checks. Next slab needed */
- goto new_slab;
- } else {
- /*
- * For debug case, we don't load freelist so that all
- * allocations go through alloc_debug_processing()
- */
- goto return_single;
- }
+ /*
+ * For debug caches here we had to go through
+ * alloc_single_from_partial() so just store the tracking info
+ * and return the object
+ */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, freelist, TRACK_ALLOC, addr);
+
+ return freelist;
}
- if (unlikely(!pfmemalloc_match(slab, gfpflags)))
+ if (unlikely(!pfmemalloc_match(slab, gfpflags))) {
/*
* For !pfmemalloc_match() case we don't load freelist so that
* we don't make further mismatched allocations easier.
*/
- goto return_single;
+ deactivate_slab(s, slab, get_freepointer(s, freelist));
+ return freelist;
+ }
retry_load_slab:
@@ -3089,11 +3255,6 @@ retry_load_slab:
c->slab = slab;
goto load_freelist;
-
-return_single:
-
- deactivate_slab(s, slab, get_freepointer(s, freelist));
- return freelist;
}
/*
@@ -3102,7 +3263,7 @@ return_single:
* pointer.
*/
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c)
+ unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
{
void *p;
@@ -3115,7 +3276,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = slub_get_cpu_ptr(s->cpu_slab);
#endif
- p = ___slab_alloc(s, gfpflags, node, addr, c);
+ p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
#ifdef CONFIG_PREEMPT_COUNT
slub_put_cpu_ptr(s->cpu_slab);
#endif
@@ -3197,16 +3358,10 @@ redo:
object = c->freelist;
slab = c->slab;
- /*
- * We cannot use the lockless fastpath on PREEMPT_RT because if a
- * slowpath has taken the local_lock_irqsave(), it is not protected
- * against a fast path operation in an irq handler. So we need to take
- * the slow path which uses local_lock. It is still relatively fast if
- * there is a suitable cpu freelist.
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+
+ if (!USE_LOCKLESS_FAST_PATH() ||
unlikely(!object || !slab || !node_match(slab, node))) {
- object = __slab_alloc(s, gfpflags, node, addr, c);
+ object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3257,8 +3412,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
{
void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size,
- s->size, gfpflags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE);
return ret;
}
@@ -3276,46 +3430,24 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
}
EXPORT_SYMBOL(kmem_cache_alloc_lru);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags,
+ int node, size_t orig_size,
+ unsigned long caller)
{
- void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
- trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags);
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
+ return slab_alloc_node(s, NULL, gfpflags, node,
+ caller, orig_size);
}
-EXPORT_SYMBOL(kmem_cache_alloc_trace);
-#endif
-#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc_node(_RET_IP_, ret, s,
- s->object_size, s->size, gfpflags, node);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
- gfp_t gfpflags,
- int node, size_t size)
-{
- void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
-
- trace_kmalloc_node(_RET_IP_, ret, s,
- size, s->size, gfpflags, node);
-
- ret = kasan_kmalloc(s, ret, size, gfpflags);
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
-#endif
-#endif /* CONFIG_NUMA */
-
/*
* Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
@@ -3341,9 +3473,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
if (kfence_free(head))
return;
- if (kmem_cache_debug(s) &&
- !free_debug_processing(s, slab, head, tail, cnt, addr))
+ if (kmem_cache_debug(s)) {
+ free_debug_processing(s, slab, head, tail, cnt, addr);
return;
+ }
do {
if (unlikely(n)) {
@@ -3463,6 +3596,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
+ void **freelist;
redo:
/*
@@ -3477,9 +3611,13 @@ redo:
/* Same with comment on barrier() in slab_alloc_node() */
barrier();
- if (likely(slab == c->slab)) {
-#ifndef CONFIG_PREEMPT_RT
- void **freelist = READ_ONCE(c->freelist);
+ if (unlikely(slab != c->slab)) {
+ __slab_free(s, slab, head, tail_obj, cnt, addr);
+ return;
+ }
+
+ if (USE_LOCKLESS_FAST_PATH()) {
+ freelist = READ_ONCE(c->freelist);
set_freepointer(s, tail_obj, freelist);
@@ -3491,16 +3629,8 @@ redo:
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
-#else /* CONFIG_PREEMPT_RT */
- /*
- * We cannot use the lockless fastpath on PREEMPT_RT because if
- * a slowpath has taken the local_lock_irqsave(), it is not
- * protected against a fast path operation in an irq handler. So
- * we need to take the local_lock. We shouldn't simply defer to
- * __slab_free() as that wouldn't use the cpu freelist at all.
- */
- void **freelist;
-
+ } else {
+ /* Update the free list under the local lock */
local_lock(&s->cpu_slab->lock);
c = this_cpu_ptr(s->cpu_slab);
if (unlikely(slab != c->slab)) {
@@ -3515,11 +3645,8 @@ redo:
c->tid = next_tid(tid);
local_unlock(&s->cpu_slab->lock);
-#endif
- stat(s, FREE_FASTPATH);
- } else
- __slab_free(s, slab, head, tail_obj, cnt, addr);
-
+ }
+ stat(s, FREE_FASTPATH);
}
static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
@@ -3542,12 +3669,17 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
}
#endif
+void __kmem_cache_free(struct kmem_cache *s, void *x, unsigned long caller)
+{
+ slab_free(s, virt_to_slab(x), x, NULL, &x, 1, caller);
+}
+
void kmem_cache_free(struct kmem_cache *s, void *x)
{
s = cache_from_obj(s, x);
if (!s)
return;
- trace_kmem_cache_free(_RET_IP_, x, s->name);
+ trace_kmem_cache_free(_RET_IP_, x, s);
slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3560,19 +3692,6 @@ struct detached_freelist {
struct kmem_cache *s;
};
-static inline void free_large_kmalloc(struct folio *folio, void *object)
-{
- unsigned int order = folio_order(folio);
-
- if (WARN_ON_ONCE(order == 0))
- pr_warn_once("object pointer: 0x%p\n", object);
-
- kfree_hook(object);
- mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
- __free_pages(folio_page(folio, 0), order);
-}
-
/*
* This function progressively scans the array with free objects (with
* a limited look ahead) and extract objects belonging to the same
@@ -3709,7 +3828,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
* of re-populating per CPU c->freelist
*/
p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c);
+ _RET_IP_, c, s->object_size);
if (unlikely(!p[i]))
goto error;
@@ -3936,6 +4055,7 @@ static void early_kmem_cache_node_alloc(int node)
slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
BUG_ON(!slab);
+ inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects);
if (slab_nid(slab) != node) {
pr_err("SLUB: Unable to allocate memory from node %d\n", node);
pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
@@ -3950,7 +4070,6 @@ static void early_kmem_cache_node_alloc(int node)
n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
slab->freelist = get_freepointer(kmem_cache_node, n);
slab->inuse = 1;
- slab->frozen = 0;
kmem_cache_node->node[node] = n;
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, slab->objects);
@@ -4112,12 +4231,17 @@ static int calculate_sizes(struct kmem_cache *s)
}
#ifdef CONFIG_SLUB_DEBUG
- if (flags & SLAB_STORE_USER)
+ if (flags & SLAB_STORE_USER) {
/*
* Need to store information about allocs and frees after
* the object.
*/
size += 2 * sizeof(struct track);
+
+ /* Save the original kmalloc request size */
+ if (flags & SLAB_KMALLOC)
+ size += sizeof(unsigned int);
+ }
#endif
kasan_cache_create(s, &size, &s->flags);
@@ -4237,23 +4361,21 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = slab_address(slab);
- unsigned long flags;
- unsigned long *map;
void *p;
slab_err(s, slab, text, s->name);
- slab_lock(slab, &flags);
- map = get_map(s, slab);
+ spin_lock(&object_map_lock);
+ __fill_map(object_map, s, slab);
+
for_each_object(p, s, addr, slab->objects) {
- if (!test_bit(__obj_to_index(s, addr, p), map)) {
+ if (!test_bit(__obj_to_index(s, addr, p), object_map)) {
pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
- put_map(map);
- slab_unlock(slab, &flags);
+ spin_unlock(&object_map_lock);
#endif
}
@@ -4404,78 +4526,6 @@ static int __init setup_slub_min_objects(char *str)
__setup("slub_min_objects=", setup_slub_min_objects);
-void *__kmalloc(size_t size, gfp_t flags)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, flags);
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
-
- trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc);
-
-#ifdef CONFIG_NUMA
-static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
-{
- struct page *page;
- void *ptr = NULL;
- unsigned int order = get_order(size);
-
- flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, order);
- if (page) {
- ptr = page_address(page);
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
- }
-
- return kmalloc_large_node_hook(ptr, size, flags);
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, flags, node);
-
- trace_kmalloc_node(_RET_IP_, ret, NULL,
- size, PAGE_SIZE << get_order(size),
- flags, node);
-
- return ret;
- }
-
- s = kmalloc_slab(size, flags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
-
- trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node);
-
- ret = kasan_kmalloc(s, ret, size, flags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_NUMA */
-
#ifdef CONFIG_HARDENED_USERCOPY
/*
* Rejects incorrectly sized objects and objects that are to be copied
@@ -4526,43 +4576,6 @@ void __check_heap_object(const void *ptr, unsigned long n,
}
#endif /* CONFIG_HARDENED_USERCOPY */
-size_t __ksize(const void *object)
-{
- struct folio *folio;
-
- if (unlikely(object == ZERO_SIZE_PTR))
- return 0;
-
- folio = virt_to_folio(object);
-
- if (unlikely(!folio_test_slab(folio)))
- return folio_size(folio);
-
- return slab_ksize(folio_slab(folio)->slab_cache);
-}
-EXPORT_SYMBOL(__ksize);
-
-void kfree(const void *x)
-{
- struct folio *folio;
- struct slab *slab;
- void *object = (void *)x;
-
- trace_kfree(_RET_IP_, x);
-
- if (unlikely(ZERO_OR_NULL_PTR(x)))
- return;
-
- folio = virt_to_folio(x);
- if (unlikely(!folio_test_slab(folio))) {
- free_large_kmalloc(folio, object);
- return;
- }
- slab = folio_slab(folio);
- slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_);
-}
-EXPORT_SYMBOL(kfree);
-
#define SHRINK_PROMOTE_MAX 32
/*
@@ -4611,6 +4624,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
if (free == slab->objects) {
list_move(&slab->slab_list, &discard);
n->nr_partial--;
+ dec_slabs_node(s, node, slab->objects);
} else if (free <= SHRINK_PROMOTE_MAX)
list_move(&slab->slab_list, promote + free - 1);
}
@@ -4626,7 +4640,7 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
/* Release empty slabs */
list_for_each_entry_safe(slab, t, &discard, slab_list)
- discard_slab(s, slab);
+ free_slab(s, slab);
if (slabs_node(s, node))
ret = 1;
@@ -4858,6 +4872,8 @@ void __init kmem_cache_init(void)
void __init kmem_cache_init_late(void)
{
+ flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0);
+ WARN_ON(!flushwq);
}
struct kmem_cache *
@@ -4908,60 +4924,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
return 0;
}
-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
- return kmalloc_large(size, gfpflags);
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc(s, NULL, gfpflags, caller, size);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc(caller, ret, s, size, s->size, gfpflags);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_track_caller);
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, unsigned long caller)
-{
- struct kmem_cache *s;
- void *ret;
-
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- ret = kmalloc_large_node(size, gfpflags, node);
-
- trace_kmalloc_node(caller, ret, NULL,
- size, PAGE_SIZE << get_order(size),
- gfpflags, node);
-
- return ret;
- }
-
- s = kmalloc_slab(size, gfpflags);
-
- if (unlikely(ZERO_OR_NULL_PTR(s)))
- return s;
-
- ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
-
- /* Honor the call site pointer we received. */
- trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node);
-
- return ret;
-}
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#endif
-
#ifdef CONFIG_SYSFS
static int count_inuse(struct slab *slab)
{
@@ -4980,12 +4942,9 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab,
{
void *p;
void *addr = slab_address(slab);
- unsigned long flags;
-
- slab_lock(slab, &flags);
if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
- goto unlock;
+ return;
/* Now we know that a valid freelist exists */
__fill_map(obj_map, s, slab);
@@ -4996,8 +4955,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab,
if (!check_object(s, slab, p, val))
break;
}
-unlock:
- slab_unlock(slab, &flags);
}
static int validate_slab_node(struct kmem_cache *s,
@@ -5068,6 +5025,7 @@ struct location {
depot_stack_handle_t handle;
unsigned long count;
unsigned long addr;
+ unsigned long waste;
long long sum_time;
long min_time;
long max_time;
@@ -5114,13 +5072,15 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
}
static int add_location(struct loc_track *t, struct kmem_cache *s,
- const struct track *track)
+ const struct track *track,
+ unsigned int orig_size)
{
long start, end, pos;
struct location *l;
- unsigned long caddr, chandle;
+ unsigned long caddr, chandle, cwaste;
unsigned long age = jiffies - track->when;
depot_stack_handle_t handle = 0;
+ unsigned int waste = s->object_size - orig_size;
#ifdef CONFIG_STACKDEPOT
handle = READ_ONCE(track->handle);
@@ -5138,11 +5098,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
if (pos == end)
break;
- caddr = t->loc[pos].addr;
- chandle = t->loc[pos].handle;
- if ((track->addr == caddr) && (handle == chandle)) {
+ l = &t->loc[pos];
+ caddr = l->addr;
+ chandle = l->handle;
+ cwaste = l->waste;
+ if ((track->addr == caddr) && (handle == chandle) &&
+ (waste == cwaste)) {
- l = &t->loc[pos];
l->count++;
if (track->when) {
l->sum_time += age;
@@ -5167,6 +5129,9 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
end = pos;
else if (track->addr == caddr && handle < chandle)
end = pos;
+ else if (track->addr == caddr && handle == chandle &&
+ waste < cwaste)
+ end = pos;
else
start = pos;
}
@@ -5190,6 +5155,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
l->min_pid = track->pid;
l->max_pid = track->pid;
l->handle = handle;
+ l->waste = waste;
cpumask_clear(to_cpumask(l->cpus));
cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
nodes_clear(l->nodes);
@@ -5202,13 +5168,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
unsigned long *obj_map)
{
void *addr = slab_address(slab);
+ bool is_alloc = (alloc == TRACK_ALLOC);
void *p;
__fill_map(obj_map, s, slab);
for_each_object(p, s, addr, slab->objects)
if (!test_bit(__obj_to_index(s, addr, p), obj_map))
- add_location(t, s, get_track(s, p, alloc));
+ add_location(t, s, get_track(s, p, alloc),
+ is_alloc ? get_orig_size(s, p) :
+ s->object_size);
}
#endif /* CONFIG_DEBUG_FS */
#endif /* CONFIG_SLUB_DEBUG */
@@ -5601,7 +5570,7 @@ static ssize_t validate_store(struct kmem_cache *s,
{
int ret = -EINVAL;
- if (buf[0] == '1') {
+ if (buf[0] == '1' && kmem_cache_debug(s)) {
ret = validate_slab_cache(s);
if (ret >= 0)
ret = length;
@@ -5745,6 +5714,29 @@ STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif /* CONFIG_SLUB_STATS */
+#ifdef CONFIG_KFENCE
+static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE));
+}
+
+static ssize_t skip_kfence_store(struct kmem_cache *s,
+ const char *buf, size_t length)
+{
+ int ret = length;
+
+ if (buf[0] == '0')
+ s->flags &= ~SLAB_SKIP_KFENCE;
+ else if (buf[0] == '1')
+ s->flags |= SLAB_SKIP_KFENCE;
+ else
+ ret = -EINVAL;
+
+ return ret;
+}
+SLAB_ATTR(skip_kfence);
+#endif
+
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
&object_size_attr.attr,
@@ -5812,6 +5804,9 @@ static struct attribute *slab_attrs[] = {
&failslab_attr.attr,
#endif
&usersize_attr.attr,
+#ifdef CONFIG_KFENCE
+ &skip_kfence_attr.attr,
+#endif
NULL
};
@@ -5826,7 +5821,6 @@ static ssize_t slab_attr_show(struct kobject *kobj,
{
struct slab_attribute *attribute;
struct kmem_cache *s;
- int err;
attribute = to_slab_attr(attr);
s = to_slab(kobj);
@@ -5834,9 +5828,7 @@ static ssize_t slab_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- err = attribute->show(s, buf);
-
- return err;
+ return attribute->show(s, buf);
}
static ssize_t slab_attr_store(struct kobject *kobj,
@@ -5845,7 +5837,6 @@ static ssize_t slab_attr_store(struct kobject *kobj,
{
struct slab_attribute *attribute;
struct kmem_cache *s;
- int err;
attribute = to_slab_attr(attr);
s = to_slab(kobj);
@@ -5853,8 +5844,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- err = attribute->store(s, buf, len);
- return err;
+ return attribute->store(s, buf, len);
}
static void kmem_cache_release(struct kobject *k)
@@ -5879,7 +5869,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
return slab_kset;
}
-#define ID_STR_LENGTH 64
+#define ID_STR_LENGTH 32
/* Create a unique string id for a slab cache:
*
@@ -5890,7 +5880,8 @@ static char *create_unique_id(struct kmem_cache *s)
char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
char *p = name;
- BUG_ON(!name);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
*p++ = ':';
/*
@@ -5912,9 +5903,13 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'A';
if (p != name + 1)
*p++ = '-';
- p += sprintf(p, "%07u", s->size);
+ p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size);
- BUG_ON(p > name + ID_STR_LENGTH - 1);
+ if (WARN_ON(p > name + ID_STR_LENGTH - 1)) {
+ kfree(name);
+ return ERR_PTR(-EINVAL);
+ }
+ kmsan_unpoison_memory(name, p - name);
return name;
}
@@ -5948,6 +5943,8 @@ static int sysfs_slab_add(struct kmem_cache *s)
* for the symlinks.
*/
name = create_unique_id(s);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
}
s->kobj.kset = kset;
@@ -6016,6 +6013,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
al->name = name;
al->next = alias_list;
alias_list = al;
+ kmsan_unpoison_memory(al, sizeof(*al));
return 0;
}
@@ -6078,6 +6076,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
else
seq_puts(seq, "<not-available>");
+ if (l->waste)
+ seq_printf(seq, " waste=%lu/%lu",
+ l->count * l->waste, l->waste);
+
if (l->sum_time != l->min_time) {
seq_printf(seq, " age=%ld/%llu/%ld",
l->min_time, div_u64(l->sum_time, l->count),
diff --git a/mm/swap.c b/mm/swap.c
index 9cee7f6a3809..955930f41d20 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -366,7 +366,7 @@ static void folio_activate_drain(int cpu)
folio_batch_move_lru(fbatch, folio_activate_fn);
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
@@ -385,7 +385,7 @@ static inline void folio_activate_drain(int cpu)
{
}
-static void folio_activate(struct folio *folio)
+void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
@@ -428,6 +428,40 @@ static void __lru_cache_activate_folio(struct folio *folio)
local_unlock(&cpu_fbatches.lock);
}
+#ifdef CONFIG_LRU_GEN
+static void folio_inc_refs(struct folio *folio)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ if (folio_test_unevictable(folio))
+ return;
+
+ if (!folio_test_referenced(folio)) {
+ folio_set_referenced(folio);
+ return;
+ }
+
+ if (!folio_test_workingset(folio)) {
+ folio_set_workingset(folio);
+ return;
+ }
+
+ /* see the comment on MAX_NR_TIERS */
+ do {
+ new_flags = old_flags & LRU_REFS_MASK;
+ if (new_flags == LRU_REFS_MASK)
+ break;
+
+ new_flags += BIT(LRU_REFS_PGOFF);
+ new_flags |= old_flags & ~LRU_REFS_MASK;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+}
+#else
+static void folio_inc_refs(struct folio *folio)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
/*
* Mark a page as having seen activity.
*
@@ -440,6 +474,11 @@ static void __lru_cache_activate_folio(struct folio *folio)
*/
void folio_mark_accessed(struct folio *folio)
{
+ if (lru_gen_enabled()) {
+ folio_inc_refs(folio);
+ return;
+ }
+
if (!folio_test_referenced(folio)) {
folio_set_referenced(folio);
} else if (folio_test_unevictable(folio)) {
@@ -484,6 +523,11 @@ void folio_add_lru(struct folio *folio)
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ /* see the comment in lru_gen_add_folio() */
+ if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+ folio_set_active(folio);
+
folio_get(folio);
local_lock(&cpu_fbatches.lock);
fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
@@ -493,22 +537,21 @@ void folio_add_lru(struct folio *folio)
EXPORT_SYMBOL(folio_add_lru);
/**
- * lru_cache_add_inactive_or_unevictable
- * @page: the page to be added to LRU
- * @vma: vma in which page is mapped for determining reclaimability
+ * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA.
+ * @folio: The folio to be added to the LRU.
+ * @vma: VMA in which the folio is mapped.
*
- * Place @page on the inactive or unevictable LRU list, depending on its
- * evictability.
+ * If the VMA is mlocked, @folio is added to the unevictable list.
+ * Otherwise, it is treated the same way as folio_add_lru().
*/
-void lru_cache_add_inactive_or_unevictable(struct page *page,
- struct vm_area_struct *vma)
+void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
{
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
- mlock_new_page(page);
+ mlock_new_page(&folio->page);
else
- lru_cache_add(page);
+ folio_add_lru(folio);
}
/*
@@ -575,7 +618,7 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
long nr_pages = folio_nr_pages(folio);
lruvec_del_folio(lruvec, folio);
@@ -688,8 +731,8 @@ void deactivate_page(struct page *page)
{
struct folio *folio = page_folio(page);
- if (folio_test_lru(folio) && folio_test_active(folio) &&
- !folio_test_unevictable(folio)) {
+ if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
+ (folio_test_active(folio) || lru_gen_enabled())) {
struct folio_batch *fbatch;
folio_get(folio);
diff --git a/mm/swap.h b/mm/swap.h
index 17936e068c1c..cc08c459c619 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -18,9 +18,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
}
void swap_write_unplug(struct swap_iocb *sio);
int swap_writepage(struct page *page, struct writeback_control *wbc);
-void end_swap_bio_write(struct bio *bio);
-int __swap_writepage(struct page *page, struct writeback_control *wbc,
- bio_end_io_t end_write_func);
+int __swap_writepage(struct page *page, struct writeback_control *wbc);
/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */
@@ -34,16 +32,15 @@ extern struct address_space *swapper_spaces[];
void show_swap_cache_info(void);
bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
-int add_to_swap_cache(struct page *page, swp_entry_t entry,
+int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp);
void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow);
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
-struct page *lookup_swap_cache(swp_entry_t entry,
- struct vm_area_struct *vma,
- unsigned long addr);
+struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr);
struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
@@ -101,9 +98,8 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
-static inline struct page *lookup_swap_cache(swp_entry_t swp,
- struct vm_area_struct *vma,
- unsigned long addr)
+static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr)
{
return NULL;
}
@@ -124,7 +120,7 @@ static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
return NULL;
}
-static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
+static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp_mask, void **shadowp)
{
return -1;
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 5a9442979a18..db6c4a26cf59 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -170,6 +170,9 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
unsigned long length;
struct swap_cgroup_ctrl *ctrl;
+ if (mem_cgroup_disabled())
+ return 0;
+
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array = vcalloc(length, sizeof(void *));
@@ -204,6 +207,9 @@ void swap_cgroup_swapoff(int type)
unsigned long i, length;
struct swap_cgroup_ctrl *ctrl;
+ if (mem_cgroup_disabled())
+ return;
+
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 10b94d64cc25..0bec1f705f8e 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -343,7 +343,7 @@ repeat:
get_swap_pages(1, &entry, 1);
out:
if (mem_cgroup_try_charge_swap(folio, entry)) {
- put_swap_page(&folio->page, entry);
+ put_swap_folio(folio, entry);
entry.val = 0;
}
return entry;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e166051566f4..438d0676c5be 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -85,21 +85,21 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
* add_to_swap_cache resembles filemap_add_folio on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int add_to_swap_cache(struct page *page, swp_entry_t entry,
+int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
- XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
- unsigned long i, nr = thp_nr_pages(page);
+ XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
+ unsigned long i, nr = folio_nr_pages(folio);
void *old;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageSwapCache(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
- page_ref_add(page, nr);
- SetPageSwapCache(page);
+ folio_ref_add(folio, nr);
+ folio_set_swapcache(folio);
do {
xas_lock_irq(&xas);
@@ -107,19 +107,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
if (xas_error(&xas))
goto unlock;
for (i = 0; i < nr; i++) {
- VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
old = xas_load(&xas);
if (xa_is_value(old)) {
if (shadowp)
*shadowp = old;
}
- set_page_private(page + i, entry.val + i);
- xas_store(&xas, page);
+ set_page_private(folio_page(folio, i), entry.val + i);
+ xas_store(&xas, folio);
xas_next(&xas);
}
address_space->nrpages += nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
- __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
+ __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -127,8 +127,8 @@ unlock:
if (!xas_error(&xas))
return 0;
- ClearPageSwapCache(page);
- page_ref_sub(page, nr);
+ folio_clear_swapcache(folio);
+ folio_ref_sub(folio, nr);
return xas_error(&xas);
}
@@ -151,7 +151,7 @@ void __delete_from_swap_cache(struct folio *folio,
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_FOLIO(entry != folio, folio);
+ VM_BUG_ON_PAGE(entry != folio, entry);
set_page_private(folio_page(folio, i), 0);
xas_next(&xas);
}
@@ -194,7 +194,7 @@ bool add_to_swap(struct folio *folio)
/*
* Add it to the swap cache.
*/
- err = add_to_swap_cache(&folio->page, entry,
+ err = add_to_swap_cache(folio, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
if (err)
/*
@@ -218,7 +218,7 @@ bool add_to_swap(struct folio *folio)
return true;
fail:
- put_swap_page(&folio->page, entry);
+ put_swap_folio(folio, entry);
return false;
}
@@ -237,7 +237,7 @@ void delete_from_swap_cache(struct folio *folio)
__delete_from_swap_cache(folio, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
- put_swap_page(&folio->page, entry);
+ put_swap_folio(folio, entry);
folio_ref_sub(folio, folio_nr_pages(folio));
}
@@ -272,16 +272,19 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin,
/*
* If we are the only user, then try to free up the swap cache.
*
- * Its ok to check for PageSwapCache without the page lock
+ * Its ok to check the swapcache flag without the folio lock
* here because we are going to recheck again inside
- * try_to_free_swap() _with_ the lock.
+ * folio_free_swap() _with_ the lock.
* - Marcelo
*/
void free_swap_cache(struct page *page)
{
- if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
- try_to_free_swap(page);
- unlock_page(page);
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_swapcache(folio) && !folio_mapped(folio) &&
+ folio_trylock(folio)) {
+ folio_free_swap(folio);
+ folio_unlock(folio);
}
}
@@ -317,24 +320,24 @@ static inline bool swap_use_vma_readahead(void)
}
/*
- * Lookup a swap entry in the swap cache. A found page will be returned
+ * Lookup a swap entry in the swap cache. A found folio will be returned
* unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the page
+ * lock getting page table operations atomic even if we drop the folio
* lock before returning.
*/
-struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
- unsigned long addr)
+struct folio *swap_cache_get_folio(swp_entry_t entry,
+ struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page;
+ struct folio *folio;
struct swap_info_struct *si;
si = get_swap_device(entry);
if (!si)
return NULL;
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
put_swap_device(si);
- if (page) {
+ if (folio) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
@@ -342,10 +345,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
* At the moment, we don't support PG_readahead for anon THP
* so let's bail out rather than confusing the readahead stat.
*/
- if (unlikely(PageTransCompound(page)))
- return page;
+ if (unlikely(folio_test_large(folio)))
+ return folio;
- readahead = TestClearPageReadahead(page);
+ readahead = folio_test_clear_readahead(folio);
if (vma && vma_ra) {
unsigned long ra_val;
int win, hits;
@@ -366,7 +369,7 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
}
}
- return page;
+ return folio;
}
/**
@@ -411,7 +414,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
bool *new_page_allocated)
{
struct swap_info_struct *si;
- struct page *page;
+ struct folio *folio;
void *shadow = NULL;
*new_page_allocated = false;
@@ -420,17 +423,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
int err;
/*
* First check the swap cache. Since this is normally
- * called after lookup_swap_cache() failed, re-calling
+ * called after swap_cache_get_folio() failed, re-calling
* that would confuse statistics.
*/
si = get_swap_device(entry);
if (!si)
return NULL;
- page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
+ folio = filemap_get_folio(swap_address_space(entry),
+ swp_offset(entry));
put_swap_device(si);
- if (page)
- return page;
+ if (folio)
+ return folio_file_page(folio, swp_offset(entry));
/*
* Just skip read ahead for unused swap slot.
@@ -448,8 +451,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
* cause any racers to loop around until we add it to cache.
*/
- page = alloc_page_vma(gfp_mask, vma, addr);
- if (!page)
+ folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
+ if (!folio)
return NULL;
/*
@@ -459,7 +462,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
if (!err)
break;
- put_page(page);
+ folio_put(folio);
if (err != -EEXIST)
return NULL;
@@ -477,30 +480,30 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* The swap entry is ours to swap in. Prepare the new page.
*/
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
- if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
+ if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
goto fail_unlock;
/* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
+ if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
mem_cgroup_swapin_uncharge_swap(entry);
if (shadow)
- workingset_refault(page_folio(page), shadow);
+ workingset_refault(folio, shadow);
- /* Caller will initiate read into locked page */
- lru_cache_add(page);
+ /* Caller will initiate read into locked folio */
+ folio_add_lru(folio);
*new_page_allocated = true;
- return page;
+ return &folio->page;
fail_unlock:
- put_swap_page(page, entry);
- unlock_page(page);
- put_page(page);
+ put_swap_folio(folio, entry);
+ folio_unlock(folio);
+ folio_put(folio);
return NULL;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1fdccd2f1422..5fc1237a9f21 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -63,6 +63,10 @@ EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;
+unsigned long swapfile_maximum_size;
+#ifdef CONFIG_MIGRATION
+bool swap_migration_ad_supported;
+#endif /* CONFIG_MIGRATION */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
@@ -128,27 +132,27 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
- struct page *page;
+ struct folio *folio;
int ret = 0;
- page = find_get_page(swap_address_space(entry), offset);
- if (!page)
+ folio = filemap_get_folio(swap_address_space(entry), offset);
+ if (!folio)
return 0;
/*
* When this function is called from scan_swap_map_slots() and it's
- * called by vmscan.c at reclaiming pages. So, we hold a lock on a page,
+ * called by vmscan.c at reclaiming folios. So we hold a folio lock
* here. We have to use trylock for avoiding deadlock. This is a special
- * case and you should use try_to_free_swap() with explicit lock_page()
+ * case and you should use folio_free_swap() with explicit folio_lock()
* in usual operations.
*/
- if (trylock_page(page)) {
+ if (folio_trylock(folio)) {
if ((flags & TTRS_ANYWAY) ||
- ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
- ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
- ret = try_to_free_swap(page);
- unlock_page(page);
+ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
+ ret = folio_free_swap(folio);
+ folio_unlock(folio);
}
- put_page(page);
+ folio_put(folio);
return ret;
}
@@ -1328,7 +1332,7 @@ void swap_free(swp_entry_t entry)
/*
* Called after dropping swapcache to decrease refcnt to swap entries.
*/
-void put_swap_page(struct page *page, swp_entry_t entry)
+void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
unsigned long offset = swp_offset(entry);
unsigned long idx = offset / SWAPFILE_CLUSTER;
@@ -1337,7 +1341,7 @@ void put_swap_page(struct page *page, swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
- int size = swap_entry_size(thp_nr_pages(page));
+ int size = swap_entry_size(folio_nr_pages(folio));
si = _swap_info_get(entry);
if (!si)
@@ -1427,30 +1431,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
spin_unlock(&p->lock);
}
-/*
- * How many references to page are currently swapped out?
- * This does not give an exact answer when swap count is continued,
- * but does include the high COUNT_CONTINUED flag to allow for that.
- */
-static int page_swapcount(struct page *page)
-{
- int count = 0;
- struct swap_info_struct *p;
- struct swap_cluster_info *ci;
- swp_entry_t entry;
- unsigned long offset;
-
- entry.val = page_private(page);
- p = _swap_info_get(entry);
- if (p) {
- offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(p, offset);
- count = swap_count(p->swap_map[offset]);
- unlock_cluster_or_swap_info(p, ci);
- }
- return count;
-}
-
int __swap_count(swp_entry_t entry)
{
struct swap_info_struct *si;
@@ -1465,11 +1445,16 @@ int __swap_count(swp_entry_t entry)
return count;
}
+/*
+ * How many references to @entry are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
+ */
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
- int count = 0;
pgoff_t offset = swp_offset(entry);
struct swap_cluster_info *ci;
+ int count;
ci = lock_cluster_or_swap_info(si, offset);
count = swap_count(si->swap_map[offset]);
@@ -1570,56 +1555,59 @@ unlock_out:
static bool folio_swapped(struct folio *folio)
{
- swp_entry_t entry;
- struct swap_info_struct *si;
+ swp_entry_t entry = folio_swap_entry(folio);
+ struct swap_info_struct *si = _swap_info_get(entry);
+
+ if (!si)
+ return false;
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
- return page_swapcount(&folio->page) != 0;
+ return swap_swapcount(si, entry) != 0;
- entry = folio_swap_entry(folio);
- si = _swap_info_get(entry);
- if (si)
- return swap_page_trans_huge_swapped(si, entry);
- return false;
+ return swap_page_trans_huge_swapped(si, entry);
}
-/*
- * If swap is getting full, or if there are no more mappings of this page,
- * then try_to_free_swap is called to free its swap space.
+/**
+ * folio_free_swap() - Free the swap space used for this folio.
+ * @folio: The folio to remove.
+ *
+ * If swap is getting full, or if there are no more mappings of this folio,
+ * then call folio_free_swap to free its swap space.
+ *
+ * Return: true if we were able to release the swap space.
*/
-int try_to_free_swap(struct page *page)
+bool folio_free_swap(struct folio *folio)
{
- struct folio *folio = page_folio(page);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (!folio_test_swapcache(folio))
- return 0;
+ return false;
if (folio_test_writeback(folio))
- return 0;
+ return false;
if (folio_swapped(folio))
- return 0;
+ return false;
/*
* Once hibernation has begun to create its image of memory,
- * there's a danger that one of the calls to try_to_free_swap()
+ * there's a danger that one of the calls to folio_free_swap()
* - most probably a call from __try_to_reclaim_swap() while
* hibernation is allocating its own swap pages for the image,
* but conceivably even a call from memory reclaim - will free
- * the swap from a page which has already been recorded in the
- * image as a clean swapcache page, and then reuse its swap for
+ * the swap from a folio which has already been recorded in the
+ * image as a clean swapcache folio, and then reuse its swap for
* another page of the image. On waking from hibernation, the
- * original page might be freed under memory pressure, then
+ * original folio might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
* Hibernation suspends storage while it is writing the image
* to disk so check that here.
*/
if (pm_suspended_storage())
- return 0;
+ return false;
delete_from_swap_cache(folio);
folio_set_dirty(folio);
- return 1;
+ return true;
}
/*
@@ -1770,8 +1758,9 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
* force COW, vm_page_prot omits write permission from any private vma.
*/
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, swp_entry_t entry, struct page *page)
+ unsigned long addr, swp_entry_t entry, struct folio *folio)
{
+ struct page *page = folio_file_page(folio, swp_offset(entry));
struct page *swapcache;
spinlock_t *ptl;
pte_t *pte, new_pte;
@@ -1843,17 +1832,18 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned int type)
{
- struct page *page;
swp_entry_t entry;
pte_t *pte;
struct swap_info_struct *si;
- unsigned long offset;
int ret = 0;
volatile unsigned char *swap_map;
si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
+ struct folio *folio;
+ unsigned long offset;
+
if (!is_swap_pte(*pte))
continue;
@@ -1864,8 +1854,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
offset = swp_offset(entry);
pte_unmap(pte);
swap_map = &si->swap_map[offset];
- page = lookup_swap_cache(entry, vma, addr);
- if (!page) {
+ folio = swap_cache_get_folio(entry, vma, addr);
+ if (!folio) {
+ struct page *page;
struct vm_fault vmf = {
.vma = vma,
.address = addr,
@@ -1875,25 +1866,27 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
&vmf);
+ if (page)
+ folio = page_folio(page);
}
- if (!page) {
+ if (!folio) {
if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
goto try_next;
return -ENOMEM;
}
- lock_page(page);
- wait_on_page_writeback(page);
- ret = unuse_pte(vma, pmd, addr, entry, page);
+ folio_lock(folio);
+ folio_wait_writeback(folio);
+ ret = unuse_pte(vma, pmd, addr, entry, folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
- try_to_free_swap(page);
- unlock_page(page);
- put_page(page);
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ folio_put(folio);
try_next:
pte = pte_offset_map(pmd, addr);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1990,14 +1983,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
struct vm_area_struct *vma;
int ret = 0;
+ VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
if (vma->anon_vma) {
ret = unuse_vma(vma, type);
if (ret)
break;
}
+
cond_resched();
}
mmap_read_unlock(mm);
@@ -2042,7 +2037,7 @@ static int try_to_unuse(unsigned int type)
struct list_head *p;
int retval = 0;
struct swap_info_struct *si = swap_info[type];
- struct page *page;
+ struct folio *folio;
swp_entry_t entry;
unsigned int i;
@@ -2092,21 +2087,21 @@ retry:
(i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
- page = find_get_page(swap_address_space(entry), i);
- if (!page)
+ folio = filemap_get_folio(swap_address_space(entry), i);
+ if (!folio)
continue;
/*
- * It is conceivable that a racing task removed this page from
- * swap cache just before we acquired the page lock. The page
+ * It is conceivable that a racing task removed this folio from
+ * swap cache just before we acquired the page lock. The folio
* might even be back in swap cache on another swap area. But
- * that is okay, try_to_free_swap() only removes stale pages.
+ * that is okay, folio_free_swap() only removes stale folios.
*/
- lock_page(page);
- wait_on_page_writeback(page);
- try_to_free_swap(page);
- unlock_page(page);
- put_page(page);
+ folio_lock(folio);
+ folio_wait_writeback(folio);
+ folio_free_swap(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
/*
@@ -2816,7 +2811,7 @@ unsigned long generic_max_swapfile_size(void)
}
/* Can be overridden by an architecture for additional checks. */
-__weak unsigned long max_swapfile_size(void)
+__weak unsigned long arch_max_swapfile_size(void)
{
return generic_max_swapfile_size();
}
@@ -2856,7 +2851,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
p->cluster_next = 1;
p->cluster_nr = 0;
- maxpages = max_swapfile_size();
+ maxpages = swapfile_maximum_size;
last_page = swap_header->info.last_page;
if (!last_page) {
pr_warn("Empty swap-file\n");
@@ -3655,7 +3650,7 @@ void __cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
avail_lists[nid]) {
if (si->bdev) {
- blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
+ blkcg_schedule_throttle(si->bdev->bd_disk, true);
break;
}
}
@@ -3677,6 +3672,13 @@ static int __init swapfile_init(void)
for_each_node(nid)
plist_head_init(&swap_avail_heads[nid]);
+ swapfile_maximum_size = arch_max_swapfile_size();
+
+#ifdef CONFIG_MIGRATION
+ if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
+ swap_migration_ad_supported = true;
+#endif /* CONFIG_MIGRATION */
+
return 0;
}
subsys_initcall(swapfile_init);
diff --git a/mm/truncate.c b/mm/truncate.c
index 0b0708bf935f..c0be77e5c008 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -240,7 +240,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
folio_invalidate(folio, offset, length);
if (!folio_test_large(folio))
return true;
- if (split_huge_page(&folio->page) == 0)
+ if (split_folio(folio) == 0)
return true;
if (folio_test_dirty(folio))
return false;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7327b2573f7c..650ab6cfd5f4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -64,7 +64,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pte_t _dst_pte, *dst_pte;
bool writable = dst_vma->vm_flags & VM_WRITE;
bool vm_shared = dst_vma->vm_flags & VM_SHARED;
- bool page_in_cache = page->mapping;
+ bool page_in_cache = page_mapping(page);
spinlock_t *ptl;
struct inode *inode;
pgoff_t offset, max_off;
@@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!page)
goto out;
- page_kaddr = kmap_atomic(page);
+ page_kaddr = kmap_local_page(page);
+ /*
+ * The read mmap_lock is held here. Despite the
+ * mmap_lock being read recursive a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
@@ -243,20 +260,22 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
{
struct inode *inode = file_inode(dst_vma->vm_file);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct folio *folio;
struct page *page;
int ret;
- ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC);
- /* Our caller expects us to return -EFAULT if we failed to find page. */
+ ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+ /* Our caller expects us to return -EFAULT if we failed to find folio */
if (ret == -ENOENT)
ret = -EFAULT;
if (ret)
goto out;
- if (!page) {
+ if (!folio) {
ret = -EFAULT;
goto out;
}
+ page = folio_file_page(folio, pgoff);
if (PageHWPoison(page)) {
ret = -EIO;
goto out_release;
@@ -267,13 +286,13 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
if (ret)
goto out_release;
- unlock_page(page);
+ folio_unlock(folio);
ret = 0;
out:
return ret;
out_release:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
@@ -377,30 +396,30 @@ retry:
BUG_ON(dst_addr >= dst_start + len);
/*
- * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
- * i_mmap_rwsem ensures the dst_pte remains valid even
+ * Serialize via vma_lock and hugetlb_fault_mutex.
+ * vma_lock ensures the dst_pte remains valid even
* in the case of shared pmds. fault mutex prevents
* races with other faulting threads.
*/
- mapping = dst_vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ hugetlb_vma_lock_read(dst_vma);
err = -ENOMEM;
dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
if (!dst_pte) {
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
if (mode != MCOPY_ATOMIC_CONTINUE &&
!huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
err = -EEXIST;
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -408,8 +427,8 @@ retry:
dst_addr, src_addr, mode, &page,
wp_copy);
+ hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
cond_resched();
@@ -644,11 +663,11 @@ retry:
mmap_read_unlock(dst_mm);
BUG_ON(!page);
- page_kaddr = kmap(page);
+ page_kaddr = kmap_local_page(page);
err = copy_from_user(page_kaddr,
(const void __user *) src_addr,
PAGE_SIZE);
- kunmap(page);
+ kunmap_local(page_kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
diff --git a/mm/util.c b/mm/util.c
index c9439c66d8cf..12984e76767e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -272,38 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len)
}
EXPORT_SYMBOL(memdup_user_nul);
-void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev)
-{
- struct vm_area_struct *next;
-
- vma->vm_prev = prev;
- if (prev) {
- next = prev->vm_next;
- prev->vm_next = vma;
- } else {
- next = mm->mmap;
- mm->mmap = vma;
- }
- vma->vm_next = next;
- if (next)
- next->vm_prev = vma;
-}
-
-void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- struct vm_area_struct *prev, *next;
-
- next = vma->vm_next;
- prev = vma->vm_prev;
- if (prev)
- prev->vm_next = next;
- else
- mm->mmap = next;
- if (next)
- next->vm_prev = prev;
-}
-
/* Check if the vma is being used as a stack by this task */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
@@ -619,6 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
+ /* non-sleeping allocations are not supported by vmalloc */
+ if (!gfpflags_allow_blocking(flags))
+ return NULL;
+
/* Don't even allow crazy sizes */
if (unlikely(size > INT_MAX)) {
WARN_ON_ONCE(!(flags & __GFP_NOWARN));
@@ -850,10 +822,10 @@ int folio_mapcount(struct folio *folio)
return atomic_read(&folio->_mapcount) + 1;
compound = folio_entire_mapcount(folio);
- nr = folio_nr_pages(folio);
if (folio_test_hugetlb(folio))
return compound;
ret = compound;
+ nr = folio_nr_pages(folio);
for (i = 0; i < nr; i++)
ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
/* File pages has compound_mapcount included in _mapcount */
@@ -1052,6 +1024,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
return 0;
error:
+ pr_warn_ratelimited("%s: pid: %d, comm: %s, no enough memory for the allocation\n",
+ __func__, current->pid, current->comm);
vm_unacct_memory(pages);
return -ENOMEM;
diff --git a/mm/vmacache.c b/mm/vmacache.c
deleted file mode 100644
index 01a6e6688ec1..000000000000
--- a/mm/vmacache.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2014 Davidlohr Bueso.
- */
-#include <linux/sched/signal.h>
-#include <linux/sched/task.h>
-#include <linux/mm.h>
-#include <linux/vmacache.h>
-
-/*
- * Hash based on the pmd of addr if configured with MMU, which provides a good
- * hit rate for workloads with spatial locality. Otherwise, use pages.
- */
-#ifdef CONFIG_MMU
-#define VMACACHE_SHIFT PMD_SHIFT
-#else
-#define VMACACHE_SHIFT PAGE_SHIFT
-#endif
-#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK)
-
-/*
- * This task may be accessing a foreign mm via (for example)
- * get_user_pages()->find_vma(). The vmacache is task-local and this
- * task's vmacache pertains to a different mm (ie, its own). There is
- * nothing we can do here.
- *
- * Also handle the case where a kernel thread has adopted this mm via
- * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
- */
-static inline bool vmacache_valid_mm(struct mm_struct *mm)
-{
- return current->mm == mm && !(current->flags & PF_KTHREAD);
-}
-
-void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
-{
- if (vmacache_valid_mm(newvma->vm_mm))
- current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma;
-}
-
-static bool vmacache_valid(struct mm_struct *mm)
-{
- struct task_struct *curr;
-
- if (!vmacache_valid_mm(mm))
- return false;
-
- curr = current;
- if (mm->vmacache_seqnum != curr->vmacache.seqnum) {
- /*
- * First attempt will always be invalid, initialize
- * the new cache for this task here.
- */
- curr->vmacache.seqnum = mm->vmacache_seqnum;
- vmacache_flush(curr);
- return false;
- }
- return true;
-}
-
-struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
-{
- int idx = VMACACHE_HASH(addr);
- int i;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- if (!vmacache_valid(mm))
- return NULL;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[idx];
-
- if (vma) {
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- if (WARN_ON_ONCE(vma->vm_mm != mm))
- break;
-#endif
- if (vma->vm_start <= addr && vma->vm_end > addr) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- }
- if (++idx == VMACACHE_SIZE)
- idx = 0;
- }
-
- return NULL;
-}
-
-#ifndef CONFIG_MMU
-struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
- int idx = VMACACHE_HASH(start);
- int i;
-
- count_vm_vmacache_event(VMACACHE_FIND_CALLS);
-
- if (!vmacache_valid(mm))
- return NULL;
-
- for (i = 0; i < VMACACHE_SIZE; i++) {
- struct vm_area_struct *vma = current->vmacache.vmas[idx];
-
- if (vma && vma->vm_start == start && vma->vm_end == end) {
- count_vm_vmacache_event(VMACACHE_FIND_HITS);
- return vma;
- }
- if (++idx == VMACACHE_SIZE)
- idx = 0;
- }
-
- return NULL;
-}
-#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index dd6cdb201195..ccaa461998f3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -320,6 +320,9 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
ioremap_max_page_shift);
flush_cache_vmap(addr, end);
+ if (!err)
+ kmsan_ioremap_page_range(addr, end, phys_addr, prot,
+ ioremap_max_page_shift);
return err;
}
@@ -416,7 +419,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
*
* This is an internal function only. Do not use outside mm/.
*/
-void vunmap_range_noflush(unsigned long start, unsigned long end)
+void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
unsigned long next;
pgd_t *pgd;
@@ -438,6 +441,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end)
arch_sync_kernel_mappings(start, end);
}
+void vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+ kmsan_vunmap_range_noflush(start, end);
+ __vunmap_range_noflush(start, end);
+}
+
/**
* vunmap_range - unmap kernel virtual addresses
* @addr: start of the VM area to unmap
@@ -575,7 +584,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
*
* This is an internal function only. Do not use outside mm/.
*/
-int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
pgprot_t prot, struct page **pages, unsigned int page_shift)
{
unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
@@ -590,7 +599,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
int err;
err = vmap_range_noflush(addr, addr + (1UL << page_shift),
- __pa(page_address(pages[i])), prot,
+ page_to_phys(pages[i]), prot,
page_shift);
if (err)
return err;
@@ -601,6 +610,13 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
return 0;
}
+int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
+ pgprot_t prot, struct page **pages, unsigned int page_shift)
+{
+ kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+ return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+}
+
/**
* vmap_pages_range - map pages to a kernel virtual address
* @addr: start of the VM area to map
@@ -1300,12 +1316,12 @@ find_vmap_lowest_match(struct rb_root *root, unsigned long size,
#include <linux/random.h>
static struct vmap_area *
-find_vmap_lowest_linear_match(unsigned long size,
+find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
unsigned long align, unsigned long vstart)
{
struct vmap_area *va;
- list_for_each_entry(va, &free_vmap_area_list, list) {
+ list_for_each_entry(va, head, list) {
if (!is_within_this_va(va, size, align, vstart))
continue;
@@ -1316,7 +1332,8 @@ find_vmap_lowest_linear_match(unsigned long size,
}
static void
-find_vmap_lowest_match_check(unsigned long size, unsigned long align)
+find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
@@ -1325,8 +1342,8 @@ find_vmap_lowest_match_check(unsigned long size, unsigned long align)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
- va_1 = find_vmap_lowest_match(size, align, vstart, false);
- va_2 = find_vmap_lowest_linear_match(size, align, vstart);
+ va_1 = find_vmap_lowest_match(root, size, align, vstart, false);
+ va_2 = find_vmap_lowest_linear_match(head, size, align, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1513,7 +1530,7 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(size, align);
+ find_vmap_lowest_match_check(root, head, size, align);
#endif
return nva_start_addr;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2b1431352dc..04d8b88e5216 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -43,12 +43,17 @@
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/memory-tiers.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
+#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -85,7 +90,7 @@ struct scan_control {
unsigned long anon_cost;
unsigned long file_cost;
- /* Can active pages be deactivated as part of reclaim? */
+ /* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
unsigned int may_deactivate:2;
@@ -95,10 +100,10 @@ struct scan_control {
/* Writepage batching in laptop mode; RECLAIM_WRITE */
unsigned int may_writepage:1;
- /* Can mapped pages be reclaimed? */
+ /* Can mapped folios be reclaimed? */
unsigned int may_unmap:1;
- /* Can pages be swapped as part of reclaim? */
+ /* Can folios be swapped as part of reclaim? */
unsigned int may_swap:1;
/* Proactive reclaim invoked by userspace through memory.reclaim */
@@ -123,19 +128,25 @@ struct scan_control {
/* There is easily reclaimable cold cache in the current node */
unsigned int cache_trim_mode:1;
- /* The file pages on the current node are dangerously low */
+ /* The file folios on the current node are dangerously low */
unsigned int file_is_tiny:1;
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;
+#ifdef CONFIG_LRU_GEN
+ /* help kswapd make better choices among multiple memcgs */
+ unsigned int memcgs_need_aging:1;
+ unsigned long last_reclaimed;
+#endif
+
/* Allocation order */
s8 order;
/* Scan (total_size >> priority) pages at once */
s8 priority;
- /* The highest zone to isolate pages for reclaim from */
+ /* The highest zone to isolate folios for reclaim from */
s8 reclaim_idx;
/* This context's GFP mask */
@@ -443,7 +454,7 @@ static bool cgroup_reclaim(struct scan_control *sc)
*
* The normal page dirty throttling mechanism in balance_dirty_pages() is
* completely broken with the legacy memcg and direct stalling in
- * shrink_page_list() is used for throttling instead, which lacks all the
+ * shrink_folio_list() is used for throttling instead, which lacks all the
* niceties such as fairness, adaptive pausing, bandwidth proportional
* allocation and configurability.
*
@@ -564,9 +575,9 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
}
/*
- * This misses isolated pages which are not accounted for to save counters.
+ * This misses isolated folios which are not accounted for to save counters.
* As the data only determines if reclaim or compaction continues, it is
- * not expected that isolated pages will be a dominating factor.
+ * not expected that isolated folios will be a dominating factor.
*/
unsigned long zone_reclaimable_pages(struct zone *zone)
{
@@ -1039,9 +1050,9 @@ void drop_slab(void)
static inline int is_page_cache_freeable(struct folio *folio)
{
/*
- * A freeable page cache page is referenced only by the caller
- * that isolated the page, the page cache and optional buffer
- * heads at page->private.
+ * A freeable page cache folio is referenced only by the caller
+ * that isolated the folio, the page cache and optional filesystem
+ * private data at folio->private.
*/
return folio_ref_count(folio) - folio_test_private(folio) ==
1 + folio_nr_pages(folio);
@@ -1081,8 +1092,8 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat)
return true;
/*
- * If there are a lot of dirty/writeback pages then do not
- * throttle as throttling will occur when the pages cycle
+ * If there are a lot of dirty/writeback folios then do not
+ * throttle as throttling will occur when the folios cycle
* towards the end of the LRU if still under writeback.
*/
for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1125,7 +1136,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
* short. Failing to make progress or waiting on writeback are
* potentially long-lived events so use a longer timeout. This is shaky
* logic as a failure to make progress could be due to anything from
- * writeback to a slow device to excessive references pages at the tail
+ * writeback to a slow device to excessive referenced folios at the tail
* of the inactive LRU.
*/
switch(reason) {
@@ -1171,8 +1182,8 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
}
/*
- * Account for pages written if tasks are throttled waiting on dirty
- * pages to clean. If enough pages have been cleaned since throttling
+ * Account for folios written if tasks are throttled waiting on dirty
+ * folios to clean. If enough folios have been cleaned since throttling
* started then wakeup the throttled tasks.
*/
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
@@ -1198,18 +1209,18 @@ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
/* possible outcome of pageout() */
typedef enum {
- /* failed to write page out, page is locked */
+ /* failed to write folio out, folio is locked */
PAGE_KEEP,
- /* move page to the active list, page is locked */
+ /* move folio to the active list, folio is locked */
PAGE_ACTIVATE,
- /* page has been sent to the disk successfully, page is unlocked */
+ /* folio has been sent to the disk successfully, folio is unlocked */
PAGE_SUCCESS,
- /* page is clean and locked */
+ /* folio is clean and locked */
PAGE_CLEAN,
} pageout_t;
/*
- * pageout is called by shrink_page_list() for each dirty page.
+ * pageout is called by shrink_folio_list() for each dirty folio.
* Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
@@ -1283,7 +1294,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
}
/*
- * Same as remove_mapping, but if the page is removed from the mapping, it
+ * Same as remove_mapping, but if the folio is removed from the mapping, it
* gets returned with a refcount of 0.
*/
static int __remove_mapping(struct address_space *mapping, struct folio *folio,
@@ -1299,34 +1310,34 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
- * The non racy check for a busy page.
+ * The non racy check for a busy folio.
*
* Must be careful with the order of the tests. When someone has
- * a ref to the page, it may be possible that they dirty it then
- * drop the reference. So if PageDirty is tested before page_count
- * here, then the following race may occur:
+ * a ref to the folio, it may be possible that they dirty it then
+ * drop the reference. So if the dirty flag is tested before the
+ * refcount here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
- * !PageDirty(page) [good]
- * SetPageDirty(page);
- * put_page(page);
- * !page_count(page) [good, discard it]
+ * !folio_test_dirty(folio) [good]
+ * folio_set_dirty(folio);
+ * folio_put(folio);
+ * !refcount(folio) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
- * escape unnoticed. The smp_rmb is needed to ensure the page->flags
- * load is not satisfied before that of page->_refcount.
+ * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
+ * load is not satisfied before that of folio->_refcount.
*
- * Note that if SetPageDirty is always performed via set_page_dirty,
+ * Note that if the dirty flag is always set via folio_mark_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
refcount = 1 + folio_nr_pages(folio);
if (!folio_ref_freeze(folio, refcount))
goto cannot_free;
- /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
+ /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
if (unlikely(folio_test_dirty(folio))) {
folio_ref_unfreeze(folio, refcount);
goto cannot_free;
@@ -1334,12 +1345,14 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (folio_test_swapcache(folio)) {
swp_entry_t swap = folio_swap_entry(folio);
- mem_cgroup_swapout(folio, swap);
+
+ /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
+ mem_cgroup_swapout(folio, swap);
__delete_from_swap_cache(folio, swap, shadow);
xa_unlock_irq(&mapping->i_pages);
- put_swap_page(&folio->page, swap);
+ put_swap_folio(folio, swap);
} else {
void (*free_folio)(struct folio *);
@@ -1355,7 +1368,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
* back.
*
* We also don't store shadows for DAX mappings because the
- * only page cache pages found in these are zero pages
+ * only page cache folios found in these are zero pages
* covering holes, and because we don't want to mix DAX
* exceptional entries and shadow exceptional entries in the
* same address_space.
@@ -1423,14 +1436,14 @@ void folio_putback_lru(struct folio *folio)
folio_put(folio); /* drop ref from isolate */
}
-enum page_references {
- PAGEREF_RECLAIM,
- PAGEREF_RECLAIM_CLEAN,
- PAGEREF_KEEP,
- PAGEREF_ACTIVATE,
+enum folio_references {
+ FOLIOREF_RECLAIM,
+ FOLIOREF_RECLAIM_CLEAN,
+ FOLIOREF_KEEP,
+ FOLIOREF_ACTIVATE,
};
-static enum page_references folio_check_references(struct folio *folio,
+static enum folio_references folio_check_references(struct folio *folio,
struct scan_control *sc)
{
int referenced_ptes, referenced_folio;
@@ -1445,11 +1458,11 @@ static enum page_references folio_check_references(struct folio *folio,
* Let the folio, now marked Mlocked, be moved to the unevictable list.
*/
if (vm_flags & VM_LOCKED)
- return PAGEREF_ACTIVATE;
+ return FOLIOREF_ACTIVATE;
/* rmap lock contention: rotate */
if (referenced_ptes == -1)
- return PAGEREF_KEEP;
+ return FOLIOREF_KEEP;
if (referenced_ptes) {
/*
@@ -1469,34 +1482,34 @@ static enum page_references folio_check_references(struct folio *folio,
folio_set_referenced(folio);
if (referenced_folio || referenced_ptes > 1)
- return PAGEREF_ACTIVATE;
+ return FOLIOREF_ACTIVATE;
/*
* Activate file-backed executable folios after first usage.
*/
if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
- return PAGEREF_ACTIVATE;
+ return FOLIOREF_ACTIVATE;
- return PAGEREF_KEEP;
+ return FOLIOREF_KEEP;
}
/* Reclaim if clean, defer dirty folios to writeback */
if (referenced_folio && folio_is_file_lru(folio))
- return PAGEREF_RECLAIM_CLEAN;
+ return FOLIOREF_RECLAIM_CLEAN;
- return PAGEREF_RECLAIM;
+ return FOLIOREF_RECLAIM;
}
-/* Check if a page is dirty or under writeback */
+/* Check if a folio is dirty or under writeback */
static void folio_check_dirty_writeback(struct folio *folio,
bool *dirty, bool *writeback)
{
struct address_space *mapping;
/*
- * Anonymous pages are not handled by flushers and must be written
+ * Anonymous folios are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them.
- * MADV_FREE anonymous pages are put into inactive file list too.
+ * MADV_FREE anonymous folios are put into inactive file list too.
* They could be mistakenly treated as file lru. So further anon
* test is needed.
*/
@@ -1520,44 +1533,71 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
-static struct page *alloc_demote_page(struct page *page, unsigned long node)
+static struct page *alloc_demote_page(struct page *page, unsigned long private)
{
- struct migration_target_control mtc = {
- /*
- * Allocate from 'node', or fail quickly and quietly.
- * When this happens, 'page' will likely just be discarded
- * instead of migrated.
- */
- .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
- __GFP_THISNODE | __GFP_NOWARN |
- __GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = node
- };
+ struct page *target_page;
+ nodemask_t *allowed_mask;
+ struct migration_target_control *mtc;
+
+ mtc = (struct migration_target_control *)private;
+
+ allowed_mask = mtc->nmask;
+ /*
+ * make sure we allocate from the target node first also trying to
+ * demote or reclaim pages from the target node via kswapd if we are
+ * low on free memory on target node. If we don't do this and if
+ * we have free memory on the slower(lower) memtier, we would start
+ * allocating pages from slower(lower) memory tiers without even forcing
+ * a demotion of cold pages from the target memtier. This can result
+ * in the kernel placing hot pages in slower(lower) memory tiers.
+ */
+ mtc->nmask = NULL;
+ mtc->gfp_mask |= __GFP_THISNODE;
+ target_page = alloc_migration_target(page, (unsigned long)mtc);
+ if (target_page)
+ return target_page;
+
+ mtc->gfp_mask &= ~__GFP_THISNODE;
+ mtc->nmask = allowed_mask;
- return alloc_migration_target(page, (unsigned long)&mtc);
+ return alloc_migration_target(page, (unsigned long)mtc);
}
/*
- * Take pages on @demote_list and attempt to demote them to
- * another node. Pages which are not demoted are left on
- * @demote_pages.
+ * Take folios on @demote_folios and attempt to demote them to another node.
+ * Folios which are not demoted are left on @demote_folios.
*/
-static unsigned int demote_page_list(struct list_head *demote_pages,
+static unsigned int demote_folio_list(struct list_head *demote_folios,
struct pglist_data *pgdat)
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
+ nodemask_t allowed_mask;
+
+ struct migration_target_control mtc = {
+ /*
+ * Allocate from 'node', or fail quickly and quietly.
+ * When this happens, 'page' will likely just be discarded
+ * instead of migrated.
+ */
+ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
+ __GFP_NOMEMALLOC | GFP_NOWAIT,
+ .nid = target_nid,
+ .nmask = &allowed_mask
+ };
- if (list_empty(demote_pages))
+ if (list_empty(demote_folios))
return 0;
if (target_nid == NUMA_NO_NODE)
return 0;
+ node_get_allowed_targets(pgdat, &allowed_mask);
+
/* Demotion ignores all cpuset and mempolicy settings */
- migrate_pages(demote_pages, alloc_demote_page, NULL,
- target_nid, MIGRATE_ASYNC, MR_DEMOTION,
- &nr_succeeded);
+ migrate_pages(demote_folios, alloc_demote_page, NULL,
+ (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
+ &nr_succeeded);
if (current_is_kswapd())
__count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
@@ -1584,17 +1624,15 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
}
/*
- * shrink_page_list() returns the number of reclaimed pages
+ * shrink_folio_list() returns the number of reclaimed pages
*/
-static unsigned int shrink_page_list(struct list_head *page_list,
- struct pglist_data *pgdat,
- struct scan_control *sc,
- struct reclaim_stat *stat,
- bool ignore_references)
-{
- LIST_HEAD(ret_pages);
- LIST_HEAD(free_pages);
- LIST_HEAD(demote_pages);
+static unsigned int shrink_folio_list(struct list_head *folio_list,
+ struct pglist_data *pgdat, struct scan_control *sc,
+ struct reclaim_stat *stat, bool ignore_references)
+{
+ LIST_HEAD(ret_folios);
+ LIST_HEAD(free_folios);
+ LIST_HEAD(demote_folios);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
@@ -1605,16 +1643,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
do_demote_pass = can_demote(pgdat->node_id, sc);
retry:
- while (!list_empty(page_list)) {
+ while (!list_empty(folio_list)) {
struct address_space *mapping;
struct folio *folio;
- enum page_references references = PAGEREF_RECLAIM;
+ enum folio_references references = FOLIOREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
cond_resched();
- folio = lru_to_folio(page_list);
+ folio = lru_to_folio(folio_list);
list_del(&folio->lru);
if (!folio_trylock(folio))
@@ -1633,6 +1671,11 @@ retry:
if (!sc->may_unmap && folio_mapped(folio))
goto keep_locked;
+ /* folio_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ folio_mapped(folio) && folio_test_referenced(folio))
+ goto keep_locked;
+
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested. kswapd will stall and start writing
@@ -1733,7 +1776,7 @@ retry:
folio_unlock(folio);
folio_wait_writeback(folio);
/* then go back and try same folio again */
- list_add_tail(&folio->lru, page_list);
+ list_add_tail(&folio->lru, folio_list);
continue;
}
}
@@ -1742,13 +1785,13 @@ retry:
references = folio_check_references(folio, sc);
switch (references) {
- case PAGEREF_ACTIVATE:
+ case FOLIOREF_ACTIVATE:
goto activate_locked;
- case PAGEREF_KEEP:
+ case FOLIOREF_KEEP:
stat->nr_ref_keep += nr_pages;
goto keep_locked;
- case PAGEREF_RECLAIM:
- case PAGEREF_RECLAIM_CLEAN:
+ case FOLIOREF_RECLAIM:
+ case FOLIOREF_RECLAIM_CLEAN:
; /* try to reclaim the folio below */
}
@@ -1758,7 +1801,7 @@ retry:
*/
if (do_demote_pass &&
(thp_migration_supported() || !folio_test_large(folio))) {
- list_add(&folio->lru, &demote_pages);
+ list_add(&folio->lru, &demote_folios);
folio_unlock(folio);
continue;
}
@@ -1785,7 +1828,7 @@ retry:
*/
if (!folio_entire_mapcount(folio) &&
split_folio_to_list(folio,
- page_list))
+ folio_list))
goto activate_locked;
}
if (!add_to_swap(folio)) {
@@ -1793,7 +1836,7 @@ retry:
goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_folio_to_list(folio,
- page_list))
+ folio_list))
goto activate_locked;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_vm_event(THP_SWPOUT_FALLBACK);
@@ -1805,7 +1848,7 @@ retry:
} else if (folio_test_swapbacked(folio) &&
folio_test_large(folio)) {
/* Split shmem folio */
- if (split_folio_to_list(folio, page_list))
+ if (split_folio_to_list(folio, folio_list))
goto keep_locked;
}
@@ -1870,7 +1913,7 @@ retry:
goto activate_locked;
}
- if (references == PAGEREF_RECLAIM_CLEAN)
+ if (references == FOLIOREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs(folio, sc->gfp_mask))
goto keep_locked;
@@ -1983,13 +2026,13 @@ free_it:
nr_reclaimed += nr_pages;
/*
- * Is there need to periodically free_page_list? It would
+ * Is there need to periodically free_folio_list? It would
* appear not as the counts should be low
*/
if (unlikely(folio_test_large(folio)))
destroy_large_folio(folio);
else
- list_add(&folio->lru, &free_pages);
+ list_add(&folio->lru, &free_folios);
continue;
activate_locked_split:
@@ -2004,9 +2047,8 @@ activate_locked_split:
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (folio_test_swapcache(folio) &&
- (mem_cgroup_swap_full(&folio->page) ||
- folio_test_mlocked(folio)))
- try_to_free_swap(&folio->page);
+ (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
+ folio_free_swap(folio);
VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
if (!folio_test_mlocked(folio)) {
int type = folio_is_file_lru(folio);
@@ -2017,29 +2059,29 @@ activate_locked:
keep_locked:
folio_unlock(folio);
keep:
- list_add(&folio->lru, &ret_pages);
+ list_add(&folio->lru, &ret_folios);
VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
folio_test_unevictable(folio), folio);
}
- /* 'page_list' is always empty here */
+ /* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_reclaimed += demote_page_list(&demote_pages, pgdat);
- /* Folios that could not be demoted are still in @demote_pages */
- if (!list_empty(&demote_pages)) {
- /* Folios which weren't demoted go back on @page_list for retry: */
- list_splice_init(&demote_pages, page_list);
+ nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
+ /* Folios that could not be demoted are still in @demote_folios */
+ if (!list_empty(&demote_folios)) {
+ /* Folios which weren't demoted go back on @folio_list for retry: */
+ list_splice_init(&demote_folios, folio_list);
do_demote_pass = false;
goto retry;
}
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
- mem_cgroup_uncharge_list(&free_pages);
+ mem_cgroup_uncharge_list(&free_folios);
try_to_unmap_flush();
- free_unref_page_list(&free_pages);
+ free_unref_page_list(&free_folios);
- list_splice(&ret_pages, page_list);
+ list_splice(&ret_folios, folio_list);
count_vm_events(PGACTIVATE, pgactivate);
if (plug)
@@ -2048,7 +2090,7 @@ keep:
}
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
- struct list_head *folio_list)
+ struct list_head *folio_list)
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -2076,7 +2118,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
* change in the future.
*/
noreclaim_flag = memalloc_noreclaim_save();
- nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc,
+ nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
&stat, true);
memalloc_noreclaim_restore(noreclaim_flag);
@@ -2135,7 +2177,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
*
* returns how many pages were moved onto *@dst.
*/
-static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
enum lru_list lru)
@@ -2242,8 +2284,8 @@ move:
*
* Context:
*
- * (1) Must be called with an elevated refcount on the page. This is a
- * fundamental difference from isolate_lru_pages() (which is called
+ * (1) Must be called with an elevated refcount on the folio. This is a
+ * fundamental difference from isolate_lru_folios() (which is called
* without a stable reference).
* (2) The lru_lock must not be held.
* (3) Interrupts must be enabled.
@@ -2315,13 +2357,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
}
/*
- * move_pages_to_lru() moves folios from private @list to appropriate LRU list.
+ * move_folios_to_lru() moves folios from private @list to appropriate LRU list.
* On return, @list is reused as a list of folios to be freed by the caller.
*
* Returns the number of pages moved to the given lruvec.
*/
-static unsigned int move_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list)
+static unsigned int move_folios_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
int nr_pages, nr_moved = 0;
LIST_HEAD(folios_to_free);
@@ -2341,7 +2383,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
/*
* The folio_set_lru needs to be kept here for list integrity.
* Otherwise:
- * #0 move_pages_to_lru #1 release_pages
+ * #0 move_folios_to_lru #1 release_pages
* if (!folio_put_testzero())
* if (folio_put_testzero())
* !lru //skip lru_lock
@@ -2398,11 +2440,11 @@ static int current_may_throttle(void)
* shrink_inactive_list() is a helper for shrink_node(). It returns the number
* of reclaimed pages
*/
-static unsigned long
-shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
- struct scan_control *sc, enum lru_list lru)
+static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
+ struct lruvec *lruvec, struct scan_control *sc,
+ enum lru_list lru)
{
- LIST_HEAD(page_list);
+ LIST_HEAD(folio_list);
unsigned long nr_scanned;
unsigned int nr_reclaimed = 0;
unsigned long nr_taken;
@@ -2429,7 +2471,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&lruvec->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
+ nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
@@ -2444,10 +2486,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
+ nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
spin_lock_irq(&lruvec->lru_lock);
- move_pages_to_lru(lruvec, &page_list);
+ move_folios_to_lru(lruvec, &folio_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
@@ -2458,16 +2500,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&lruvec->lru_lock);
lru_note_cost(lruvec, file, stat.nr_pageout);
- mem_cgroup_uncharge_list(&page_list);
- free_unref_page_list(&page_list);
+ mem_cgroup_uncharge_list(&folio_list);
+ free_unref_page_list(&folio_list);
/*
- * If dirty pages are scanned that are not queued for IO, it
+ * If dirty folios are scanned that are not queued for IO, it
* implies that flushers are not doing their job. This can
- * happen when memory pressure pushes dirty pages to the end of
+ * happen when memory pressure pushes dirty folios to the end of
* the LRU before the dirty limits are breached and the dirty
* data has expired. It can also happen when the proportion of
- * dirty pages grows not through writes but through memory
+ * dirty folios grows not through writes but through memory
* pressure reclaiming all the clean cache. And in some cases,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
@@ -2526,7 +2568,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
spin_lock_irq(&lruvec->lru_lock);
- nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
+ nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
@@ -2550,8 +2592,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (unlikely(buffer_heads_over_limit)) {
- if (folio_get_private(folio) && folio_trylock(folio)) {
- if (folio_get_private(folio))
+ if (folio_test_private(folio) && folio_trylock(folio)) {
+ if (folio_test_private(folio))
filemap_release_folio(folio, 0);
folio_unlock(folio);
}
@@ -2586,8 +2628,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
*/
spin_lock_irq(&lruvec->lru_lock);
- nr_activate = move_pages_to_lru(lruvec, &l_active);
- nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+ nr_activate = move_folios_to_lru(lruvec, &l_active);
+ nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
/* Keep all free folios in l_active list */
list_splice(&l_inactive, &l_active);
@@ -2603,7 +2645,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
nr_deactivate, nr_rotated, sc->priority, file);
}
-static unsigned int reclaim_page_list(struct list_head *page_list,
+static unsigned int reclaim_folio_list(struct list_head *folio_list,
struct pglist_data *pgdat)
{
struct reclaim_stat dummy_stat;
@@ -2617,9 +2659,9 @@ static unsigned int reclaim_page_list(struct list_head *page_list,
.no_demotion = 1,
};
- nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false);
- while (!list_empty(page_list)) {
- folio = lru_to_folio(page_list);
+ nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
+ while (!list_empty(folio_list)) {
+ folio = lru_to_folio(folio_list);
list_del(&folio->lru);
folio_putback_lru(folio);
}
@@ -2649,11 +2691,11 @@ unsigned long reclaim_pages(struct list_head *folio_list)
continue;
}
- nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
nid = folio_nid(lru_to_folio(folio_list));
} while (!list_empty(folio_list));
- nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
+ nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
memalloc_noreclaim_restore(noreclaim_flag);
@@ -2683,13 +2725,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
* but large enough to avoid thrashing the aggregate readahead window.
*
* Both inactive lists should also be large enough that each inactive
- * page has a chance to be referenced again before it is reclaimed.
+ * folio has a chance to be referenced again before it is reclaimed.
*
* If that fails and refaulting is observed, the inactive list grows.
*
- * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
* on this LRU, maintained by the pageout code. An inactive_ratio
- * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
+ * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
*
* total target max
* memory ratio inactive
@@ -2728,12 +2770,118 @@ enum scan_balance {
SCAN_FILE,
};
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ if (lru_gen_enabled())
+ return;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned.
*
- * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
- * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
+ * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
+ * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
@@ -2748,7 +2896,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long ap, fp;
enum lru_list lru;
- /* If we have no swap space, do not bother scanning anon pages. */
+ /* If we have no swap space, do not bother scanning anon folios. */
if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
scan_balance = SCAN_FILE;
goto out;
@@ -2947,6 +3095,2747 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
return can_demote(pgdat->node_id, sc);
}
+#ifdef CONFIG_LRU_GEN
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
+#endif
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
+
+#define DEFINE_MAX_SEQ(lruvec) \
+ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
+
+#define DEFINE_MIN_SEQ(lruvec) \
+ unsigned long min_seq[ANON_AND_FILE] = { \
+ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
+ READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
+ }
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+ if (memcg) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ /* for hotadd_new_pgdat() */
+ if (!lruvec->pgdat)
+ lruvec->pgdat = pgdat;
+
+ return lruvec;
+ }
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ if (!can_demote(pgdat->node_id, sc) &&
+ mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
+ return 0;
+
+ return mem_cgroup_swappiness(memcg);
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+ return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+ /* see the comment on lru_gen_struct */
+ return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
+ get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
+ get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ * mm_struct list
+ ******************************************************************************/
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+ static struct lru_gen_mm_list mm_list = {
+ .fifo = LIST_HEAD_INIT(mm_list.fifo),
+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
+ };
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ return &memcg->mm_list;
+#endif
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+ return &mm_list;
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
+#ifdef CONFIG_MEMCG
+ VM_WARN_ON_ONCE(mm->lru_gen.memcg);
+ mm->lru_gen.memcg = memcg;
+#endif
+ spin_lock(&mm_list->lock);
+
+ for_each_node_state(nid, N_MEMORY) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ /* the first addition since the last iteration */
+ if (lruvec->mm_state.tail == &mm_list->fifo)
+ lruvec->mm_state.tail = &mm->lru_gen.list;
+ }
+
+ list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
+
+ spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct lru_gen_mm_list *mm_list;
+ struct mem_cgroup *memcg = NULL;
+
+ if (list_empty(&mm->lru_gen.list))
+ return;
+
+#ifdef CONFIG_MEMCG
+ memcg = mm->lru_gen.memcg;
+#endif
+ mm_list = get_mm_list(memcg);
+
+ spin_lock(&mm_list->lock);
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ /* where the last iteration ended (exclusive) */
+ if (lruvec->mm_state.tail == &mm->lru_gen.list)
+ lruvec->mm_state.tail = lruvec->mm_state.tail->next;
+
+ /* where the current iteration continues (inclusive) */
+ if (lruvec->mm_state.head != &mm->lru_gen.list)
+ continue;
+
+ lruvec->mm_state.head = lruvec->mm_state.head->next;
+ /* the deletion ends the current iteration */
+ if (lruvec->mm_state.head == &mm_list->fifo)
+ WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
+ }
+
+ list_del_init(&mm->lru_gen.list);
+
+ spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+ mem_cgroup_put(mm->lru_gen.memcg);
+ mm->lru_gen.memcg = NULL;
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+ struct task_struct *task = rcu_dereference_protected(mm->owner, true);
+
+ VM_WARN_ON_ONCE(task->mm != mm);
+ lockdep_assert_held(&task->alloc_lock);
+
+ /* for mm_update_next_owner() */
+ if (mem_cgroup_disabled())
+ return;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(task);
+ rcu_read_unlock();
+ if (memcg == mm->lru_gen.memcg)
+ return;
+
+ VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
+ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
+
+ lru_gen_del_mm(mm);
+ lru_gen_add_mm(mm);
+}
+#endif
+
+/*
+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
+ * bits in a bitmap, k is the number of hash functions and n is the number of
+ * inserted items.
+ *
+ * Page table walkers use one of the two filters to reduce their search space.
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
+ * aging uses the double-buffering technique to flip to the other filter each
+ * time it produces a new generation. For non-leaf entries that have enough
+ * leaf entries, the aging carries them over to the next generation in
+ * walk_pmd_range(); the eviction also report them when walking the rmap
+ * in lru_gen_look_around().
+ *
+ * For future optimizations:
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
+ * freed after the RCU grace period and reallocated if needed again.
+ * 2. And when reallocating, it's worth scaling its size according to the number
+ * of inserted entries in the other filter, to reduce the memory overhead on
+ * small systems and false positives on large systems.
+ * 3. Jenkins' hash function is an alternative to Knuth's.
+ */
+#define BLOOM_FILTER_SHIFT 15
+
+static inline int filter_gen_from_seq(unsigned long seq)
+{
+ return seq % NR_BLOOM_FILTERS;
+}
+
+static void get_item_key(void *item, int *key)
+{
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
+
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
+}
+
+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = lruvec->mm_state.filters[gen];
+ if (filter) {
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+ return;
+ }
+
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
+}
+
+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ if (!filter)
+ return;
+
+ get_item_key(item, key);
+
+ if (!test_bit(key[0], filter))
+ set_bit(key[0], filter);
+ if (!test_bit(key[1], filter))
+ set_bit(key[1], filter);
+}
+
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
+ if (!filter)
+ return true;
+
+ get_item_key(item, key);
+
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
+{
+ int i;
+ int hist;
+
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+ if (walk) {
+ hist = lru_hist_from_seq(walk->max_seq);
+
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(lruvec->mm_state.stats[hist][i],
+ lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
+ walk->mm_stats[i] = 0;
+ }
+ }
+
+ if (NR_HIST_GENS > 1 && last) {
+ hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
+
+ for (i = 0; i < NR_MM_STATS; i++)
+ WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
+ }
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+ int type;
+ unsigned long size = 0;
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
+
+ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
+ return true;
+
+ clear_bit(key, &mm->lru_gen.bitmap);
+
+ for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+ get_mm_counter(mm, MM_ANONPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
+ }
+
+ if (size < MIN_LRU_BATCH)
+ return true;
+
+ return !mmget_not_zero(mm);
+}
+
+static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
+ struct mm_struct **iter)
+{
+ bool first = false;
+ bool last = true;
+ struct mm_struct *mm = NULL;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+ /*
+ * There are four interesting cases for this page table walker:
+ * 1. It tries to start a new iteration of mm_list with a stale max_seq;
+ * there is nothing left to do.
+ * 2. It's the first of the current generation, and it needs to reset
+ * the Bloom filter for the next generation.
+ * 3. It reaches the end of mm_list, and it needs to increment
+ * mm_state->seq; the iteration is done.
+ * 4. It's the last of the current generation, and it needs to reset the
+ * mm stats counters for the next generation.
+ */
+ spin_lock(&mm_list->lock);
+
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
+ VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
+ VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
+
+ if (walk->max_seq <= mm_state->seq) {
+ if (!*iter)
+ last = false;
+ goto done;
+ }
+
+ if (!mm_state->nr_walkers) {
+ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+
+ mm_state->head = mm_list->fifo.next;
+ first = true;
+ }
+
+ while (!mm && mm_state->head != &mm_list->fifo) {
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
+
+ mm_state->head = mm_state->head->next;
+
+ /* force scan for those added after the last iteration */
+ if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
+ mm_state->tail = mm_state->head;
+ walk->force_scan = true;
+ }
+
+ if (should_skip_mm(mm, walk))
+ mm = NULL;
+ }
+
+ if (mm_state->head == &mm_list->fifo)
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+done:
+ if (*iter && !mm)
+ mm_state->nr_walkers--;
+ if (!*iter && mm)
+ mm_state->nr_walkers++;
+
+ if (mm_state->nr_walkers)
+ last = false;
+
+ if (*iter || last)
+ reset_mm_stats(lruvec, walk, last);
+
+ spin_unlock(&mm_list->lock);
+
+ if (mm && first)
+ reset_bloom_filter(lruvec, walk->max_seq + 1);
+
+ if (*iter)
+ mmput_async(*iter);
+
+ *iter = mm;
+
+ return last;
+}
+
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
+{
+ bool success = false;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
+
+ spin_lock(&mm_list->lock);
+
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
+
+ if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
+ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
+
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
+ reset_mm_stats(lruvec, NULL, true);
+ success = true;
+ }
+
+ spin_unlock(&mm_list->lock);
+
+ return success;
+}
+
+/******************************************************************************
+ * refault feedback loop
+ ******************************************************************************/
+
+/*
+ * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
+ *
+ * The P term is refaulted/(evicted+protected) from a tier in the generation
+ * currently being evicted; the I term is the exponential moving average of the
+ * P term over the generations previously evicted, using the smoothing factor
+ * 1/2; the D term isn't supported.
+ *
+ * The setpoint (SP) is always the first tier of one type; the process variable
+ * (PV) is either any tier of the other type or any other tier of the same
+ * type.
+ *
+ * The error is the difference between the SP and the PV; the correction is to
+ * turn off protection when SP>PV or turn on protection when SP<PV.
+ *
+ * For future optimizations:
+ * 1. The D term may discount the other two terms over time so that long-lived
+ * generations can resist stale information.
+ */
+struct ctrl_pos {
+ unsigned long refaulted;
+ unsigned long total;
+ int gain;
+};
+
+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
+ struct ctrl_pos *pos)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ pos->total = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ pos->total += lrugen->protected[hist][type][tier - 1];
+ pos->gain = gain;
+}
+
+static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
+{
+ int hist, tier;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
+ unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
+
+ lockdep_assert_held(&lruvec->lru_lock);
+
+ if (!carryover && !clear)
+ return;
+
+ hist = lru_hist_from_seq(seq);
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ if (carryover) {
+ unsigned long sum;
+
+ sum = lrugen->avg_refaulted[type][tier] +
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
+
+ sum = lrugen->avg_total[type][tier] +
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ sum += lrugen->protected[hist][type][tier - 1];
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
+ }
+
+ if (clear) {
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
+ if (tier)
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
+ }
+ }
+}
+
+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
+{
+ /*
+ * Return true if the PV has a limited number of refaults or a lower
+ * refaulted/total than the SP.
+ */
+ return pv->refaulted < MIN_LRU_BATCH ||
+ pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
+ (sp->refaulted + 1) * pv->total * pv->gain;
+}
+
+/******************************************************************************
+ * the aging
+ ******************************************************************************/
+
+/* promote pages accessed through page tables */
+static int folio_update_gen(struct folio *folio, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_folio() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_folio_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+/* protect pages accessed multiple times through file descriptors */
+static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
+{
+ int type = folio_is_file_lru(folio);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+ unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
+
+ VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
+
+ do {
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* folio_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
+ new_gen = (old_gen + 1) % MAX_NR_GENS;
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
+ /* for folio_end_writeback() */
+ if (reclaiming)
+ new_flags |= BIT(PG_reclaim);
+ } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
+
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+
+ return new_gen;
+}
+
+static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
+ int old_gen, int new_gen)
+{
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
+
+ walk->batched++;
+
+ walk->nr_pages[old_gen][type][zone] -= delta;
+ walk->nr_pages[new_gen][type][zone] += delta;
+}
+
+static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
+{
+ int gen, type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ walk->batched = 0;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ int delta = walk->nr_pages[gen][type][zone];
+
+ if (!delta)
+ continue;
+
+ walk->nr_pages[gen][type][zone] = 0;
+ WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
+ lrugen->nr_pages[gen][type][zone] + delta);
+
+ if (lru_gen_is_active(lruvec, gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ }
+}
+
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
+{
+ struct address_space *mapping;
+ struct vm_area_struct *vma = args->vma;
+ struct lru_gen_mm_walk *walk = args->private;
+
+ if (!vma_is_accessible(vma))
+ return true;
+
+ if (is_vm_hugetlb_page(vma))
+ return true;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+ return true;
+
+ if (vma == get_gate_vma(vma->vm_mm))
+ return true;
+
+ if (vma_is_anonymous(vma))
+ return !walk->can_swap;
+
+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
+ return true;
+
+ mapping = vma->vm_file->f_mapping;
+ if (mapping_unevictable(mapping))
+ return true;
+
+ if (shmem_mapping(mapping))
+ return !walk->can_swap;
+
+ /* to exclude special mappings like dax, etc. */
+ return !mapping->a_ops->read_folio;
+}
+
+/*
+ * Some userspace memory allocators map many single-page VMAs. Instead of
+ * returning back to the PGD table for each of such VMAs, finish an entire PMD
+ * table to reduce zigzags and improve cache performance.
+ */
+static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
+ unsigned long *vm_start, unsigned long *vm_end)
+{
+ unsigned long start = round_up(*vm_end, size);
+ unsigned long end = (start | ~mask) + 1;
+ VMA_ITERATOR(vmi, args->mm, start);
+
+ VM_WARN_ON_ONCE(mask & size);
+ VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
+
+ for_each_vma(vmi, args->vma) {
+ if (end && end <= args->vma->vm_start)
+ return false;
+
+ if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
+ continue;
+
+ *vm_start = max(start, args->vma->vm_start);
+ *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
+
+ return true;
+ }
+
+ return false;
+}
+
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pmd_pfn(pmd);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
+ return -1;
+
+ if (WARN_ON_ONCE(pmd_devmap(pmd)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+#endif
+
+static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat, bool can_swap)
+{
+ struct folio *folio;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ folio = pfn_folio(pfn);
+ if (folio_nid(folio) != pgdat->node_id)
+ return NULL;
+
+ if (folio_memcg_rcu(folio) != memcg)
+ return NULL;
+
+ /* file VMAs can contain anon pages from COW */
+ if (!folio_is_file_lru(folio) && !can_swap)
+ return NULL;
+
+ return folio;
+}
+
+static bool suitable_to_scan(int total, int young)
+{
+ int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
+
+ /* suitable if the average number of young PTEs per cacheline is >=1 */
+ return young * n >= total;
+}
+
+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pte_t *pte;
+ spinlock_t *ptl;
+ unsigned long addr;
+ int total = 0;
+ int young = 0;
+ struct lru_gen_mm_walk *walk = args->private;
+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+
+ VM_WARN_ON_ONCE(pmd_leaf(*pmd));
+
+ ptl = pte_lockptr(args->mm, pmd);
+ if (!spin_trylock(ptl))
+ return false;
+
+ arch_enter_lazy_mmu_mode();
+
+ pte = pte_offset_map(pmd, start & PMD_MASK);
+restart:
+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+ struct folio *folio;
+
+ total++;
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+
+ pfn = get_pte_pfn(pte[i], args->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i])) {
+ walk->mm_stats[MM_LEAF_OLD]++;
+ continue;
+ }
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ young++;
+ walk->mm_stats[MM_LEAF_YOUNG]++;
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ }
+
+ if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
+ goto restart;
+
+ pte_unmap(pte);
+
+ arch_leave_lazy_mmu_mode();
+ spin_unlock(ptl);
+
+ return suitable_to_scan(total, young);
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
+{
+ int i;
+ pmd_t *pmd;
+ spinlock_t *ptl;
+ struct lru_gen_mm_walk *walk = args->private;
+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
+
+ VM_WARN_ON_ONCE(pud_leaf(*pud));
+
+ /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
+ if (*start == -1) {
+ *start = next;
+ return;
+ }
+
+ i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
+ if (i && i <= MIN_LRU_BATCH) {
+ __set_bit(i - 1, bitmap);
+ return;
+ }
+
+ pmd = pmd_offset(pud, *start);
+
+ ptl = pmd_lockptr(args->mm, pmd);
+ if (!spin_trylock(ptl))
+ goto done;
+
+ arch_enter_lazy_mmu_mode();
+
+ do {
+ unsigned long pfn;
+ struct folio *folio;
+ unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
+
+ pfn = get_pmd_pfn(pmd[i], vma, addr);
+ if (pfn == -1)
+ goto next;
+
+ if (!pmd_trans_huge(pmd[i])) {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
+ pmdp_test_and_clear_young(vma, addr, pmd + i);
+ goto next;
+ }
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
+ if (!folio)
+ goto next;
+
+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
+ goto next;
+
+ walk->mm_stats[MM_LEAF_YOUNG]++;
+
+ if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen >= 0 && old_gen != new_gen)
+ update_batch_size(walk, folio, old_gen, new_gen);
+next:
+ i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
+ } while (i <= MIN_LRU_BATCH);
+
+ arch_leave_lazy_mmu_mode();
+ spin_unlock(ptl);
+done:
+ *start = -1;
+ bitmap_zero(bitmap, MIN_LRU_BATCH);
+}
+#else
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
+{
+}
+#endif
+
+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pmd_t *pmd;
+ unsigned long next;
+ unsigned long addr;
+ struct vm_area_struct *vma;
+ unsigned long pos = -1;
+ struct lru_gen_mm_walk *walk = args->private;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+
+ VM_WARN_ON_ONCE(pud_leaf(*pud));
+
+ /*
+ * Finish an entire PMD in two passes: the first only reaches to PTE
+ * tables to avoid taking the PMD lock; the second, if necessary, takes
+ * the PMD lock to clear the accessed bit in PMD entries.
+ */
+ pmd = pmd_offset(pud, start & PUD_MASK);
+restart:
+ /* walk_pte_range() may call get_next_vma() */
+ vma = args->vma;
+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
+ pmd_t val = pmd_read_atomic(pmd + i);
+
+ /* for pmd_read_atomic() */
+ barrier();
+
+ next = pmd_addr_end(addr, end);
+
+ if (!pmd_present(val) || is_huge_zero_pmd(val)) {
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+ continue;
+ }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(val)) {
+ unsigned long pfn = pmd_pfn(val);
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
+
+ walk->mm_stats[MM_LEAF_TOTAL]++;
+
+ if (!pmd_young(val)) {
+ walk->mm_stats[MM_LEAF_OLD]++;
+ continue;
+ }
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ continue;
+
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ continue;
+ }
+#endif
+ walk->mm_stats[MM_NONLEAF_TOTAL]++;
+
+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (!pmd_young(val))
+ continue;
+
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
+ }
+#endif
+ if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
+ continue;
+
+ walk->mm_stats[MM_NONLEAF_FOUND]++;
+
+ if (!walk_pte_range(&val, addr, next, args))
+ continue;
+
+ walk->mm_stats[MM_NONLEAF_ADDED]++;
+
+ /* carry over to the next generation */
+ update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
+ }
+
+ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
+
+ if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
+ goto restart;
+}
+
+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
+ struct mm_walk *args)
+{
+ int i;
+ pud_t *pud;
+ unsigned long addr;
+ unsigned long next;
+ struct lru_gen_mm_walk *walk = args->private;
+
+ VM_WARN_ON_ONCE(p4d_leaf(*p4d));
+
+ pud = pud_offset(p4d, start & P4D_MASK);
+restart:
+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
+ pud_t val = READ_ONCE(pud[i]);
+
+ next = pud_addr_end(addr, end);
+
+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
+ continue;
+
+ walk_pmd_range(&val, addr, next, args);
+
+ /* a racy check to curtail the waiting time */
+ if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
+ return 1;
+
+ if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
+ end = (addr | ~PUD_MASK) + 1;
+ goto done;
+ }
+ }
+
+ if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
+ goto restart;
+
+ end = round_up(end, P4D_SIZE);
+done:
+ if (!end || !args->vma)
+ return 1;
+
+ walk->next_addr = max(end, args->vma->vm_start);
+
+ return -EAGAIN;
+}
+
+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
+{
+ static const struct mm_walk_ops mm_walk_ops = {
+ .test_walk = should_skip_vma,
+ .p4d_entry = walk_pud_range,
+ };
+
+ int err;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ walk->next_addr = FIRST_USER_ADDRESS;
+
+ do {
+ err = -EBUSY;
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ break;
+
+ /* the caller might be holding the lock for write */
+ if (mmap_read_trylock(mm)) {
+ err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
+
+ mmap_read_unlock(mm);
+ }
+
+ mem_cgroup_unlock_pages();
+
+ if (walk->batched) {
+ spin_lock_irq(&lruvec->lru_lock);
+ reset_batch_size(lruvec, walk);
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while (err == -EAGAIN);
+}
+
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+{
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
+ if (pgdat && current_is_kswapd()) {
+ VM_WARN_ON_ONCE(walk);
+
+ walk = &pgdat->mm_walk;
+ } else if (!pgdat && !walk) {
+ VM_WARN_ON_ONCE(current_is_kswapd());
+
+ walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ }
+
+ current->reclaim_state->mm_walk = walk;
+
+ return walk;
+}
+
+static void clear_mm_walk(void)
+{
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
+
+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
+
+ current->reclaim_state->mm_walk = NULL;
+
+ if (!current_is_kswapd())
+ kfree(walk);
+}
+
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
+{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ new_gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
+ reset_ctrl_pos(lruvec, type, true);
+ WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
+}
+
+static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
+{
+ int gen, type, zone;
+ bool success = false;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ DEFINE_MIN_SEQ(lruvec);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+ /* find the oldest populated generation */
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
+ gen = lru_gen_from_seq(min_seq[type]);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ goto next;
+ }
+
+ min_seq[type]++;
+ }
+next:
+ ;
+ }
+
+ /* see the comment on lru_gen_struct */
+ if (can_swap) {
+ min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
+ min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ if (min_seq[type] == lrugen->min_seq[type])
+ continue;
+
+ reset_ctrl_pos(lruvec, type, true);
+ WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
+ success = true;
+ }
+
+ return success;
+}
+
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
+{
+ int prev, next;
+ int type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+
+ for (type = ANON_AND_FILE - 1; type >= 0; type--) {
+ if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
+ continue;
+
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
+
+ while (!inc_min_seq(lruvec, type, can_swap)) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+ }
+
+ /*
+ * Update the active/inactive LRU sizes for compatibility. Both sides of
+ * the current max_seq need to be covered, since max_seq+1 can overlap
+ * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
+ * overlap, cold/hot inversion happens.
+ */
+ prev = lru_gen_from_seq(lrugen->max_seq - 1);
+ next = lru_gen_from_seq(lrugen->max_seq + 1);
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ long delta = lrugen->nr_pages[prev][type][zone] -
+ lrugen->nr_pages[next][type][zone];
+
+ if (!delta)
+ continue;
+
+ __update_lru_size(lruvec, lru, zone, delta);
+ __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
+ }
+ }
+
+ for (type = 0; type < ANON_AND_FILE; type++)
+ reset_ctrl_pos(lruvec, type, false);
+
+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
+ /* make sure preceding modifications appear */
+ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
+
+ spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, bool force_scan)
+{
+ bool success;
+ struct lru_gen_mm_walk *walk;
+ struct mm_struct *mm = NULL;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
+
+ /* see the comment in iterate_mm_list() */
+ if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
+ success = false;
+ goto done;
+ }
+
+ /*
+ * If the hardware doesn't automatically set the accessed bit, fallback
+ * to lru_gen_look_around(), which only clears the accessed bit in a
+ * handful of PTEs. Spreading the work out over a period of time usually
+ * is less efficient, but it avoids bursty page faults.
+ */
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ success = iterate_mm_list_nowalk(lruvec, max_seq);
+ goto done;
+ }
+
+ walk = set_mm_walk(NULL);
+ if (!walk) {
+ success = iterate_mm_list_nowalk(lruvec, max_seq);
+ goto done;
+ }
+
+ walk->lruvec = lruvec;
+ walk->max_seq = max_seq;
+ walk->can_swap = can_swap;
+ walk->force_scan = force_scan;
+
+ do {
+ success = iterate_mm_list(lruvec, walk, &mm);
+ if (mm)
+ walk_mm(lruvec, mm, walk);
+
+ cond_resched();
+ } while (mm);
+done:
+ if (!success) {
+ if (sc->priority <= DEF_PRIORITY - 2)
+ wait_event_killable(lruvec->mm_state.wait,
+ max_seq < READ_ONCE(lrugen->max_seq));
+
+ return max_seq < READ_ONCE(lrugen->max_seq);
+ }
+
+ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
+
+ inc_max_seq(lruvec, can_swap, force_scan);
+ /* either this sees any waiters or they will see updated max_seq */
+ if (wq_has_sleeper(&lruvec->mm_state.wait))
+ wake_up_all(&lruvec->mm_state.wait);
+
+ return true;
+}
+
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+ int gen, type, zone;
+ unsigned long old = 0;
+ unsigned long young = 0;
+ unsigned long total = 0;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ unsigned long size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ total += size;
+ if (seq == max_seq)
+ young += size;
+ else if (seq + MIN_NR_GENS == max_seq)
+ old += size;
+ }
+ }
+
+ /* try to scrape all its memory if this memcg was deleted */
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+ /*
+ * The aging tries to be lazy to reduce the overhead, while the eviction
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ * ideal number of generations is MIN_NR_GENS+1.
+ */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
+ return true;
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ return false;
+
+ /*
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+ * of the total number of pages for each generation. A reasonable range
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+ * aging cares about the upper bound of hot pages, while the eviction
+ * cares about the lower bound of cold pages.
+ */
+ if (young * MIN_NR_GENS > total)
+ return true;
+ if (old * (MIN_NR_GENS + 2) < total)
+ return true;
+
+ return false;
+}
+
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
+{
+ bool need_aging;
+ unsigned long nr_to_scan;
+ int swappiness = get_swappiness(lruvec, sc);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(memcg))
+ return false;
+
+ need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+
+ if (min_ttl) {
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ /* the size is likely too small to be helpful */
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+ return false;
+ }
+
+ if (need_aging)
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+
+ return true;
+}
+
+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct mem_cgroup *memcg;
+ bool success = false;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+
+ VM_WARN_ON_ONCE(!current_is_kswapd());
+
+ sc->last_reclaimed = sc->nr_reclaimed;
+
+ /*
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ return;
+ }
+
+ set_mm_walk(pgdat);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+
+ if (age_lruvec(lruvec, sc, min_ttl))
+ success = true;
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ clear_mm_walk();
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (success || !min_ttl || sc->order)
+ return;
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is all memcgs are
+ * either below min or empty.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
+}
+
+/*
+ * This function exploits spatial locality when shrink_folio_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
+ * the scan was done cacheline efficiently, it adds the PMD entry pointing to
+ * the PTE table to the Bloom filter. This forms a feedback loop between the
+ * eviction and the aging.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ struct lru_gen_mm_walk *walk;
+ int young = 0;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct folio *folio = pfn_folio(pvmw->pfn);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ /* avoid taking the LRU lock under the PTL when possible */
+ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap);
+ if (!folio)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ young++;
+
+ if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
+ !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio)))
+ folio_mark_dirty(folio);
+
+ old_gen = folio_lru_gen(folio);
+ if (old_gen < 0)
+ folio_set_referenced(folio);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
+ /* feedback from rmap walkers to page table walkers */
+ if (suitable_to_scan(i, young))
+ update_bloom_filter(lruvec, max_seq, pvmw->pmd);
+
+ if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ folio_activate(folio);
+ }
+ return;
+ }
+
+ /* folio_update_gen() requires stable folio_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ if (!walk) {
+ spin_lock_irq(&lruvec->lru_lock);
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+ }
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ folio = pfn_folio(pte_pfn(pte[i]));
+ if (folio_memcg_rcu(folio) != memcg)
+ continue;
+
+ old_gen = folio_update_gen(folio, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ if (walk)
+ update_batch_size(walk, folio, old_gen, new_gen);
+ else
+ lru_gen_update_size(lruvec, folio, old_gen, new_gen);
+ }
+
+ if (!walk)
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+
+/******************************************************************************
+ * the eviction
+ ******************************************************************************/
+
+static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
+{
+ bool success;
+ int gen = folio_lru_gen(folio);
+ int type = folio_is_file_lru(folio);
+ int zone = folio_zonenum(folio);
+ int delta = folio_nr_pages(folio);
+ int refs = folio_lru_refs(folio);
+ int tier = lru_tier_from_refs(refs);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
+
+ /* unevictable */
+ if (!folio_evictable(folio)) {
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+ folio_set_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
+ __count_vm_events(UNEVICTABLE_PGCULLED, delta);
+ return true;
+ }
+
+ /* dirty lazyfree */
+ if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+ folio_set_swapbacked(folio);
+ lruvec_add_folio_tail(lruvec, folio);
+ return true;
+ }
+
+ /* promoted */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
+ /* protected */
+ if (tier > tier_idx) {
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
+
+ gen = folio_inc_gen(lruvec, folio, false);
+ list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
+
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
+ lrugen->protected[hist][type][tier - 1] + delta);
+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
+ return true;
+ }
+
+ /* waiting for writeback */
+ if (folio_test_locked(folio) || folio_test_writeback(folio) ||
+ (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
+ gen = folio_inc_gen(lruvec, folio, true);
+ list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
+ return false;
+}
+
+static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
+{
+ bool success;
+
+ /* unmapping inhibited */
+ if (!sc->may_unmap && folio_mapped(folio))
+ return false;
+
+ /* swapping inhibited */
+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ (folio_test_dirty(folio) ||
+ (folio_test_anon(folio) && !folio_test_swapcache(folio))))
+ return false;
+
+ /* raced with release_pages() */
+ if (!folio_try_get(folio))
+ return false;
+
+ /* raced with another isolation */
+ if (!folio_test_clear_lru(folio)) {
+ folio_put(folio);
+ return false;
+ }
+
+ /* see the comment on MAX_NR_TIERS */
+ if (!folio_test_referenced(folio))
+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
+
+ /* for shrink_folio_list() */
+ folio_clear_reclaim(folio);
+ folio_clear_referenced(folio);
+
+ success = lru_gen_del_folio(lruvec, folio, true);
+ VM_WARN_ON_ONCE_FOLIO(!success, folio);
+
+ return true;
+}
+
+static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
+ int type, int tier, struct list_head *list)
+{
+ int gen, zone;
+ enum vm_event_item item;
+ int sorted = 0;
+ int scanned = 0;
+ int isolated = 0;
+ int remaining = MAX_LRU_BATCH;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+
+ VM_WARN_ON_ONCE(!list_empty(list));
+
+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
+ return 0;
+
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
+ LIST_HEAD(moved);
+ int skipped = 0;
+ struct list_head *head = &lrugen->lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct folio *folio = lru_to_folio(head);
+ int delta = folio_nr_pages(folio);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ scanned += delta;
+
+ if (sort_folio(lruvec, folio, tier))
+ sorted += delta;
+ else if (isolate_folio(lruvec, folio, sc)) {
+ list_add(&folio->lru, list);
+ isolated += delta;
+ } else {
+ list_move(&folio->lru, &moved);
+ skipped += delta;
+ }
+
+ if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
+ break;
+ }
+
+ if (skipped) {
+ list_splice(&moved, head);
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
+ }
+
+ if (!remaining || isolated >= MIN_LRU_BATCH)
+ break;
+ }
+
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (!cgroup_reclaim(sc)) {
+ __count_vm_events(item, isolated);
+ __count_vm_events(PGREFILL, sorted);
+ }
+ __count_memcg_events(memcg, item, isolated);
+ __count_memcg_events(memcg, PGREFILL, sorted);
+ __count_vm_events(PGSCAN_ANON + type, isolated);
+
+ /*
+ * There might not be eligible pages due to reclaim_idx, may_unmap and
+ * may_writepage. Check the remaining to prevent livelock if it's not
+ * making progress.
+ */
+ return isolated || !remaining ? scanned : 0;
+}
+
+static int get_tier_idx(struct lruvec *lruvec, int type)
+{
+ int tier;
+ struct ctrl_pos sp, pv;
+
+ /*
+ * To leave a margin for fluctuations, use a larger gain factor (1:2).
+ * This value is chosen because any other tier would have at least twice
+ * as many refaults as the first tier.
+ */
+ read_ctrl_pos(lruvec, type, 0, 1, &sp);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_ctrl_pos(lruvec, type, tier, 2, &pv);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ return tier - 1;
+}
+
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
+{
+ int type, tier;
+ struct ctrl_pos sp, pv;
+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
+
+ /*
+ * Compare the first tier of anon with that of file to determine which
+ * type to scan. Also need to compare other tiers of the selected type
+ * with the first tier of the other type to determine the last tier (of
+ * the selected type) to evict.
+ */
+ read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
+ read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
+ type = positive_ctrl_err(&sp, &pv);
+
+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
+ if (!positive_ctrl_err(&sp, &pv))
+ break;
+ }
+
+ *tier_idx = tier - 1;
+
+ return type;
+}
+
+static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ int *type_scanned, struct list_head *list)
+{
+ int i;
+ int type;
+ int scanned;
+ int tier = -1;
+ DEFINE_MIN_SEQ(lruvec);
+
+ /*
+ * Try to make the obvious choice first. When anon and file are both
+ * available from the same generation, interpret swappiness 1 as file
+ * first and 200 as anon first.
+ */
+ if (!swappiness)
+ type = LRU_GEN_FILE;
+ else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
+ type = LRU_GEN_ANON;
+ else if (swappiness == 1)
+ type = LRU_GEN_FILE;
+ else if (swappiness == 200)
+ type = LRU_GEN_ANON;
+ else
+ type = get_type_to_scan(lruvec, swappiness, &tier);
+
+ for (i = !swappiness; i < ANON_AND_FILE; i++) {
+ if (tier < 0)
+ tier = get_tier_idx(lruvec, type);
+
+ scanned = scan_folios(lruvec, sc, type, tier, list);
+ if (scanned)
+ break;
+
+ type = !type;
+ tier = -1;
+ }
+
+ *type_scanned = type;
+
+ return scanned;
+}
+
+static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
+{
+ int type;
+ int scanned;
+ int reclaimed;
+ LIST_HEAD(list);
+ struct folio *folio;
+ enum vm_event_item item;
+ struct reclaim_stat stat;
+ struct lru_gen_mm_walk *walk;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
+
+ scanned += try_to_inc_min_seq(lruvec, swappiness);
+
+ if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
+ scanned = 0;
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ if (list_empty(&list))
+ return scanned;
+
+ reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+
+ list_for_each_entry(folio, &list, lru) {
+ /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
+ if (folio_test_workingset(folio))
+ folio_set_referenced(folio);
+
+ /* don't add rejected pages to the oldest generation */
+ if (folio_test_reclaim(folio) &&
+ (folio_test_dirty(folio) || folio_test_writeback(folio)))
+ folio_clear_active(folio);
+ else
+ folio_set_active(folio);
+ }
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ move_folios_to_lru(lruvec, &list);
+
+ walk = current->reclaim_state->mm_walk;
+ if (walk && walk->batched)
+ reset_batch_size(lruvec, walk);
+
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(item, reclaimed);
+ __count_memcg_events(memcg, item, reclaimed);
+ __count_vm_events(PGSTEAL_ANON + type, reclaimed);
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_uncharge_list(&list);
+ free_unref_page_list(&list);
+
+ sc->nr_reclaimed += reclaimed;
+
+ if (need_swapping && type == LRU_GEN_ANON)
+ *need_swapping = true;
+
+ return scanned;
+}
+
+/*
+ * For future optimizations:
+ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
+ * reclaim.
+ */
+static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
+ bool can_swap, bool *need_aging)
+{
+ unsigned long nr_to_scan;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (mem_cgroup_below_min(memcg) ||
+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ return 0;
+
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+ if (!*need_aging)
+ return nr_to_scan;
+
+ /* skip the aging path at the default priority */
+ if (sc->priority == DEF_PRIORITY)
+ goto done;
+
+ /* leave the work to lru_gen_age_node() */
+ if (current_is_kswapd())
+ return 0;
+
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
+ return nr_to_scan;
+done:
+ return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+}
+
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg at most once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the allocation
+ * may succeed if all suitable zones are somewhat safe. In either case,
+ * it's better to stop now, and restart later if necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
+ unsigned long scanned = 0;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(lruvec_pgdat(lruvec));
+
+ while (true) {
+ int delta;
+ int swappiness;
+ unsigned long nr_to_scan;
+
+ if (sc->may_swap)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
+ swappiness = 1;
+ else
+ swappiness = 0;
+
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+ if (!nr_to_scan)
+ goto done;
+
+ delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
+ if (!delta)
+ goto done;
+
+ scanned += delta;
+ if (scanned >= nr_to_scan)
+ break;
+
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ break;
+
+ cond_resched();
+ }
+
+ /* see the comment in lru_gen_age_node() */
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+ sc->memcgs_need_aging = false;
+done:
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+}
+
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ if (lrugen->enabled) {
+ enum lru_list lru;
+
+ for_each_evictable_lru(lru) {
+ if (!list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+ } else {
+ int gen, type, zone;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
+
+ lruvec_del_folio(lruvec, folio);
+ success = lru_gen_add_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ success = lru_gen_del_folio(lruvec, folio, false);
+ VM_WARN_ON_ONCE(!success);
+ lruvec_add_folio(lruvec, folio);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void lru_gen_change_state(bool enabled)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enabled == lru_gen_enabled())
+ goto unlock;
+
+ if (enabled)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enabled;
+
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned int caps = 0;
+
+ if (get_cap(LRU_GEN_CORE))
+ caps |= BIT(LRU_GEN_CORE);
+
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ caps |= BIT(LRU_GEN_MM_WALK);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1;
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enabled = caps & BIT(i);
+
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enabled);
+ else if (enabled)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enabled, store_enabled
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
+/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ const char *s = " ";
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ s = "RT ";
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ s = "rep";
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ }
+
+ for (i = 0; i < 3; i++)
+ seq_printf(m, " %10lu%c", n[i], s[i]);
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ const char *s = " ";
+ unsigned long n = 0;
+
+ if (seq == max_seq && NR_HIST_GENS == 1) {
+ s = "LOYNFA";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
+ s = "loynfa";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ }
+
+ seq_printf(m, " %10lu%c", n, s[i]);
+ }
+ seq_putc(m, '\n');
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_folios(lruvec, sc, swappiness, NULL))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+ int i;
+ int gen, type, zone;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();
+
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+
+ lruvec->mm_state.seq = MIN_NR_GENS;
+ init_waitqueue_head(&lruvec->mm_state.wait);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
+ spin_lock_init(&memcg->mm_list.lock);
+}
+
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+ int i;
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+ sizeof(lruvec->lrugen.nr_pages)));
+
+ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+ bitmap_free(lruvec->mm_state.filters[i]);
+ lruvec->mm_state.filters[i] = NULL;
+ }
+ }
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
+ return 0;
+};
+late_initcall(init_lru_gen);
+
+#else /* !CONFIG_LRU_GEN */
+
+static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
@@ -2958,6 +5847,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
struct blk_plug plug;
bool scan_adjusted;
+ if (lru_gen_enabled()) {
+ lru_gen_shrink_lruvec(lruvec, sc);
+ return;
+ }
+
get_scan_count(lruvec, sc, nr);
/* Record the original scan target for proportional adjustments later */
@@ -3197,109 +6091,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
again:
- /*
- * Flush the memory cgroup stats, so that we read accurate per-memcg
- * lruvec stats for heuristics.
- */
- mem_cgroup_flush_stats();
-
memset(&sc->nr, 0, sizeof(sc->nr));
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&target_lruvec->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&target_lruvec->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);
shrink_node_memcgs(pgdat, sc);
@@ -3557,11 +6358,14 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
struct lruvec *target_lruvec;
unsigned long refaults;
+ if (lru_gen_enabled())
+ return;
+
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
- target_lruvec->refaults[0] = refaults;
+ target_lruvec->refaults[WORKINGSET_ANON] = refaults;
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
- target_lruvec->refaults[1] = refaults;
+ target_lruvec->refaults[WORKINGSET_FILE] = refaults;
}
/*
@@ -3923,12 +6727,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
}
#endif
-static void age_active_anon(struct pglist_data *pgdat,
- struct scan_control *sc)
+static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
+ if (lru_gen_enabled()) {
+ lru_gen_age_node(pgdat, sc);
+ return;
+ }
+
if (!can_age_anon_pages(pgdat, sc))
return;
@@ -4248,12 +7056,11 @@ restart:
sc.may_swap = !nr_boost_reclaim;
/*
- * Do some background aging of the anon list, to give
- * pages a chance to be referenced before reclaiming. All
- * pages are rotated regardless of classzone as this is
- * about consistent aging.
+ * Do some background aging, to give pages a chance to be
+ * referenced before reclaiming. All pages are rotated
+ * regardless of classzone as this is about consistent aging.
*/
- age_active_anon(pgdat, &sc);
+ kswapd_age_node(pgdat, &sc);
/*
* If we're getting trouble reclaiming, start doing writepage
@@ -4643,16 +7450,17 @@ void kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- if (pgdat->kswapd)
- return;
-
- pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
- if (IS_ERR(pgdat->kswapd)) {
- /* failure at boot is fatal */
- BUG_ON(system_state < SYSTEM_RUNNING);
- pr_err("Failed to start kswapd on node %d\n", nid);
- pgdat->kswapd = NULL;
+ pgdat_kswapd_lock(pgdat);
+ if (!pgdat->kswapd) {
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+ BUG_ON(system_state < SYSTEM_RUNNING);
+ pr_err("Failed to start kswapd on node %d\n", nid);
+ pgdat->kswapd = NULL;
+ }
}
+ pgdat_kswapd_unlock(pgdat);
}
/*
@@ -4661,12 +7469,16 @@ void kswapd_run(int nid)
*/
void kswapd_stop(int nid)
{
- struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct task_struct *kswapd;
+ pgdat_kswapd_lock(pgdat);
+ kswapd = pgdat->kswapd;
if (kswapd) {
kthread_stop(kswapd);
- NODE_DATA(nid)->kswapd = NULL;
+ pgdat->kswapd = NULL;
}
+ pgdat_kswapd_unlock(pgdat);
}
static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 90af9a8572f5..b2371d745e00 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -28,7 +28,6 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
-#include <linux/migrate.h>
#include "internal.h"
@@ -355,8 +354,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
* CPU migrations and preemption potentially corrupts a counter so
* disable preemption.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@@ -368,8 +366,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
}
__this_cpu_write(*p, x);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_zone_page_state);
@@ -393,8 +390,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
x = delta + __this_cpu_read(*p);
@@ -406,8 +402,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
}
__this_cpu_write(*p, x);
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
EXPORT_SYMBOL(__mod_node_page_state);
@@ -441,8 +436,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -453,8 +447,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, -overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -466,8 +459,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -478,8 +470,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, -overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -501,8 +492,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
s8 v, t;
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -513,8 +503,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
__this_cpu_write(*p, overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -526,8 +515,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
/* See __mod_node_page_state */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_disable();
+ preempt_disable_nested();
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -538,8 +526,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
__this_cpu_write(*p, overstep);
}
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- preempt_enable();
+ preempt_enable_nested();
}
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -1247,11 +1234,13 @@ const char * const vmstat_text[] = {
"nr_shadow_call_stack",
#endif
"nr_page_table_pages",
+ "nr_sec_page_table_pages",
#ifdef CONFIG_SWAP
"nr_swapcached",
#endif
#ifdef CONFIG_NUMA_BALANCING
"pgpromote_success",
+ "pgpromote_candidate",
#endif
/* enum writeback_stat_item counters */
@@ -1389,10 +1378,6 @@ const char * const vmstat_text[] = {
"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
-#ifdef CONFIG_DEBUG_VM_VMACACHE
- "vmacache_find_calls",
- "vmacache_find_hits",
-#endif
#ifdef CONFIG_SWAP
"swap_ra",
"swap_ra_hit",
@@ -2067,7 +2052,6 @@ static int vmstat_cpu_online(unsigned int cpu)
if (!node_state(cpu_to_node(cpu), N_CPU)) {
node_set_state(cpu_to_node(cpu), N_CPU);
- set_migration_target_nodes();
}
return 0;
@@ -2092,7 +2076,6 @@ static int vmstat_cpu_dead(unsigned int cpu)
return 0;
node_clear_state(node, N_CPU);
- set_migration_target_nodes();
return 0;
}
@@ -2125,7 +2108,6 @@ void __init init_mm_internals(void)
start_shepherd_timer();
#endif
- migrate_on_reclaim_init();
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
diff --git a/mm/workingset.c b/mm/workingset.c
index a5e84862fc86..ae7e984b23c6 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_mostly;
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
bool workingset)
{
- eviction >>= bucket_order;
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
@@ -212,10 +211,107 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
- *evictionp = entry << bucket_order;
+ *evictionp = entry;
*workingsetp = workingset;
}
+#ifdef CONFIG_LRU_GEN
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+ int hist;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lru_gen_struct *lrugen;
+ int type = folio_is_file_lru(folio);
+ int delta = folio_nr_pages(folio);
+ int refs = folio_lru_refs(folio);
+ int tier = lru_tier_from_refs(refs);
+ struct mem_cgroup *memcg = folio_memcg(folio);
+ struct pglist_data *pgdat = folio_pgdat(folio);
+
+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->lrugen;
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);
+
+ hist = lru_hist_from_seq(min_seq);
+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
+
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs);
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+ int hist, tier, refs;
+ int memcg_id;
+ bool workingset;
+ unsigned long token;
+ unsigned long min_seq;
+ struct lruvec *lruvec;
+ struct lru_gen_struct *lrugen;
+ struct mem_cgroup *memcg;
+ struct pglist_data *pgdat;
+ int type = folio_is_file_lru(folio);
+ int delta = folio_nr_pages(folio);
+
+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
+
+ if (pgdat != folio_pgdat(folio))
+ return;
+
+ rcu_read_lock();
+
+ memcg = folio_memcg_rcu(folio);
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto unlock;
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ lrugen = &lruvec->lrugen;
+
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
+ if ((token >> LRU_REFS_WIDTH) != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
+ goto unlock;
+
+ hist = lru_hist_from_seq(min_seq);
+ /* see the comment in folio_lru_refs() */
+ refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset;
+ tier = lru_tier_from_refs(refs);
+
+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
+
+ /*
+ * Count the following two cases as stalls:
+ * 1. For pages accessed through page tables, hotter pages pushed out
+ * hot pages which refaulted immediately.
+ * 2. For pages accessed multiple times through file descriptors,
+ * numbers of accesses might have been out of the range.
+ */
+ if (lru_gen_in_fault() || refs == BIT(LRU_REFS_WIDTH)) {
+ folio_set_workingset(folio);
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
+ }
+unlock:
+ rcu_read_unlock();
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static void *lru_gen_eviction(struct folio *folio)
+{
+ return NULL;
+}
+
+static void lru_gen_refault(struct folio *folio, void *shadow)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
* @lruvec: the lruvec that was aged
@@ -264,10 +360,14 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ if (lru_gen_enabled())
+ return lru_gen_eviction(folio);
+
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
+ eviction >>= bucket_order;
workingset_age_nonresident(lruvec, folio_nr_pages(folio));
return pack_shadow(memcgid, pgdat, eviction,
folio_test_workingset(folio));
@@ -298,7 +398,13 @@ void workingset_refault(struct folio *folio, void *shadow)
int memcgid;
long nr;
+ if (lru_gen_enabled()) {
+ lru_gen_refault(folio, shadow);
+ return;
+ }
+
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
+ eviction <<= bucket_order;
rcu_read_lock();
/*
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 907c9b1e1e61..d03941cace2c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -472,12 +472,12 @@ static inline struct page *get_first_page(struct zspage *zspage)
return first_page;
}
-static inline int get_first_obj_offset(struct page *page)
+static inline unsigned int get_first_obj_offset(struct page *page)
{
return page->page_type;
}
-static inline void set_first_obj_offset(struct page *page, int offset)
+static inline void set_first_obj_offset(struct page *page, unsigned int offset)
{
page->page_type = offset;
}
@@ -1555,6 +1555,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
d_off += size;
d_size -= size;
+ /*
+ * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic()
+ * calls must occurs in reverse order of calls to kmap_atomic().
+ * So, to call kunmap_atomic(s_addr) we should first call
+ * kunmap_atomic(d_addr). For more details see
+ * Documentation/mm/highmem.rst.
+ */
if (s_off >= PAGE_SIZE) {
kunmap_atomic(d_addr);
kunmap_atomic(s_addr);
@@ -1585,7 +1592,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
static unsigned long find_alloced_obj(struct size_class *class,
struct page *page, int *obj_idx)
{
- int offset = 0;
+ unsigned int offset;
int index = *obj_idx;
unsigned long handle = 0;
void *addr = kmap_atomic(page);
@@ -1839,7 +1846,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
struct zspage *zspage;
struct page *dummy;
void *s_addr, *d_addr, *addr;
- int offset;
+ unsigned int offset;
unsigned long handle;
unsigned long old_obj, new_obj;
unsigned int obj_idx;
@@ -2103,8 +2110,6 @@ unsigned long zs_compact(struct zs_pool *pool)
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
- if (!class)
- continue;
if (class->index != i)
continue;
pages_freed += __zs_compact(pool, class);
@@ -2149,8 +2154,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
- if (!class)
- continue;
if (class->index != i)
continue;
diff --git a/mm/zswap.c b/mm/zswap.c
index 104835b379ec..2d48fd59cc7a 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1026,7 +1026,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
SetPageReclaim(page);
/* start writeback */
- __swap_writepage(page, &wbc, end_swap_bio_write);
+ __swap_writepage(page, &wbc);
put_page(page);
zswap_written_back_pages++;